In [50]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import requests
import bs4

In [51]:
# Raw chat export that is read, and the per-day filtered file that is written.
input_file_path = 'wordle_texts.csv'
output_file_path = 'filtered_wordle_scores_per_day.csv'

# Maps a calendar date to a (timestamp, csv row) pair.
last_message_of_day = dict()


def is_valid_datetime(date_str):
    """Tell whether ``date_str`` parses as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        date_str: Text to check.

    Returns:
        True when ``datetime.strptime`` accepts the text with the format
        ``'%Y-%m-%d %H:%M:%S'``, False when it raises ``ValueError``.
    """
    try:
        datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        return False
    else:
        return True


# Scan the raw export and remember, for each calendar day, the latest message
# whose text contains "Tom " or "tom ".
DATE_COL = 1   # 'Message Date' is the second column of the export
TEXT_COL = 12  # NOTE(review): presumably the 'Text' column; confirm against the header

with open(input_file_path, 'r', newline='', encoding='latin-1') as inp:
    reader = csv.reader(inp)

    # The first row holds the column names; it is reused when writing the output.
    header = next(reader)

    for row in reader:
        # csv.reader yields [] for blank lines and a short list for truncated
        # rows. Skip those instead of failing with IndexError on the text column.
        if len(row) <= TEXT_COL:
            continue

        if "Tom " in row[TEXT_COL] or "tom " in row[TEXT_COL]:
            datetime_str = row[DATE_COL]
            if is_valid_datetime(datetime_str):
                timestamp = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
                date_only = timestamp.date()
                # Keep only the most recent qualifying message of each day.
                if date_only not in last_message_of_day or last_message_of_day[date_only][0] < timestamp:
                    last_message_of_day[date_only] = (timestamp, row)


# Write the header followed by one row per day, in ascending date order.
with open(output_file_path, 'w', newline='', encoding='latin-1') as outp:
    writer = csv.writer(outp)
    writer.writerow(header)
    writer.writerows(
        last_message_of_day[day][1] for day in sorted(last_message_of_day)
    )


In [52]:
# Load the per-day filtered messages (same file name as output_file_path above).
df = pd.read_csv('filtered_wordle_scores_per_day.csv')


In [53]:
# Preview up to the first 100 rows of the filtered messages.
print(df.head(100))

         Chat Session         Message Date Delivered Date  \
0   Ellen Davis & Dad  2023-07-19 22:36:36            NaN   
1   Ellen Davis & Dad  2023-07-20 22:07:29            NaN   
2   Ellen Davis & Dad  2023-07-21 12:42:07            NaN   
3   Ellen Davis & Dad  2023-07-22 17:46:58            NaN   
4   Ellen Davis & Dad  2023-07-23 09:06:14            NaN   
..                ...                  ...            ...   
95  Ellen Davis & Dad  2024-06-01 17:29:04            NaN   
96  Ellen Davis & Dad  2024-06-02 13:14:35            NaN   
97  Ellen Davis & Dad  2024-06-03 12:09:54            NaN   
98  Ellen Davis & Dad  2024-06-04 14:14:20            NaN   
99  Ellen Davis & Dad  2024-06-05 10:58:10            NaN   

              Read Date  Edited Date Service      Type     Sender ID  \
0   2023-07-20 15:04:51          NaN     SMS  Incoming  1.919602e+10   
1   2023-07-20 22:32:10          NaN     SMS  Incoming  1.919602e+10   
2   2023-07-21 21:40:52          NaN     SMS  Incom

In [54]:
# Keep only the message timestamp and the message body; every other export
# column is dropped from df here.
columns_to_keep = ['Message Date', 'Text']
df = df[columns_to_keep]

In [55]:
# Check the two remaining columns on the first five rows.
print(df.head(5))

          Message Date                                               Text
0  2023-07-19 22:36:36  Beth 5\nEllen 5\nLucy 4\nTom 4\nJen 3\nPete 3\...
1  2023-07-20 22:07:29  Lucy 5\nMarcos 5\nPete 5\nTom 5\nJen 4\nGrant ...
2  2023-07-21 12:42:07           Beth 5\nGrant 5\nMarcos 5\nPete 4\nTom 4
3  2023-07-22 17:46:58                    Pete 6\nBeth 5\nTom 5\nMarcos 3
4  2023-07-23 09:06:14                                      Pete 4\nTom 3


In [56]:
# Parse the timestamps once; every calendar field below is derived from this.
message_ts = pd.to_datetime(df['Message Date'])

# Assemble the frame directly in its final column order. 'Message Date' is
# reduced to the calendar date and the clock time moves to its own column.
df = pd.DataFrame({
    'Message Date': message_ts.dt.date,
    'Year': message_ts.dt.year,
    'Month': message_ts.dt.month,
    'Day': message_ts.dt.day,
    'Time': message_ts.dt.time,
    'Text': df['Text'],
})

# Quick look at the result
print(df.head())

# Keep a copy of the reshaped frame on disk
df.to_csv('wordle_texts_modified.csv', index=False)



  Message Date  Year  Month  Day      Time  \
0   2023-07-19  2023      7   19  22:36:36   
1   2023-07-20  2023      7   20  22:07:29   
2   2023-07-21  2023      7   21  12:42:07   
3   2023-07-22  2023      7   22  17:46:58   
4   2023-07-23  2023      7   23  09:06:14   

                                                Text  
0  Beth 5\nEllen 5\nLucy 4\nTom 4\nJen 3\nPete 3\...  
1  Lucy 5\nMarcos 5\nPete 5\nTom 5\nJen 4\nGrant ...  
2           Beth 5\nGrant 5\nMarcos 5\nPete 4\nTom 4  
3                    Pete 6\nBeth 5\nTom 5\nMarcos 3  
4                                      Pete 4\nTom 3  


In [57]:

# Show the dtype of each column after the date split.
print(df.dtypes)


Message Date    object
Year             int32
Month            int32
Day              int32
Time            object
Text            object
dtype: object


MUST ADDRESS: Sometimes Dad sends updated scores after 11:59 PM, so they are timestamped on the following calendar day!!!


In [58]:
def _to_score_lines(text):
    """Return one message's text as a list of lines.

    Args:
        text: The value from the 'Text' column: a string, a list produced by
            an earlier run of this cell, or a missing value.

    Returns:
        The string split on '\n'; the list unchanged if it is already a list;
        an empty list for anything else (for example NaN).
    """
    # Passing lists through keeps the cell safe to re-run. Without this, a
    # second run would turn every already-split message into [].
    if isinstance(text, list):
        return text
    return text.split('\n') if isinstance(text, str) else []


# Split the text by new lines
df['Text'] = df['Text'].apply(_to_score_lines)

# Use explode to create one row per line of each message
expanded_df = df.explode('Text').reset_index(drop=True)

print("Expanded DataFrame:")
print(expanded_df)

Expanded DataFrame:
     Message Date  Year  Month  Day      Time      Text
0      2023-07-19  2023      7   19  22:36:36    Beth 5
1      2023-07-19  2023      7   19  22:36:36   Ellen 5
2      2023-07-19  2023      7   19  22:36:36    Lucy 4
3      2023-07-19  2023      7   19  22:36:36     Tom 4
4      2023-07-19  2023      7   19  22:36:36     Jen 3
...           ...   ...    ...  ...       ...       ...
1077   2024-07-18  2024      7   18  14:34:00    Beth 5
1078   2024-07-18  2024      7   18  14:34:00  Marcos 5
1079   2024-07-18  2024      7   18  14:34:00     Tom 5
1080   2024-07-18  2024      7   18  14:34:00  Smitty 3
1081   2024-07-18  2024      7   18  14:34:00    Slim 4

[1082 rows x 6 columns]


In [59]:
# Number of exploded rows (one per line of message text) for each message date.
# NOTE: despite the variable name and the printed label, this is a row count
# per date, not a count of unique values.
unique_counts = expanded_df.groupby('Message Date').size()

print("Unique counts by 'Message Date':")
print(unique_counts)

Unique counts by 'Message Date':
Message Date
2023-07-19    8
2023-07-20    7
2023-07-21    5
2023-07-22    4
2023-07-23    2
             ..
2024-07-14    6
2024-07-15    7
2024-07-16    8
2024-07-17    6
2024-07-18    5
Length: 132, dtype: int64


In [60]:
# Each line looks like "<name> <score>": the leading run of letters is taken
# as the player, the trailing number (optionally with decimals) as the score.
# Lines that do not match a pattern get NaN in that column.
line_text = expanded_df['Text'].str
expanded_df['Player'] = line_text.extract(r'^([A-Za-z]+)', expand=False)
expanded_df['Score'] = line_text.extract(r'(\d+\.?\d*)$', expand=False)
print(expanded_df)

     Message Date  Year  Month  Day      Time      Text  Player Score
0      2023-07-19  2023      7   19  22:36:36    Beth 5    Beth     5
1      2023-07-19  2023      7   19  22:36:36   Ellen 5   Ellen     5
2      2023-07-19  2023      7   19  22:36:36    Lucy 4    Lucy     4
3      2023-07-19  2023      7   19  22:36:36     Tom 4     Tom     4
4      2023-07-19  2023      7   19  22:36:36     Jen 3     Jen     3
...           ...   ...    ...  ...       ...       ...     ...   ...
1077   2024-07-18  2024      7   18  14:34:00    Beth 5    Beth     5
1078   2024-07-18  2024      7   18  14:34:00  Marcos 5  Marcos     5
1079   2024-07-18  2024      7   18  14:34:00     Tom 5     Tom     5
1080   2024-07-18  2024      7   18  14:34:00  Smitty 3  Smitty     3
1081   2024-07-18  2024      7   18  14:34:00    Slim 4    Slim     4

[1082 rows x 8 columns]


# Deal with FAIL cases!!!

Dr. Labarr suggested making a separate column with a 0 for non-fail and a 1 for fail.


In [61]:
# A line with no numeric score is treated as a failed game: Fail is 1 where
# 'Score' is missing and 0 otherwise.
score_missing = expanded_df['Score'].isna()
expanded_df['Fail'] = score_missing.astype(int)

print(expanded_df)

     Message Date  Year  Month  Day      Time      Text  Player Score  Fail
0      2023-07-19  2023      7   19  22:36:36    Beth 5    Beth     5     0
1      2023-07-19  2023      7   19  22:36:36   Ellen 5   Ellen     5     0
2      2023-07-19  2023      7   19  22:36:36    Lucy 4    Lucy     4     0
3      2023-07-19  2023      7   19  22:36:36     Tom 4     Tom     4     0
4      2023-07-19  2023      7   19  22:36:36     Jen 3     Jen     3     0
...           ...   ...    ...  ...       ...       ...     ...   ...   ...
1077   2024-07-18  2024      7   18  14:34:00    Beth 5    Beth     5     0
1078   2024-07-18  2024      7   18  14:34:00  Marcos 5  Marcos     5     0
1079   2024-07-18  2024      7   18  14:34:00     Tom 5     Tom     5     0
1080   2024-07-18  2024      7   18  14:34:00  Smitty 3  Smitty     3     0
1081   2024-07-18  2024      7   18  14:34:00    Slim 4    Slim     4     0

[1082 rows x 9 columns]


Let's add columns for the start words used by different players.

In [62]:
# Replace NA values in the 'Player' column with "National Average"
expanded_df['Player'] = expanded_df['Player'].fillna('National Average')

# Normalise misspellings and short forms to one canonical name per player
expanded_df['Player'] = expanded_df['Player'].replace({
    'Idabelle': 'Isabelle',
    'Isabele': 'Isabelle',
    'Jen': 'Jennifer',
    'Metedith': 'Meredith',
    'NA': 'National Average'
})


# Opening words per player. Keys are matched against 'Player' ignoring case
# (see _words_for_players), so the mixed capitalisation here is harmless.
start_words = {
    'Beth': 'great',
    'smitty': 'audio',
    'grant': 'grant' ,
    'pete': 'route',
    'kate': 'ascot',
    'Lucy': 'weird',
    'Meredith': 'least',
    'slim': 'games'
}

secondary_words = {
    'Beth': 'pious',
    'Lucy': 'nasty'}
tertiary_words = {
    'Lucy': 'cough'}


def _words_for_players(words_by_player):
    """Map each row's player to a word, ignoring letter case.

    The 'Player' values come from the message text and are capitalised there
    (e.g. 'Smitty', 'Slim'), while several dictionary keys are lower-case.
    An exact-match lookup would leave those players without a word.

    Args:
        words_by_player: dict of player name -> word.

    Returns:
        A Series aligned with expanded_df; NaN where the player has no entry.
    """
    lookup = {name.lower(): word for name, word in words_by_player.items()}
    return expanded_df['Player'].str.lower().map(lookup)


expanded_df['Start_Words'] = _words_for_players(start_words)
expanded_df['secondary_words'] = _words_for_players(secondary_words)
expanded_df['tertiary_words'] = _words_for_players(tertiary_words)


In [63]:
# Inspect the frame after the player clean-up and the start-word columns were added.
print(expanded_df)

     Message Date  Year  Month  Day      Time      Text    Player Score  Fail  \
0      2023-07-19  2023      7   19  22:36:36    Beth 5      Beth     5     0   
1      2023-07-19  2023      7   19  22:36:36   Ellen 5     Ellen     5     0   
2      2023-07-19  2023      7   19  22:36:36    Lucy 4      Lucy     4     0   
3      2023-07-19  2023      7   19  22:36:36     Tom 4       Tom     4     0   
4      2023-07-19  2023      7   19  22:36:36     Jen 3  Jennifer     3     0   
...           ...   ...    ...  ...       ...       ...       ...   ...   ...   
1077   2024-07-18  2024      7   18  14:34:00    Beth 5      Beth     5     0   
1078   2024-07-18  2024      7   18  14:34:00  Marcos 5    Marcos     5     0   
1079   2024-07-18  2024      7   18  14:34:00     Tom 5       Tom     5     0   
1080   2024-07-18  2024      7   18  14:34:00  Smitty 3    Smitty     3     0   
1081   2024-07-18  2024      7   18  14:34:00    Slim 4      Slim     4     0   

     Start_Words secondary_

Let's web-scrape the past Wordle answers for each day.

In [64]:

URL = 'https://wordfinder.yourdictionary.com/wordle/answers/'

# How many tables to read from the page.
# would need to change this as new months are added to the website
TABLE_COUNT = 9

# Fail fast on a hung connection or an HTTP error instead of parsing an error page.
req = requests.get(URL, timeout=30)
req.raise_for_status()
soup = bs4.BeautifulSoup(req.text, 'html.parser')

HEADERS = ['Date', 'Wordle', 'Answer']


all_rows = []

# Look the tables up once rather than re-scanning the document on every iteration.
tables = soup.find_all('table')

for table_index in range(TABLE_COUNT):
    if table_index >= len(tables):
        print(f"Table {table_index} does not exist on the page.")
        continue

    table = tables[table_index]
    rows = table.find_all('tr')

    # rows[0] is skipped (presumably the header row; confirm against the page).
    # Original author's note: we start on first complete day (i.e. if the
    # current day's word is available, we don't grab it until tomorrow).
    for row in rows[1:]:
        cells = row.find_all('td')
        # Rows without the three expected cells are skipped individually, so one
        # odd row neither drops the rest of its table nor is reported as a
        # missing table.
        if len(cells) < 3:
            continue
        Date = cells[0].text.strip()  # TODO(author): why is year not grabbed?
        Wordle = cells[1].text.strip()
        Answer = cells[2].text.strip()
        all_rows.append([Date, Wordle, Answer])


with open('wordle_answers.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(HEADERS)
    writer.writerows(all_rows)

print(f"Data from tables 0 through {TABLE_COUNT - 1} has been written to 'wordle_answers.csv'.")



Data from tables 0 through 8 has been written to 'wordle_answers.csv'.


In [65]:
# Reload the scraped answers, discard the first data row (index label 0) and
# renumber the remaining rows from 0.
answers = (
    pd.read_csv('wordle_answers.csv')
    .drop(index=0)
    .reset_index(drop=True)
)

In [66]:
# TODO(review): open question from the original author, kept verbatim:
#   "why is the 30th and 31st of each month?????"
month_map = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12'
}

# Ordinal of the day puzzle #0 ran (2021-06-19). Adding a puzzle number gives
# that puzzle's calendar day. This agrees with the scraped data
# (#1126 -> 2024-07-19, #865 -> 2023-11-01) and with the previously hard-coded
# boundary (#926 -> 2024-01-01).
WORDLE_EPOCH_ORDINAL = datetime(2021, 6, 19).toordinal()


# Function to transform the date format
def transform_date(date_str, wordle_number):
    """Convert a scraped "Mon. DD" date into an ISO "YYYY-MM-DD" string.

    The page does not provide the year, so it is derived from the puzzle
    number. This works for any year, not only 2023/2024.

    Args:
        date_str: Month and day as scraped, e.g. "Jul. 19". The period is
            optional and longer month spellings ("July", "Sept.") are accepted.
        wordle_number: Puzzle number for that day (anything ``int()`` accepts).

    Returns:
        The date as "YYYY-MM-DD" with a zero-padded day.

    Raises:
        ValueError: if ``date_str`` is not exactly a month token and a day token.
        KeyError: if the month token is not a recognised month name.
    """
    # Remove the period after the month and split by space
    parts = date_str.replace('.', '').split()
    # Ensure parts has exactly two elements
    if len(parts) != 2:
        raise ValueError(f"Date string format is incorrect: {date_str}")
    # Map the month name to its number via its first three letters
    month_number = month_map[parts[0][:3].title()]
    # int() also accepts numpy integers and digit strings
    year = datetime.fromordinal(WORDLE_EPOCH_ORDINAL + int(wordle_number)).year
    return f"{year}-{month_number}-{parts[1].zfill(2)}"

# Rewrite every scraped date in ISO form, pairing each date with the puzzle
# number from the same row.
answers['Date'] = [
    transform_date(raw_date, puzzle_number)
    for raw_date, puzzle_number in zip(answers['Date'], answers['Wordle'])
]

# Show the transformed frame
print(answers)

           Date  Wordle Answer
0    2024-07-19    1126  REFER
1    2024-07-18    1125  NERDY
2    2024-07-17    1124  QUITE
3    2024-07-16    1123  DECOY
4    2024-07-15    1122  SWOON
..          ...     ...    ...
257  2023-11-05     869  FLARE
258  2023-11-04     868  MANIA
259  2023-11-03     867  ARDOR
260  2023-11-02     866  UNTIL
261  2023-11-01     865  NOISE

[262 rows x 3 columns]


In [67]:
# ISO date text -> answer, in the order the rows appear in `answers`.
answers_by_date_text = dict(zip(answers['Date'], answers['Answer']))

print(answers_by_date_text)

# Key the lookup by datetime.date, the same type the 'Message Date' column
# holds (it was built with .dt.date). A datetime.datetime key never compares
# equal to a datetime.date in plain Python, so datetime keys would leave the
# later lookup dependent on implicit type coercion.
answers_dictionary = {
    datetime.strptime(date_text, '%Y-%m-%d').date(): answer
    for date_text, answer in answers_by_date_text.items()
}


{'2024-07-19': 'REFER', '2024-07-18': 'NERDY', '2024-07-17': 'QUITE', '2024-07-16': 'DECOY', '2024-07-15': 'SWOON', '2024-07-14': 'VIDEO', '2024-07-13': 'ENACT', '2024-07-12': 'JIFFY', '2024-07-11': 'CAMEO', '2024-07-10': 'GAUNT', '2024-07-09': 'BLARE', '2024-07-08': 'SHAPE', '2024-07-07': 'CANON', '2024-07-06': 'SCOFF', '2024-07-05': 'CRUSH', '2024-07-04': 'DEBUT', '2024-07-03': 'THIGH', '2024-07-02': 'INLAY', '2024-07-01': 'ADAGE', '2024-06-30': 'BUDDY', '2024-06-29': 'ZEBRA', '2024-06-28': 'DROVE', '2024-06-27': 'ORDER', '2024-06-26': 'KNEAD', '2024-06-25': 'SAVOR', '2024-06-24': 'DOLLY', '2024-06-23': 'BUGLE', '2024-06-22': 'EDICT', '2024-06-21': 'PAINT', '2024-06-20': 'SCENT', '2024-06-19': 'TERSE', '2024-06-18': 'COVER', '2024-06-17': 'PRIOR', '2024-06-16': 'GRIND', '2024-06-15': 'PROUD', '2024-06-14': 'VAULT', '2024-06-13': 'ANGST', '2024-06-12': 'DETER', '2024-06-11': 'SWUNG', '2024-06-10': 'MANGA', '2024-06-09': 'CROWD', '2024-06-08': 'HENCE', '2024-06-07': 'MELON', '2024-06-0

In [68]:
# Attach the day's Wordle answer to every score row by looking the message
# date up in answers_dictionary; dates with no entry get NaN.
# NOTE(review): the dictionary keys must be comparable to the values stored
# in 'Message Date' for any row to match — verify the column is not all NaN.
expanded_df['wordle_answers'] = expanded_df['Message Date'].map(answers_dictionary)
print(expanded_df)

     Message Date  Year  Month  Day      Time      Text    Player Score  Fail  \
0      2023-07-19  2023      7   19  22:36:36    Beth 5      Beth     5     0   
1      2023-07-19  2023      7   19  22:36:36   Ellen 5     Ellen     5     0   
2      2023-07-19  2023      7   19  22:36:36    Lucy 4      Lucy     4     0   
3      2023-07-19  2023      7   19  22:36:36     Tom 4       Tom     4     0   
4      2023-07-19  2023      7   19  22:36:36     Jen 3  Jennifer     3     0   
...           ...   ...    ...  ...       ...       ...       ...   ...   ...   
1077   2024-07-18  2024      7   18  14:34:00    Beth 5      Beth     5     0   
1078   2024-07-18  2024      7   18  14:34:00  Marcos 5    Marcos     5     0   
1079   2024-07-18  2024      7   18  14:34:00     Tom 5       Tom     5     0   
1080   2024-07-18  2024      7   18  14:34:00  Smitty 3    Smitty     3     0   
1081   2024-07-18  2024      7   18  14:34:00    Slim 4      Slim     4     0   

     Start_Words secondary_

In [69]:
# Save the final per-player, per-day table (without the index column).
expanded_df.to_csv('expanded_df.csv', index=False)