In [1]:
import snscrape.modules.twitter as sntwitter
from urllib.request import urlopen
import pandas as pd
import matplotlib.pyplot as plt

# Gather the raw text of at most 1000 tweets posted by @WordleStats.
wordle_stats_scraper = sntwitter.TwitterSearchScraper('from:WordleStats')
tweets = []
for i, tweet in enumerate(wordle_stats_scraper.get_items()):
    if i < 1000:
        tweets.append(tweet.rawContent)
    else:
        # Cap reached: stop pulling further results from the scraper.
        break

Stopping after 20 empty pages


In [2]:
# Show the first scraped tweet so the layout that the parser relies on is visible.
example_tweet = tweets[0]
print('Example tweet format:\n')
print(example_tweet)

Example tweet format:

#Wordle 694 2023-05-14
17,120 results found on Twitter.
1,764 hard mode players.

1:  0%
2: 🟩 5%
3: 🟩🟩🟩🟩 19%
4: 🟩🟩🟩🟩🟩🟩🟩 29%
5: 🟩🟩🟩🟩🟩🟩 25%
6: 🟩🟩🟩🟩 17%
X: 🟩 4%

#Wordle694


In [3]:
# Set up the containers that the tweet-parsing step fills in:
#   error_indices - positions (within `tweets`) of tweets whose stats could not be extracted
#   df            - empty DataFrame with one column per extracted statistic
error_indices = []
df = pd.DataFrame(
    columns=['wordle_id', 'year', 'month', 'date', 'n_players', 'n_hard_mode']
            + ['percent_' + str(guess) for guess in range(1, 7)]
            + ['percent_fail']
)

def _parse_wordle_stats(tweet):
    """Turns the text of one @WordleStats tweet into a list of 13 integers.

    The values are ordered like the DataFrame columns: wordle_id, year, month, date, n_players,
    n_hard_mode, percent_1 ... percent_6, percent_fail.
    Raises ValueError or IndexError if the text does not follow the expected layout.
    """
    tweet_lines = tweet.split('\n')
    # First line looks like "#Wordle 694 2023-05-14": the ID is the second space-separated token,
    # and the date parts follow once the dashes are turned into spaces.
    wordle_id = int(tweet_lines[0].split(' ')[1])
    header_parts = tweet_lines[0].replace('-', ' ').split(' ')
    year = int(header_parts[2])
    month = int(header_parts[3])
    date = int(header_parts[4])
    # Second and third lines start with a count written with thousands separators ("17,120 ...").
    n_players = int(tweet_lines[1].replace(',', '').split(' ')[0])
    n_hard_mode = int(tweet_lines[2].replace(',', '').split(' ')[0])
    row_data = [wordle_id, year, month, date, n_players, n_hard_mode]
    # Lines 4 through 10 hold the distribution ("1:" ... "6:" and "X:"); the percentage is the last
    # token. Indexing (rather than slicing) keeps a too-short tweet an error instead of a short row.
    for i in range(4, 11):
        row_data.append(int(tweet_lines[i].replace('%', '').split(' ')[-1]))
    return row_data


def extract_stats(tweet, df):
    """Adds the stats from a tweet by @WordleStats to a pandas DataFrame.
    Input:  tweet = a string of the tweet text
            df = DataFrame to hold the tweet data. Expecting the following columns (in order):
                 wordle_id, year, month, date, n_players, n_hard_mode, percent_1, percent_2, percent_3, percent_4,
                 percent_5, percent_6, percent_fail
    Output: none (the data is added as a new row in df)

    If the tweet text cannot be parsed, no row is added; instead the tweet's position in the
    module-level list `tweets` is printed and appended to the module-level list `error_indices`.
    Errors unrelated to the tweet text (for example a problem writing to df) are not swallowed.
    """
    try:
        row_data = _parse_wordle_stats(tweet)
    except (ValueError, IndexError):
        # Only malformed tweet text counts as a bad data point. Look the index up once and reuse it.
        tweet_index = tweets.index(tweet)
        print('error extracting stats from tweet at index', tweet_index)
        error_indices.append(tweet_index)
        return
    # Written outside the try block so that DataFrame problems surface instead of being
    # reported as a badly formatted tweet.
    df.loc[len(df)] = row_data


    
# Feed every scraped tweet through the parser, then report how many rows were collected.
for tweet_text in tweets:
    extract_stats(tweet_text, df)
n_valid = len(df)
print('There are', n_valid, 'valid data points')

error extracting stats from tweet at index 101
error extracting stats from tweet at index 102
error extracting stats from tweet at index 461
error extracting stats from tweet at index 466
error extracting stats from tweet at index 469
error extracting stats from tweet at index 478
error extracting stats from tweet at index 480
error extracting stats from tweet at index 485
error extracting stats from tweet at index 491
error extracting stats from tweet at index 492
There are 483 valid data points


In [4]:
# Print the full text of every tweet that could not be parsed so each can be inspected by hand.
print('Invalid tweets:\n')
for bad_index in error_indices:
    heading = 'Tweet index ' + str(bad_index) + ':'
    print(heading)
    print(tweets[bad_index], '\n')

# Report every Wordle ID in the scanned range that has no parsed row.
# NOTE(review): range(203, 694) stops at 693, so ID #694 itself is never checked here.
unique_ids = df.wordle_id.unique()
print('\n\n')
missing_ids = [candidate for candidate in range(203, 694) if candidate not in unique_ids]
for missing_id in missing_ids:
    print("Missing Wordle ID #" + str(missing_id))

# fig, ax = plt.subplots(figsize=(10,5))
# plt.hist(df.wordle_id, bins=list(range(202, 700)))
# ax.set(xlim=(690, 696), ylim=(0,3))
# plt.show()
# plt.clf()

# Look for Wordle IDs that appear in more than one parsed row.
print('\nChecking for repeated Wordle IDs...')
tweets_per_id = df.wordle_id.value_counts()
# Boolean mask keeps only the IDs counted more than once (order of value_counts is preserved).
duplicated_counts = tweets_per_id[tweets_per_id > 1]
for duplicated_id in duplicated_counts.index:
    print('More than one tweet for Wordle ID #', duplicated_id)
repeated_ids = len(duplicated_counts) > 0
if not repeated_ids:
    print('No repeated IDs found.')

Invalid tweets:

Tweet index 101:
@QuickNovaCaleb Thousands of dollars a month, and I'm not interested in giving the owner of this site money anyway 

Tweet index 102:
As a result of this change this bot will shut down some time in the next week. 

Tweet index 461:
@kyfdx Whole group 

Tweet index 466:
@CristinaAmpil What would an aggregate distribution look like? I’m unfamiliar with stats/etc 

Tweet index 469:
@fudo @gooeyblob @WordleFRStats Yes, all players. I’ll see if adding the average makes sense, thanks! 

Tweet index 478:
220,950 results found on Twitter.
6,206 hard mode players.

1:  1%
2:  2%
3: 🟩🟩 11%
4: 🟩🟩🟩🟩🟩🟩 24%
5: 🟩🟩🟩🟩🟩🟩🟩 31%
6: 🟩🟩🟩🟩🟩🟩 26%
X: 🟩 6%

#Wordle213 

Tweet index 480:
@24Acoustics @PlanningActBlog You are correct! This should now be fixed. 

Tweet index 485:
@Gary_Boyd_NZ Just for you for today Gary:

3,073 hard mode players.

1:  1%
2:  4%
3: 🟩🟩🟩 16%
4: 🟩🟩🟩🟩🟩🟩 27%
5: 🟩🟩🟩🟩🟩🟩🟩 30%
6: 🟩🟩🟩🟩 19%
X: 🟩 4% 

Tweet index 491:
@WordleHaiku These are the full results fo

In [5]:
# Check the tweets that didn't import their data correctly (tweets at indices 478, 485, and 492):

# # Checking tweet at index 478:
# print('The tweet just before index 478 is Wordle ID #' + str(tweets[477][8:12]))
# print(tweets[477], '\n\n')
# print('The tweet just after index 478 is Wordle ID #' + str(tweets[479][8:12]))
# print(tweets[479])
# # So tweets[478] is Wordle ID #213
# print('So the tweet at index 478 is Wordle ID #213\n')
# print(tweets[478])

# # Checking tweet at index 485:
# print('\n\n')
# print('The tweet just before index 485:\n')
# print(tweets[484], '\n\n')
# print('The tweet just after index 485:\n')
# print(tweets[486])
# # It looks like tweets[485] is for Wordle ID #207, but just the stats for hard mode. See the following link:
# # https://twitter.com/WordleStats/status/1481687496241164291?cxt=HHwWhsC9ma67gZApAAAA

# # Checking tweet at index 492:
# print('\n\nThe tweet at index 492:\n')
# print(tweets[492])
# # tweets[492] is Wordle ID #202. It didn't import correctly because it is missing a blank line
# # between the hard mode players and the scores.

# Rows entered by hand for the two stats tweets that did not import correctly.
# Column order matches df: wordle_id, year, month, date, n_players, n_hard_mode,
# percent_1 ... percent_6, percent_fail.
hand_parsed_rows = [
    [213, 2022, 1, 18, 220950, 6206, 1, 2, 11, 24, 31, 26, 6],  # Wordle ID #213
    [202, 2022, 1, 7, 80630, 1362, 1, 2, 23, 39, 24, 9, 1],     # Wordle ID #202
]
for hand_parsed_row in hand_parsed_rows:
    df.loc[len(df)] = hand_parsed_row

In [6]:
# Check missing Wordle IDs by hand on twitter.com/WordleStats:
#   - Wordle ID #213 is actually in tweets[478] (already added)
#   - Wordle ID #591 is actually missing
#   - Wordle ID #608 is actually missing
# The remaining missing IDs are entered below, one row per Wordle, in df column order.
missing_id_rows = [
    [273, 2022, 3, 19, 156311, 8515, 0, 5, 21, 32, 26, 14, 3],  # Wordle ID #273
    [298, 2022, 4, 13, 123255, 7835, 1, 4, 29, 42, 18, 5, 1],   # Wordle ID #298
    [301, 2022, 4, 16, 107987, 7035, 0, 3, 19, 40, 28, 9, 1],   # Wordle ID #301
    [315, 2022, 4, 30, 77991, 5749, 0, 2, 10, 25, 35, 23, 4],   # Wordle ID #315
    [340, 2022, 5, 25, 62723, 4835, 0, 2, 9, 25, 33, 24, 6],    # Wordle ID #340
    [381, 2022, 7, 5, 44578, 3604, 1, 6, 25, 36, 23, 9, 1],     # Wordle ID #381
]
for missing_id_row in missing_id_rows:
    # Append at the next unused integer label, same as the parsed rows.
    df.loc[len(df)] = missing_id_row
print(len(df))

491


In [7]:
# Compare the number of rows with the size of the ID range they span.
min_id = df.wordle_id.min()
max_id = df.wordle_id.max()
expected_count = max_id - min_id + 1  # both end points are included
print('There should be', expected_count, 'Wordles (between ID #' + str(min_id), 'and #' + str(max_id) + ').')
print('We are missing Wordle ID #591 and #608.')
actual_count = len(df)
print('So there are', actual_count, 'total Wordles in this dataset.')

There should be 493 Wordles (between ID #202 and #694).
We are missing Wordle ID #591 and #608.
So there are 491 total Wordles in this dataset.


In [8]:
# Order the rows by Wordle ID (ascending), then preview the first ten.
df = df.sort_values('wordle_id')
# df.set_index('wordle_id', inplace=True, verify_integrity=True)
df.head(10)

Unnamed: 0,wordle_id,year,month,date,n_players,n_hard_mode,percent_1,percent_2,percent_3,percent_4,percent_5,percent_6,percent_fail
484,202,2022,1,7,80630,1362,1,2,23,39,24,9,1
482,203,2022,1,8,101503,1763,1,5,23,31,24,14,2
481,204,2022,1,9,91477,1913,1,3,13,27,30,22,4
480,205,2022,1,10,107134,2242,1,4,16,30,30,17,2
479,206,2022,1,11,153880,3017,1,9,35,34,16,5,1
478,207,2022,1,12,137586,3073,1,4,15,26,29,21,4
477,208,2022,1,13,132726,3345,1,2,13,29,31,20,3
476,209,2022,1,14,169484,3985,1,4,21,30,24,15,5
475,210,2022,1,15,205880,4655,1,9,35,34,16,5,1
474,211,2022,1,16,209609,4955,1,9,32,32,18,7,1


In [9]:
# Preview the last ten rows of df.
df.tail(10)

Unnamed: 0,wordle_id,year,month,date,n_players,n_hard_mode,percent_1,percent_2,percent_3,percent_4,percent_5,percent_6,percent_fail
9,685,2023,5,5,16911,1703,0,4,23,40,24,8,1
8,686,2023,5,6,16996,1690,0,7,28,34,20,8,1
7,687,2023,5,7,18039,1796,1,7,36,39,13,3,0
6,688,2023,5,8,16684,1705,0,2,18,35,28,14,3
5,689,2023,5,9,16256,1666,0,1,10,33,35,17,3
4,690,2023,5,10,17154,1713,1,7,29,37,20,6,1
3,691,2023,5,11,17777,1794,0,3,16,32,31,15,2
2,692,2023,5,12,18486,1802,0,7,28,34,21,9,1
1,693,2023,5,13,17209,1719,0,9,34,36,16,5,0
0,694,2023,5,14,17120,1764,0,5,19,29,25,17,4


In [10]:
# Download the list of past Wordle answers and attach the answer for each Wordle ID to df.
url = 'https://wordfinder.yourdictionary.com/wordle/answers/'
# The context manager closes the HTTP response as soon as the page has been read.
with urlopen(url) as page:
    html_bytes = page.read()
html = html_bytes.decode("utf-8").split('\n')
# print(len(html))
# print(html[148])
# print(html[2245])
# for i in range(148,155):
#     print(i, html[i], '\n\n')
# for i in range(2240, 2246):
#     print(i, html[i], '\n\n')

# A Wordle ID is read from every 4th line of the page starting at line 148, and the answer from
# the line right after it.
# NOTE(review): these line offsets are hard-coded to one particular version of the page —
# confirm they still match before trusting a fresh download.
id_lines = list(range(148, 2245, 4))
answers = {}
for line in id_lines:
    try:
        wordle_id = int(html[line].strip(' '))
        # The five-letter answer starts right after the '"">' marker on the following line.
        answer_index = html[line+1].index('"">') + 3
        answer = html[line+1][answer_index:answer_index+5].lower()
    except (ValueError, IndexError):
        # Skip positions that do not hold an ID/answer pair: non-numeric text, a missing
        # marker, or a page with fewer lines than expected. Other errors are not hidden.
        continue
    answers[wordle_id] = answer
# Raises KeyError if any Wordle ID in df has no scraped answer.
df['answer'] = df.wordle_id.apply(lambda x: answers[x])
# Check that the df imported the answers correctly before and after the missing Wordle IDs:
# display(df[(df.wordle_id > 585) & (df.wordle_id < 615)])
df.head()

Unnamed: 0,wordle_id,year,month,date,n_players,n_hard_mode,percent_1,percent_2,percent_3,percent_4,percent_5,percent_6,percent_fail,answer
484,202,2022,1,7,80630,1362,1,2,23,39,24,9,1,slump
482,203,2022,1,8,101503,1763,1,5,23,31,24,14,2,crank
481,204,2022,1,9,91477,1913,1,3,13,27,30,22,4,gorge
480,205,2022,1,10,107134,2242,1,4,16,30,30,17,2,query
479,206,2022,1,11,153880,3017,1,9,35,34,16,5,1,drink


In [11]:
# Write the finished table to disk; index=False leaves the DataFrame index out of the file.
print('Exporting data to .csv file...')
df.to_csv(path_or_buf='wordle_data.csv', index=False)
print('Complete')

Exporting data to .csv file...
Complete
