### Package Installs and Imports

In [2]:
# %conda install -c conda-forge tweepy

In [49]:
import tweepy
import json
import re
import pandas as pd

### Twitter Authentication

In [4]:
# Load the Twitter API credentials from a local JSON file so that no secrets
# are hard-coded in the notebook.
with open('credentials.json', mode='r') as credentials_file:
    creds = json.loads(credentials_file.read())

In [5]:
# Unpack the individual credentials loaded from credentials.json
api_key = creds['api_key']
api_secret = creds['api_key_secret']
bearer_token = creds['bearer_token']
access_token = creds['access_token']
access_token_secret = creds['access_token_secret']

# Twitter API v.2 client authorization
# Keyword arguments make it explicit which credential goes where.
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=api_key,
    consumer_secret=api_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)

# Sanity-check the credentials by asking the API who we are.
try:
    me = client.get_me()
    print(f"Authorized as {me.data}")
except Exception as error:
    # Catch Exception rather than using a bare except so that KeyboardInterrupt /
    # SystemExit still propagate, and report the reason instead of hiding it.
    print(f"Authorization failed: {error!r}")


# Twitter API v.1 client authorization

# auth = tweepy.OAuth1UserHandler(api_key, api_secret, access_token, access_token_secret)
# api = tweepy.API(auth)

Authorized as william04dan


### Tweet Scraping


In [6]:
# Resolve the account's username to the numeric user ID used by the timeline request.
target_username = 'WordleStats'
target = client.get_users(usernames=[target_username])
if not target.data:
    # Fail with a clear message instead of an opaque error when indexing the result.
    raise ValueError(f"Twitter user @{target_username} was not found")
target_id = target.data[0].id

`Client.get_users_tweets()` returns at most 100 tweets per request, so to grab all of the account's tweets we need to request them repeatedly, one batch at a time.

In [7]:
# Fetch the first batch of tweets. `.data` is empty/None when the account has
# no tweets, so fall back to an empty list instead of crashing on `[-1]` below.
target_tweets = list(client.get_users_tweets(id=target_id, max_results=100).data or [])

while target_tweets:
    # Despite its name, latest_id holds the ID of the last tweet collected so far
    # (the name is kept for compatibility). Passing it as until_id asks only for
    # tweets with a smaller ID, i.e. the next older batch.
    latest_id = target_tweets[-1].id
    next_tweets = client.get_users_tweets(id=target_id, max_results=100, until_id=latest_id).data
    if not next_tweets:
        # No older tweets are available: we have reached the end of the timeline.
        break
    target_tweets.extend(next_tweets)

print(f"Grabbed {len(target_tweets)} tweets from @{target_username}\n")

Grabbed 374 tweets from @WordleStats



### Tweet Processing

In [23]:
# Take the first collected tweet as a worked example and display it.
example_tweet = target_tweets[0]
example_tweet

<Tweet id=1612494626723876864 text='#Wordle 568 2023-01-08\n21,510 results found on Twitter.\n2,054 hard mode players.\n\n1:  2%\n2: 🟩🟩 10%\n3: 🟩🟩🟩🟩🟩🟩🟩 29%\n4: 🟩🟩🟩🟩🟩🟩🟩🟩 33%\n5: 🟩🟩🟩🟩🟩 20%\n6: 🟩 6%\nX:  1%\n\n#Wordle568'>

In [68]:
# Regular expressions for the fields of a WordleStats results tweet.

# Wordle number, e.g. 'Wordle 568'.
# Wordle's autogenerated share message begins with 'Wordle N', where N is that day's puzzle number.
wordle_pattern = re.compile(r'[wW]ordle\s?\d+')

# Number of results found, e.g. '21,510 results'
results_pattern = re.compile(r'[\d,]+\sresults')

# Number of hard mode players, e.g. '2,054 hard'
hardmode_pattern = re.compile(r'[\d,]+\shard')

# One line of the score distribution in WordleStats' tweets, e.g. '3: 🟩🟩🟩 29%'
stat_pattern = re.compile(r'\b[1-6xX]:.+\d%')

# Example processing of example_tweet
example_data = {}
wordle_match = wordle_pattern.search(example_tweet.text)
results_match = results_pattern.search(example_tweet.text)
hardmode_match = hardmode_pattern.search(example_tweet.text)
stats_matches = stat_pattern.finditer(example_tweet.text)

# The wordle number is kept as the matched digit string; the counts have their
# thousands separators stripped before being converted to int.
example_data['wordle_number'] = re.search(r'\d+', wordle_match[0])[0]
example_data['total_players'] = int(re.search(r'[\d,]+', results_match[0])[0].replace(',', ''))
example_data['hardmode_players'] = int(re.search(r'[\d,]+', hardmode_match[0])[0].replace(',', ''))
for match in stats_matches:
    # match[0][0] is the guess count ('1'-'6' or 'X'); skip it so the search
    # below finds the percentage rather than the guess count itself.
    # The optional decimal part keeps values such as '13.08%' from being
    # truncated to 13 (and '0.63%' from becoming 0).
    percent_text = re.search(r'\d+(?:\.\d+)?', match[0][1:])[0]
    example_data[f'score_{match[0][0]}'] = float(percent_text) / 100

example_data

{'wordle_number': '568',
 'total_players': 21510,
 'hardmode_players': 2054,
 'score_1': 0.02,
 'score_2': 0.1,
 'score_3': 0.29,
 'score_4': 0.33,
 'score_5': 0.2,
 'score_6': 0.06,
 'score_X': 0.01}

In [73]:
# Parse every collected tweet into a dict of fields. A tweet missing any of the
# expected fields is reported, recorded in invalid_tweets, and left out of wordle_data.
wordle_data = []
invalid_tweets = set()

for tweet in target_tweets:
    tweet_data = {}

    # Wordle number (kept as the matched digit string)
    wordle_match = wordle_pattern.search(tweet.text)
    if wordle_match:
        tweet_data['wordle_number'] = re.search(r'\d+', wordle_match[0])[0]
    else:
        print(f'Did not find matching wordle_pattern in tweet {tweet.id}')
        invalid_tweets.add(tweet.id)

    # Total number of results found on Twitter
    results_match = results_pattern.search(tweet.text)
    if results_match:
        tweet_data['total_players'] = int(re.search(r'[\d,]+', results_match[0])[0].replace(',', ''))
    else:
        print(f'Did not find matching results_pattern in tweet {tweet.id}')
        invalid_tweets.add(tweet.id)

    # Number of hard mode players
    hardmode_match = hardmode_pattern.search(tweet.text)
    if hardmode_match:
        tweet_data['hardmode_players'] = int(re.search(r'[\d,]+', hardmode_match[0])[0].replace(',', ''))
    else:
        # Name the pattern that actually failed (this message previously said results_pattern).
        print(f'Did not find matching hardmode_pattern in tweet {tweet.id}')
        invalid_tweets.add(tweet.id)

    # Score distribution: one line per guess count 1-6 plus X (seven in total)
    scores = []
    for match in stat_pattern.finditer(tweet.text):
        guess_label = match[0][0]
        scores.append(guess_label)
        # Skip the guess label itself when looking for the percentage, and accept
        # an optional decimal part so '13.08%' is not truncated to 13.
        percent_text = re.search(r'\d+(?:\.\d+)?', match[0][1:])[0]
        tweet_data[f'score_{guess_label}'] = float(percent_text) / 100
    if len(scores) != 7:
        print(f'Only found data for {scores} in tweet {tweet.id}')
        invalid_tweets.add(tweet.id)

    if tweet.id not in invalid_tweets:
        wordle_data.append(tweet_data)

Did not find matching wordle_pattern in tweet 1488956933621129223
Did not find matching results_pattern in tweet 1488956933621129223
Did not find matching results_pattern in tweet 1488956933621129223
Only found data for [] in tweet 1488956933621129223
Did not find matching wordle_pattern in tweet 1487826071357739008
Did not find matching results_pattern in tweet 1487826071357739008
Did not find matching results_pattern in tweet 1487826071357739008
Only found data for [] in tweet 1487826071357739008
Did not find matching wordle_pattern in tweet 1486840170406158341
Did not find matching results_pattern in tweet 1486840170406158341
Did not find matching results_pattern in tweet 1486840170406158341
Only found data for [] in tweet 1486840170406158341
Did not find matching wordle_pattern in tweet 1483455698038050820
Did not find matching results_pattern in tweet 1483455698038050820
Did not find matching results_pattern in tweet 1483455698038050820
Only found data for [] in tweet 148345569803

In [71]:
# Print the text of every tweet that failed parsing so it can be reviewed by eye.
# The text is already available in target_tweets, so reuse it instead of
# spending one extra API request per tweet (which also fails for tweets that
# can no longer be fetched).
for tweet in target_tweets:
    if tweet.id in invalid_tweets:
        print(tweet.text)

@CristinaAmpil What would an aggregate distribution look like? I’m unfamiliar with stats/etc
@Gary_Boyd_NZ Just for you for today Gary:

3,073 hard mode players.

1:  1%
2:  4%
3: 🟩🟩🟩 16%
4: 🟩🟩🟩🟩🟩🟩 27%
5: 🟩🟩🟩🟩🟩🟩🟩 30%
6: 🟩🟩🟩🟩 19%
X: 🟩 4%
@WordleHaiku These are the full results for everyone :)
@24Acoustics @PlanningActBlog You are correct! This should now be fixed.
@WordleHaiku Just for you for today!

132,770 results found on Twitter.
3,346 hard mode players.

1:  0.63%
2:  2.28%
3: 🟩🟩🟩 13.08%
4: 🟩🟩🟩🟩🟩🟩🟩 29.11%
5: 🟩🟩🟩🟩🟩🟩🟩 31.45%
6: 🟩🟩🟩🟩🟩 20.32%
X:  3.13%
@fudo @gooeyblob @WordleFRStats Yes, all players. I’ll see if adding the average makes sense, thanks!
@kyfdx Whole group


Looking through the invalid tweets, there is no sign that regular WordleStats result tweets are being systematically misidentified. Only two actual sets of Wordle results are excluded (both are replies that do not include the Wordle number), so for my purposes I will simply drop them.

In [83]:
# Build a DataFrame with one row per successfully parsed tweet; the columns
# come from the keys of the tweet_data dicts collected in wordle_data.
wordle_df = pd.DataFrame(wordle_data)
wordle_df

Unnamed: 0,wordle_number,total_players,hardmode_players,score_1,score_2,score_3,score_4,score_5,score_6,score_X
0,568,21510,2054,0.02,0.10,0.29,0.33,0.20,0.06,0.01
1,567,21946,2072,0.01,0.08,0.29,0.37,0.19,0.06,0.01
2,566,19725,1936,0.00,0.01,0.12,0.33,0.33,0.17,0.03
3,565,22283,2078,0.00,0.03,0.22,0.42,0.25,0.07,0.01
4,564,22394,2207,0.00,0.05,0.18,0.25,0.22,0.19,0.11
...,...,...,...,...,...,...,...,...,...,...
362,206,153880,3017,0.01,0.09,0.35,0.34,0.16,0.05,0.01
363,205,107134,2242,0.01,0.04,0.16,0.30,0.30,0.17,0.02
364,204,91477,1913,0.01,0.03,0.13,0.27,0.30,0.22,0.04
365,203,101503,1763,0.01,0.05,0.23,0.31,0.24,0.14,0.02


#### Data Exploration