In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime

Read in the hashtag list.

In [2]:
team_hashes = {}
with open('teams.nfl.hashtags.txt', 'r') as f:
    for line in f:
        words = line.split()
        team = words[0].replace('-', ' ')
        hashes = words[1:]
        team_hashes[team] = hashes

Code to get all hashtags from a given tweet; function below takes a tweet and a team name (that we want that tweet to only be associated with) and checks if it contains tweets from any other team.

In [3]:
def get_hashtags(tweet):
    pattern = r'#{1}\w*'
    return re.findall(pattern, tweet)

In [4]:
def one_team_only(one_tweet, which_team):
    all_teams = team_hashes.copy()
    good = all_teams.pop(which_team)
    temp = [team_hashes[key] for key in all_teams]
    bad_teams = [item for sublist in temp for item in sublist]
    bad_teams = set(bad_teams)
    curr_hashes = set(get_hashtags(one_tweet))
    
    if len(list(curr_hashes & bad_teams)) != 0:
            return False
    return True

Read in the data for a full season, and convert the type of the datetime column.

In [5]:
full_season = pd.read_csv('2020_all_data.csv', index_col=0)
full_season['Datetime'] = pd.to_datetime(full_season['Datetime'], utc=True)

Initialize containers. Home_tweets contains all tweets related to the home teams, same for away, and corpus contains all the unique words seen.

In [6]:
home_tweets = []
away_tweets = []
corpus = {}

In [7]:
# count unigrams
def count_unigrams(tweet):
    global corpus
    words = tweet.split(" ")
    for word in words:
        if word in corpus:
            corpus[word] += 1
        else:
            corpus[word] = 1

Returns all valid tweets for one game, for one team. A valid tweet is one which is between the desired start and end times; has at least 10 likes; and only contains hashtags related to one team.

In [8]:
def one_game_one_team(game_time, team_name):
    start = game_time - datetime.timedelta(days=1)
    end = game_time - datetime.timedelta(hours=1)
    date_range = " since:" + start.strftime('%Y-%m-%d') + " until:" + end.strftime('%Y-%m-%d')
    
    team_tags = " OR ".join(team_hashes[team_name])
    query = team_tags + date_range
    
    # Scrape the tweets for the date_range. Also have to filter based on the 
    # time stamp so as not to capture tweets during and after games.
    to_return = []
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > 10000:
            break
        if not (start <= tweet.date and tweet.date <= end):
            continue
        if tweet.likeCount < 10:
            continue
        if not one_team_only(tweet.content.lower(), team_name):
            continue
        tw = tweet.content.lower()
        to_return.append(tw)
        count_unigrams(tw)
    return to_return

Gets all tweets for an entire season. Uses the above helper function to get a

In [9]:
def get_season_tweets(df):
    for game in range(len(df)):
        gametime = df.iloc[game]['Datetime']
        hometeam = df.iloc[game]['Home']
        awayteam = df.iloc[game]['Away']
        
        h_tweets = one_game_one_team(gametime, hometeam)
        a_tweets = one_game_one_team(gametime, awayteam)
        
        home_tweets.append(h_tweets)
        away_tweets.append(a_tweets)
        
    return        

In [10]:
get_season_tweets(full_season)

In [86]:
import pickle
with open('data/2020season_home.pkl', 'wb') as f:
    pickle.dump(home_tweets, f)
with open('2020season_away.pkl', 'wb') as f:
    pickle.dump(away_tweets, f)
with open('2020season_corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)

Delete words with less than 2 letters or less from the corpus.

In [50]:
revised_corpus = corpus.copy()
for key in list(revised_corpus.keys()):
    if len(key) <= 2:
        del revised_corpus[key]

Get the top 100 hundred unigrams by frequency. Call this our corpus (can be changed)

In [129]:
top_grams = [x[0] for x in sorted(revised_corpus.items(), key=lambda x: x[1], reverse=True)[:3000]]

A function that, given a list of (valid) tweets for one game and a corpus of interest, returns a numerical version representation of the game. Each number in the return vector represents one word in the corpus, and it is computed as the number of occurrences of that word in tweets about this game, divided by the total number of tweets about this game.

In [130]:
from collections import Counter
def vectorize_list(list_of_tweets, corpus):
    num_tweets = len(list_of_tweets)
    lol = [x.split() for x in list_of_tweets]
    with_repeats = [item for sublist in lol for item in sublist]
    counts = dict(Counter(with_repeats))
    to_return = []
    for key in corpus:
        num = counts[key]/num_tweets if key in counts else 0
        to_return.append(num)
    return to_return

In [131]:
vectorize_list(home_tweets[0], ['lots', 'of', 'a','at'])

[0.08333333333333333, 0.5, 0.16666666666666666, 0.08333333333333333]

Vectorize each home and each away game.

In [132]:
num_h_tweets = [vectorize_list(game, top_grams) for game in home_tweets]
num_a_tweets = [vectorize_list(game, top_grams) for game in away_tweets]

Concatenate the home and away vectors as in the papers, and use this to produce our X and Y to perform models on.

In [133]:
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)
X = np.concatenate([home_vecs, away_vecs], axis=1)

In [134]:
Y = full_season.iloc[:,-1].values

In [135]:
X.shape

(256, 6000)

In [136]:
Y.shape

(256,)

In [143]:
everything = np.concatenate([X,Y.reshape(-1,1)], axis=1)
np.random.shuffle(everything)
xs = everything[:,:-1]
ys = everything[:,-1]
X_train = xs[:200]
X_test = xs[200:]
Y_train = ys[:200]
Y_test = ys[200:]

In [144]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, Y_train)
lr.score(X_test, Y_test)

0.625

In [145]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=1000)
ada.fit(X_train, Y_train)
ada.score(X_test, Y_test)

0.5

In [77]:
np.save('data/X_2020_1000unigrams_no2letters', X)

In [78]:
np.save('data/Y_2020_1000unigrams_no2letters', Y)

In [146]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train, Y_train)
dt.score(X_test, Y_test)

0.5714285714285714

In [147]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train, Y_train)
rf.score(X_test, Y_test)

0.5178571428571429

In [148]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=1000)
mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

0.5178571428571429