In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime
import nltk

Read in the hashtag list.

In [2]:
team_hashes = {}
with open('teams_hashtags.txt', 'r') as f:
    for line in f:
        words = line.split()
        team = words[0].replace('-', ' ')
        hashes = words[1:]
        team_hashes[team] = hashes

Code to get all hashtags from a given tweet; function below takes a tweet and a team name (that we want that tweet to only be associated with) and checks if it contains tweets from any other team.

In [3]:
def get_hashtags(tweet):
    pattern = r'#{1}\w*'
    return re.findall(pattern, tweet)

In [4]:
def one_team_only(one_tweet, which_team):
    all_teams = team_hashes.copy()
    good = all_teams.pop(which_team)
    temp = [team_hashes[key] for key in all_teams]
    bad_teams = [item for sublist in temp for item in sublist]
    bad_teams = set(bad_teams)
    curr_hashes = set(get_hashtags(one_tweet))
    
    if len(list(curr_hashes & bad_teams)) != 0:
            return False
    return True

Read in the data for a full season, and convert the type of the datetime column.

In [5]:
full_season = pd.read_csv('2020_all_data.csv', index_col=0)
full_season['Datetime'] = pd.to_datetime(full_season['Datetime'], utc=True)

Initialize containers. Home_tweets contains all tweets related to the home teams, same for away, and corpus contains all the unique words seen.

In [6]:
home_tweets = []
away_tweets = []
corpus = {}

In [None]:
# count unigrams
def count_unigrams(tweet):
    global corpus
    words = tweet.split(" ")
    for word in words:
        if word in corpus:
            corpus[word] += 1
        else:
            corpus[word] = 1

In [7]:
# get bigrams from the list of tweets
def count_bigrams(tweet):
    global corpus
    bigrams = nltk.bigrams(tweet.split(" "))
    for bg in bigrams:
        if bg in corpus:
            corpus[bg] += 1
        else:
            corpus[bg] = 1

Returns all valid tweets for one game, for one team. A valid tweet is one which is between the desired start and end times; has at least 10 likes; and only contains hashtags related to one team.

In [8]:
def one_game_one_team(game_time, team_name):
    start = game_time - datetime.timedelta(days=1)
    end = game_time - datetime.timedelta(hours=1)
    date_range = " since:" + start.strftime('%Y-%m-%d') + " until:" + end.strftime('%Y-%m-%d')
    
    team_tags = " OR ".join(team_hashes[team_name])
    query = team_tags + date_range
    
    # Scrape the tweets for the date_range. Also have to filter based on the 
    # time stamp so as not to capture tweets during and after games.
    to_return = []
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > 10000:
            break
        if not (start <= tweet.date and tweet.date <= end):
            continue
        if tweet.likeCount < 10:
            continue
        if not one_team_only(tweet.content.lower(), team_name):
            continue
        tw = tweet.content.lower()
        to_return.append(tw)
        count_bigrams(tw)
    return to_return

Gets all tweets for an entire season. Uses the above helper function to get a

In [9]:
def get_season_tweets(df):
    for game in range(len(df)):
        print("Processing game number {}".format(game))
        gametime = df.iloc[game]['Datetime']
        hometeam = df.iloc[game]['Home']
        awayteam = df.iloc[game]['Away']
        
        h_tweets = one_game_one_team(gametime, hometeam)
        a_tweets = one_game_one_team(gametime, awayteam)
        
        home_tweets.append(h_tweets)
        away_tweets.append(a_tweets)
        
    return     

In [10]:
get_season_tweets(full_season)

Processing game number 0
Processing game number 1
Processing game number 2
Processing game number 3
Processing game number 4
Processing game number 5
Processing game number 6
Processing game number 7
Processing game number 8
Processing game number 9
Processing game number 10
Processing game number 11
Processing game number 12
Processing game number 13
Processing game number 14
Processing game number 15
Processing game number 16
Processing game number 17
Processing game number 18
Processing game number 19
Processing game number 20
Processing game number 21
Processing game number 22
Processing game number 23
Processing game number 24
Processing game number 25
Processing game number 26
Processing game number 27
Processing game number 28
Processing game number 29
Processing game number 30
Processing game number 31
Processing game number 32
Processing game number 33
Processing game number 34
Processing game number 35
Processing game number 36
Processing game number 37
Processing game number

In [11]:
import pickle
with open('data/pickled_tweets_corpus/2020season_home_bigrams.pkl', 'wb') as f:
    pickle.dump(home_tweets, f)
with open('data/pickled_tweets_corpus/2020season_away_bigrams.pkl', 'wb') as f:
    pickle.dump(away_tweets, f)
with open('data/pickled_tweets_corpus/2020season_corpus_bigrams.pkl', 'wb') as f:
    pickle.dump(corpus, f)

Get the top 100 or top 1000 unigrams or bigrams by frequency. Call this our corpus (can be changed)

In [219]:
top_grams = [x[0] for x in sorted(corpus.items(), key=lambda x: x[1], reverse=True)[:3000]]

A function that, given a list of (valid) tweets for one game and a corpus of interest, returns a numerical version representation of the game. Each number in the return vector represents one word in the corpus, and it is computed as the number of occurrences of that word in tweets about this game, divided by the total number of tweets about this game.

In [220]:
def vectorize_list_bigrams(list_of_tweets, corpus):
    num_tweets = len(list_of_tweets)
    this_dict = {}
    for tweet in list_of_tweets:
        bigrams = nltk.bigrams(tweet.split(" "))
        for bg in bigrams:
            if bg in this_dict:
                this_dict[bg] += 1
            else:
                this_dict[bg] = 1
    to_return = []
    for key in corpus:
        num = this_dict[key]/num_tweets if key in this_dict else 0
        to_return.append(num)
    return to_return

In [221]:
vectorize_list_bigrams(home_tweets[0], [('to', 'the'), ('in', 'the')])

[0.16666666666666666, 0.08333333333333333]

Vectorize each home and each away game.

In [222]:
num_h_tweets = [vectorize_list_bigrams(game, top_grams) for game in home_tweets]
num_a_tweets = [vectorize_list_bigrams(game, top_grams) for game in away_tweets]

Concatenate the home and away vectors as in the papers, and use this to produce our X and Y to perform models on.

In [223]:
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)
X = np.concatenate([home_vecs, away_vecs], axis=1)

In [224]:
Y = full_season.iloc[:,-1].values

In [225]:
X.shape

(256, 6000)

In [226]:
Y.shape

(256,)

In [211]:
np.save('data/preprocessed/X_2020_1000bigrams', X)
np.save('data/preprocessed/Y_2020_1000bigrams', Y)

In [234]:
everything = np.concatenate([X,Y.reshape(-1,1)], axis=1)
np.random.shuffle(everything)
xs = everything[:,:-1]
ys = everything[:,-1]

In [235]:
X_train = xs[:200]
X_test = xs[200:]
Y_train = ys[:200]
Y_test = ys[200:]

In [236]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, Y_train)
lr.score(X_test, Y_test)

0.6607142857142857

In [237]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, Y_train)
ada.score(X_test, Y_test)

0.5178571428571429

In [238]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train, Y_train)
dt.score(X_test, Y_test)

0.6071428571428571

In [239]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train, Y_train)
rf.score(X_test, Y_test)

0.6071428571428571

In [240]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=1000)
mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

0.5714285714285714