In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime
import pickle
import nltk
import sklearn
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.layers import *
from keras.regularizers import l1,l2, L1L2

In [2]:
# Seed every random number generator used in this notebook so that
# Restart Kernel -> Run All gives the same numbers.
seed_value = 273958
import os
# NOTE: Python reads PYTHONHASHSEED when the interpreter starts, so setting it
# here does not change hashing in the already-running kernel; it is only
# inherited by processes launched from this notebook.
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
# Seed the standard-library generator as well (it was imported but never seeded).
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
import tensorflow as tf
tf.random.set_seed(seed_value)

In [4]:
# load in tweets
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can run arbitrary code -- only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

home_2019_tweets = _read_pickle('../data/pickled_tweets/home_2019_3daysback_nolikecriterion.pkl')
away_2019_tweets = _read_pickle('../data/pickled_tweets/away_2019_3daysback_nolikecriterion.pkl')
home_2020_tweets = _read_pickle('../data/pickled_tweets/home_2020_3daysback_nolikecriterion.pkl')
away_2020_tweets = _read_pickle('../data/pickled_tweets/away_2020_3daysback_nolikecriterion.pkl')

# load in nfl data (first CSV column becomes the index)
s2019 = pd.read_csv('../data/season_data/2019_all_data.csv', index_col=0)
s2020 = pd.read_csv('../data/season_data/2020_all_data.csv', index_col=0)

In [5]:
#Define a list of stopwords to remove.
# lemmatize() drops every token found in this list unless it is called with
# include_stopwords=True.
# NOTE(review): presumably the NLTK "stopwords" corpus must already be
# downloaded for this call to succeed -- confirm in the environment setup.
stopwords = nltk.corpus.stopwords.words("english")

# lemmatize function
def lemmatize(sentence, include_stopwords=False):
    """Lemmatize every token of ``sentence`` with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to lemmatize.
    include_stopwords : bool, default False
        When False, tokens that appear in the module-level ``stopwords`` list
        are dropped (the check is made on the token before lemmatizing).
        When True, every token is kept.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    # One lemmatizer per call instead of a new instance for every token.
    lemmatizer = WordNetLemmatizer()
    if include_stopwords:
        return [lemmatizer.lemmatize(word) for word in sentence]
    # Set membership is constant time; scanning the list for each token is not.
    stop_set = set(stopwords)
    return [lemmatizer.lemmatize(word) for word in sentence if word not in stop_set]

# preprocess the tweets - remove punctuation and lemmatize
def preprocess(tweets):
    """Tokenize and lemmatize every tweet of ``tweets`` in place.

    ``tweets`` is a sequence of mutable sequences of tweet strings. Each
    string is replaced by the list returned by ``lemmatize`` after every
    character that is not an ASCII letter has been turned into a space and
    the text has been split on whitespace.

    Entries that are already lists are left untouched, so running the
    function (or its notebook cell) a second time is a no-op instead of a
    ``TypeError`` from ``re.sub``.

    Returns
    -------
    None
    """
    non_letters = re.compile('[^a-zA-Z]')  # compiled once, reused for every tweet
    for group in tweets:
        for j, tweet in enumerate(group):
            if isinstance(tweet, list):
                # Already tokenized by an earlier call.
                continue
            group[j] = lemmatize(non_letters.sub(' ', tweet).split())

# Tokenize and lemmatize all four tweet collections in place.
for season_tweets in (home_2019_tweets, away_2019_tweets,
                      home_2020_tweets, away_2020_tweets):
    preprocess(season_tweets)

In [6]:
from collections import Counter
def vectorize_list(list_of_tweets, corpus):
    """Turn a group of tokenized tweets into a log-count feature vector.

    Parameters
    ----------
    list_of_tweets : iterable of iterables of hashable tokens
        The tweets to pool together; token counts are summed over all of them.
    corpus : iterable of hashable tokens
        The vocabulary; it fixes the order and the length of the result.

    Returns
    -------
    list
        One entry per item of ``corpus``: ``np.log(1 + count)`` when the token
        occurs in the pooled tweets, otherwise the integer ``0``.
    """
    counts = Counter(token for tweet in list_of_tweets for token in tweet)
    # Membership test (rather than counts[key]) keeps the literal 0 for absent tokens.
    return [np.log(1+counts[key]) if key in counts else 0 for key in corpus]

In [7]:
# count unigrams
def count_unigrams(tweet,corpus):
    """Add the occurrences of each token in ``tweet`` to the ``corpus`` dict.

    ``corpus`` maps token -> running count and is updated in place; tokens not
    seen before start at 1. Nothing is returned.
    """
    for token in tweet:
        corpus[token] = corpus.get(token, 0) + 1

In [8]:
#Create the home/away corpus
# Token -> occurrence count over all 2019 tweets, kept separately for home and away.
home_corpus, away_corpus = {}, {}
for season_games, gram_counts in ((home_2019_tweets, home_corpus),
                                  (away_2019_tweets, away_corpus)):
    for game_tweets in season_games:
        for tokens in game_tweets:
            count_unigrams(tokens, gram_counts)

In [9]:
# remove words with <= 2 characters
for gram_counts in (home_corpus, away_corpus):
    # Collect the keys first: a dict must not change size while it is iterated.
    short_words = [word for word in gram_counts if len(word) <= 2]
    for word in short_words:
        del gram_counts[word]

In [10]:
# get number of tweets for all home and away teams
n_home_tweets = sum(map(len, home_2019_tweets))
n_away_tweets = sum(map(len, away_2019_tweets))

# keep the unigrams whose total occurrence count is strictly greater than
# 0.1% of the number of home/away tweets
home_top_grams = [word for word, count in home_corpus.items() if count > n_home_tweets*0.001]
away_top_grams = [word for word, count in away_corpus.items() if count > n_away_tweets*0.001]

In [11]:
def build_feature_matrix(home_games, away_games):
    """Build the design matrix for one season.

    Each game contributes one row: the log-count vector of its home tweets
    over ``home_top_grams`` followed by the log-count vector of its away
    tweets over ``away_top_grams``.

    Parameters
    ----------
    home_games, away_games : iterables of tokenized tweet groups
        One group per game; both are expected to list the games in the same
        order so that rows line up.

    Returns
    -------
    numpy.ndarray
        Home columns first, then away columns.
    """
    home_vecs = np.array([vectorize_list(game, home_top_grams) for game in home_games])
    away_vecs = np.array([vectorize_list(game, away_top_grams) for game in away_games])
    return np.concatenate([home_vecs, away_vecs], axis=1)

# TRAIN SET
X_train = build_feature_matrix(home_2019_tweets, away_2019_tweets)

# TEST SET
# Uses the same vocabularies as the training matrix, so columns match.
X_test = build_feature_matrix(home_2020_tweets, away_2020_tweets)

In [12]:
# TRAIN and TEST outcomes
# The "Home Win" column of each season table, copied into a NumPy array.
Y_train, Y_test = (np.array(season["Home Win"]) for season in (s2019, s2020))

In [13]:
# Basic classifiers on full data
# Every model is fit exactly once on the training matrix and its accuracy on
# the test matrix is printed, in the order: lr, lrl1, lrl2, ada, rf, gnb.
lr = LogisticRegression(penalty="l1",solver="liblinear",C=0.1)
lr.fit(X_train, Y_train)
print(lr.score(X_test, Y_test))

# fit() returns the fitted estimator, so chaining it is enough; a second
# fit() call would only repeat the 5-fold cross-validation.
lrl1 = LogisticRegressionCV(cv=5,penalty="l1",solver="liblinear",
                            max_iter=1000).fit(X_train, Y_train)
print(lrl1.score(X_test, Y_test))

lrl2 = LogisticRegressionCV(cv=5,penalty="l2",solver="liblinear",
                            max_iter=1000).fit(X_train, Y_train)
print(lrl2.score(X_test, Y_test))

# Explicit random_state: the ensembles no longer depend on how many random
# draws earlier cells consumed from the global NumPy generator.
ada = AdaBoostClassifier(n_estimators=200, random_state=seed_value)
ada.fit(X_train, Y_train)
print(ada.score(X_test, Y_test))

rf = RandomForestClassifier(n_estimators=400, random_state=seed_value)
rf.fit(X_train, Y_train)
print(rf.score(X_test, Y_test))

gnb = GaussianNB().fit(X_train, Y_train)
print(gnb.score(X_test, Y_test))

0.57421875
0.56640625
0.625
0.51953125
0.55859375
0.5703125


### Sparse L1 Coefficients

In [14]:
# L1-penalized logistic regression (C=0.1) whose coefficients are inspected
# in the cells of this section; prints its accuracy on the test matrix.
lr = LogisticRegression(penalty="l1",solver="liblinear",C=0.1).fit(X_train, Y_train)
print(lr.score(X_test, Y_test))

0.57421875


In [15]:
np.nonzero(lr.coef_)[1].size  # number of non-zero lr coefficients

58

In [16]:
# Vocabulary in design-matrix column order: home grams first, then away grams.
all_grams = np.array([*home_top_grams, *away_top_grams])
all_grams[np.nonzero(lr.coef_)[1]]  # grams corresponding to nonzero entries

array(['fox', 'ram', 'que', 'doubt', 'yankee', 'mlb', 'scored', 'adam',
       'offense', 'owner', 'tnf', 'tournament', 'snap', 'johnson', 'pro',
       'del', 'beginning', 'florida', 'prime', 'gordon', 'bruce',
       'hamstring', 'dallascowboys', 'drew', 'tennessee', 'texas', 'bucs',
       'ravensnation', 'wpmoychallenge', 'que', 'thursday', 'september',
       'sam', 'shot', 'mike', 'rush', 'senior', 'dallascowboys', 'falcon',
       'odds', 'colt', 'robert', 'state', 'share', 'chiefskingdom',
       'november', 'ankle', 'tune', 'harris', 'saint', 'obj', 'cold',
       'seahawks', 'raven', 'halloween', 'wilson', 'whodey',
       'raidernation'], dtype='<U19')

In [17]:
# Column indices of the non-zero lr coefficients.
np.nonzero(lr.coef_)[1]

array([ 374,  388,  390,  393,  408,  409,  424,  428,  453,  577,  586,
        687,  799,  818,  827,  870,  978, 1073, 1100, 1108, 1229, 1236,
       1249, 1263, 1264, 1363, 1407, 1463, 1507, 1527, 1620, 1823, 1873,
       1887, 1890, 1905, 1947, 2014, 2024, 2069, 2165, 2199, 2276, 2376,
       2402, 2426, 2487, 2509, 2536, 2651, 2693, 2712, 2787, 2805, 2807,
       2877, 2933, 3020])

In [18]:
# Sign of every non-zero lr coefficient, as a column vector.
signs = np.sign(lr.coef_.ravel()[np.nonzero(lr.coef_)[1]])[:, np.newaxis]

In [19]:
# 'H' for columns that come from home_top_grams, 'A' for away_top_grams.
# Home grams occupy columns 0 .. len(home_top_grams)-1, so the first away
# column is index len(home_top_grams) itself -- hence >=, not >.
home_away = np.where(np.where(lr.coef_ != 0)[1] >= len(home_top_grams), 'A', 'H').reshape(-1,1)

In [20]:
# Grams whose lr coefficient is non-zero.
all_grams[np.nonzero(lr.coef_)[1]]

array(['fox', 'ram', 'que', 'doubt', 'yankee', 'mlb', 'scored', 'adam',
       'offense', 'owner', 'tnf', 'tournament', 'snap', 'johnson', 'pro',
       'del', 'beginning', 'florida', 'prime', 'gordon', 'bruce',
       'hamstring', 'dallascowboys', 'drew', 'tennessee', 'texas', 'bucs',
       'ravensnation', 'wpmoychallenge', 'que', 'thursday', 'september',
       'sam', 'shot', 'mike', 'rush', 'senior', 'dallascowboys', 'falcon',
       'odds', 'colt', 'robert', 'state', 'share', 'chiefskingdom',
       'november', 'ankle', 'tune', 'harris', 'saint', 'obj', 'cold',
       'seahawks', 'raven', 'halloween', 'wilson', 'whodey',
       'raidernation'], dtype='<U19')

In [21]:
# One row per non-zero lr coefficient: gram, sign, column index, home/away flag.
nonzero_cols = np.nonzero(lr.coef_)[1]
np.hstack([all_grams[nonzero_cols][:, np.newaxis], signs, nonzero_cols[:, np.newaxis], home_away])

array([['fox', '1.0', '374', 'H'],
       ['ram', '1.0', '388', 'H'],
       ['que', '1.0', '390', 'H'],
       ['doubt', '-1.0', '393', 'H'],
       ['yankee', '-1.0', '408', 'H'],
       ['mlb', '-1.0', '409', 'H'],
       ['scored', '1.0', '424', 'H'],
       ['adam', '1.0', '428', 'H'],
       ['offense', '1.0', '453', 'H'],
       ['owner', '-1.0', '577', 'H'],
       ['tnf', '1.0', '586', 'H'],
       ['tournament', '-1.0', '687', 'H'],
       ['snap', '1.0', '799', 'H'],
       ['johnson', '1.0', '818', 'H'],
       ['pro', '1.0', '827', 'H'],
       ['del', '1.0', '870', 'H'],
       ['beginning', '1.0', '978', 'H'],
       ['florida', '-1.0', '1073', 'H'],
       ['prime', '1.0', '1100', 'H'],
       ['gordon', '1.0', '1108', 'H'],
       ['bruce', '-1.0', '1229', 'H'],
       ['hamstring', '-1.0', '1236', 'H'],
       ['dallascowboys', '1.0', '1249', 'H'],
       ['drew', '1.0', '1263', 'H'],
       ['tennessee', '-1.0', '1264', 'H'],
       ['texas', '1.0', '1363', 'H'],
   

### Cross Validated L1

In [22]:
np.nonzero(lrl1.coef_)[1].size  # number of non-zero lrl1 coefficients

1958

In [25]:
# Column indices of the 20 largest lrl1 coefficients (argpartition does not
# sort them among themselves), then the matching grams.
max_positives = np.argpartition(lrl1.coef_.ravel(), -20)[-20:]
np.take(all_grams, max_positives)

array(['drew', 'force', 'moore', 'surgery', 'ranking', 'participant',
       'mack', 'professional', 'michigan', 'cold', 'value', 'anderson',
       'harris', 'turnover', 'whodey', 'ground', 'tnf', 'november',
       'ravensnation', 'beginning'], dtype='<U19')

In [26]:
# Table of the selected lrl1 coefficients: gram, sign, column index, home/away flag.
signs = np.sign(np.take(lrl1.coef_, max_positives)).reshape(-1,1)
# The first away column is index len(home_top_grams) itself -- hence >=, not >.
home_away = np.where(max_positives >= len(home_top_grams), 'A', 'H').reshape(-1,1)
np.concatenate([all_grams[max_positives].reshape(-1,1), signs, max_positives.reshape(-1,1), home_away], axis=1)

array([['drew', '1.0', '1263', 'H'],
       ['force', '1.0', '1259', 'H'],
       ['moore', '1.0', '1260', 'H'],
       ['surgery', '1.0', '2245', 'A'],
       ['ranking', '1.0', '936', 'H'],
       ['participant', '1.0', '2544', 'A'],
       ['mack', '1.0', '1842', 'A'],
       ['professional', '1.0', '1027', 'H'],
       ['michigan', '1.0', '2480', 'A'],
       ['cold', '1.0', '2712', 'A'],
       ['value', '1.0', '2695', 'A'],
       ['anderson', '1.0', '2421', 'A'],
       ['harris', '1.0', '2536', 'A'],
       ['turnover', '1.0', '2716', 'A'],
       ['whodey', '1.0', '2933', 'A'],
       ['ground', '1.0', '1352', 'H'],
       ['tnf', '1.0', '586', 'H'],
       ['november', '1.0', '2426', 'A'],
       ['ravensnation', '1.0', '1463', 'H'],
       ['beginning', '1.0', '978', 'H']], dtype='<U32')

In [27]:
# Column indices of the 20 smallest (most negative) lrl1 coefficients,
# found as the 20 largest of the negated vector, then the matching grams.
max_negatives = np.argpartition(-lrl1.coef_.ravel(), -20)[-20:]
np.take(all_grams, max_negatives)

array(['terrible', 'tip', 'turned', 'duke', 'loud', 'ravensnation',
       'tyler', 'goravens', 'ness', 'preseason', 'lmao', 'county', 'ward',
       'sundberg', 'kcchiefs', 'aigo', 'cat', 'watt', 'september',
       'hater'], dtype='<U19')

In [28]:
# Table of the selected lrl1 coefficients: gram, sign, column index, home/away flag.
signs = np.sign(np.take(lrl1.coef_, max_negatives)).reshape(-1,1)
# The first away column is index len(home_top_grams) itself -- hence >=, not >.
home_away = np.where(max_negatives >= len(home_top_grams), 'A', 'H').reshape(-1,1)
np.concatenate([all_grams[max_negatives].reshape(-1,1), signs, max_negatives.reshape(-1,1), home_away], axis=1)

array([['terrible', '-1.0', '2793', 'A'],
       ['tip', '-1.0', '826', 'H'],
       ['turned', '-1.0', '2434', 'A'],
       ['duke', '-1.0', '1950', 'A'],
       ['loud', '-1.0', '2216', 'A'],
       ['ravensnation', '-1.0', '2928', 'A'],
       ['tyler', '-1.0', '2381', 'A'],
       ['goravens', '-1.0', '2926', 'A'],
       ['ness', '-1.0', '2929', 'A'],
       ['preseason', '-1.0', '997', 'H'],
       ['lmao', '-1.0', '1051', 'H'],
       ['county', '-1.0', '1026', 'H'],
       ['ward', '-1.0', '2880', 'A'],
       ['sundberg', '-1.0', '1465', 'H'],
       ['kcchiefs', '-1.0', '2920', 'A'],
       ['aigo', '-1.0', '3028', 'A'],
       ['cat', '-1.0', '788', 'H'],
       ['watt', '-1.0', '2817', 'A'],
       ['september', '-1.0', '1823', 'A'],
       ['hater', '-1.0', '236', 'H']], dtype='<U32')

### Cross Validated L2

In [None]:
np.nonzero(lrl2.coef_)[1].size  # number of non-zero lrl2 coefficients

In [None]:
# Column indices of the 20 largest lrl2 coefficients (not sorted among
# themselves), then the matching grams.
max_positives = np.argpartition(lrl2.coef_.ravel(), -20)[-20:]
# Vocabulary in column order: home grams first, then away grams.
all_grams = np.array([*home_top_grams, *away_top_grams])
np.take(all_grams, max_positives)

In [None]:
# Table of the selected lrl2 coefficients: gram, sign, column index, home/away flag.
signs = np.sign(np.take(lrl2.coef_, max_positives)).reshape(-1,1)
# The first away column is index len(home_top_grams) itself -- hence >=, not >.
home_away = np.where(max_positives >= len(home_top_grams), 'A', 'H').reshape(-1,1)
np.concatenate([all_grams[max_positives].reshape(-1,1), signs, max_positives.reshape(-1,1), home_away], axis=1)

In [None]:
# Column indices of the 20 smallest (most negative) lrl2 coefficients,
# found as the 20 largest of the negated vector, then the matching grams.
max_negatives = np.argpartition(-lrl2.coef_.ravel(), -20)[-20:]
# Vocabulary in column order: home grams first, then away grams.
all_grams = np.array([*home_top_grams, *away_top_grams])
np.take(all_grams, max_negatives)

In [None]:
# Table of the selected lrl2 coefficients: gram, sign, column index, home/away flag.
signs = np.sign(np.take(lrl2.coef_, max_negatives)).reshape(-1,1)
# The first away column is index len(home_top_grams) itself -- hence >=, not >.
home_away = np.where(max_negatives >= len(home_top_grams), 'A', 'H').reshape(-1,1)
# Every piece must be a column vector: concatenating along axis=1 fails if the
# gram array is left one-dimensional.
np.concatenate([all_grams[max_negatives].reshape(-1,1), signs, max_negatives.reshape(-1,1), home_away], axis=1)