In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime
import pickle
import nltk
import sklearn
import matplotlib.pyplot as plt
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.layers import *
from keras.regularizers import l1,l2, L1L2

In [2]:
seed_value = 273958
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
import numpy as np
np.random.seed(seed_value)
import tensorflow as tf
tf.random.set_seed(seed_value)

In [3]:
# load in tweets
with open('../data/pickled_tweets/home_2019_3daysback_nolikecriterion.pkl', 'rb') as f:
    home_2019_tweets = pickle.load(f)
with open('../data/pickled_tweets/away_2019_3daysback_nolikecriterion.pkl', 'rb') as f:
    away_2019_tweets = pickle.load(f)
with open('../data/pickled_tweets/home_2020_3daysback_nolikecriterion.pkl', 'rb') as f:
    home_2020_tweets = pickle.load(f)
with open('../data/pickled_tweets/away_2020_3daysback_nolikecriterion.pkl', 'rb') as f:
    away_2020_tweets = pickle.load(f)

# load in nfl data
s2020 = pd.read_csv('../data/season_data/2020_all_data.csv', index_col=0)
s2019 = pd.read_csv('../data/season_data/2019_all_data.csv', index_col=0)

# load in nfl data
s2020b = pd.read_csv('../data/season_data/2020_with_betting.csv', index_col=0)
s2019b = pd.read_csv('../data/season_data/2019_with_betting.csv', index_col=0)

In [4]:
sw = stopwords.words('english')

# lemmatize function
def lemmatize(sentence, include_stopwords=False):
    if include_stopwords:
        return [WordNetLemmatizer().lemmatize(word) for word in sentence]
    return [WordNetLemmatizer().lemmatize(word) for word in sentence if word not in sw]

# preprocess the tweets - remove punctuation and lemmatize
def preprocess(tweets):
    for i in range(len(tweets)):
        for j in range(len(tweets[i])):
            tweets[i][j] = re.sub('[^a-zA-Z]',' ',tweets[i][j]).split()
            tweets[i][j] = lemmatize(tweets[i][j])

preprocess(home_2019_tweets)
preprocess(away_2019_tweets)
preprocess(home_2020_tweets)
preprocess(away_2020_tweets)

In [5]:
from collections import Counter
def vectorize_list(list_of_tweets, corpus):
    num_tweets = len(list_of_tweets)
    lol = [nltk.bigrams(x) for x in list_of_tweets]
    with_repeats = [item for sublist in lol for item in sublist]
    counts = dict(Counter(with_repeats))
    to_return = []
    for key in corpus:
        num = np.log(1+counts[key]) if key in counts else 0
        to_return.append(num)
    return to_return

In [6]:
# count bigrams
def count_bigrams(tweet,corpus):
    bigramized_tweet = nltk.bigrams(tweet)
    for bigram in bigramized_tweet:
        if bigram in corpus:
            corpus[bigram] += 1
        else:
            corpus[bigram] = 1

In [7]:
#Create the home/away corpus
home_corpus = {}
away_corpus = {}
for tweets in home_2019_tweets:
    for tw in tweets:
        count_bigrams(tw,home_corpus)

for tweets in away_2019_tweets:
    for tw in tweets:
        count_bigrams(tw,away_corpus)

In [8]:
# get number of tweets for all home and away teams
n_home_tweets = sum([len(game_tweets) for game_tweets in home_2019_tweets])
n_away_tweets = sum([len(game_tweets) for game_tweets in away_2019_tweets])

# get the unigrams that appear in at least 0.1% of home/away tweets
home_top_grams = [word for word in home_corpus if home_corpus[word] > n_home_tweets*0.0001]
away_top_grams = [word for word in away_corpus if away_corpus[word] > n_away_tweets*0.0001]

In [9]:
# TRAIN SET
#Create lists of vectors for home, away games
num_h_tweets = [vectorize_list(game, home_top_grams) for game in home_2019_tweets]
num_a_tweets = [vectorize_list(game, away_top_grams) for game in away_2019_tweets]

#Turn into arrays
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)

#Add in money lines
home_ml = s2019b['Home ML'].values.reshape(-1,1)
away_ml = s2019b['Away ML'].values.reshape(-1,1)

#Concatenate home, away to form input matrix.
X_train = np.concatenate([home_ml, home_vecs, away_ml, away_vecs], axis=1)

# TEST SET
#Create lists of vectors for home, away games
num_h_tweets = [vectorize_list(game, home_top_grams) for game in home_2020_tweets]
num_a_tweets = [vectorize_list(game, away_top_grams) for game in away_2020_tweets]

#Turn into arrays
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)

#Add in money lines
home_ml = s2020b['Home ML'].values.reshape(-1,1)
away_ml = s2020b['Away ML'].values.reshape(-1,1)

#Concatenate home, away to form input matrix.
X_test = np.concatenate([home_ml, home_vecs, away_ml, away_vecs], axis=1)

# TRAIN and TEST outcomes
Y_train = np.array(s2019b["Home Win"])
Y_test = np.array(s2020b["Home Win"])

In [14]:
# Basic classifiers on full data
lr = LogisticRegression(penalty="l1",solver="liblinear",C=0.1)
lr.fit(X_train, Y_train)
print(lr.score(X_test, Y_test))

lrl1 = LogisticRegressionCV(cv=5,penalty="l1",solver="liblinear",
                            max_iter=1000).fit(X_train, Y_train)
lrl1.fit(X_train, Y_train)
print(lrl1.score(X_test, Y_test))

lrl2 = LogisticRegressionCV(cv=5,penalty="l2",solver="liblinear",
                            max_iter=1000).fit(X_train, Y_train)
lrl2.fit(X_train, Y_train)
print(lrl2.score(X_test, Y_test))

ada = AdaBoostClassifier(n_estimators=200)
ada.fit(X_train, Y_train)
print(ada.score(X_test, Y_test))

rf = RandomForestClassifier(n_estimators=400)
rf.fit(X_train, Y_train)
print(rf.score(X_test, Y_test))

gnb = GaussianNB().fit(X_train, Y_train)
gnb.fit(X_train, Y_train)
print(gnb.score(X_test, Y_test))

0.6796875
0.68359375
0.68359375
0.62109375
0.64453125
0.56640625


### Betting All Models

In [15]:
preds = np.where(X_test[:,0] < 0, 1, 0)
1-np.count_nonzero(preds-Y_test)/len(Y_test)

0.68359375

In [16]:
model_preds = np.round(lr.predict(X_test)).astype('int')
naive_preds = preds ##as defined above
np.count_nonzero(model_preds.reshape(-1) - naive_preds) #Number of games where our model doesn't pick favorite

13

In [17]:
betting_df = s2020b.copy()
betting_df['My Preds'] = model_preds
betting_df['Naive Preds'] = preds
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] < 0), -10000/betting_df['Home ML'], -100)
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] > 0), betting_df['Home ML'], betting_df['Home Pick Payout'])
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] < 0), -10000/betting_df['Away ML'], -100)
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] > 0), betting_df['Away ML'], betting_df['Away Pick Payout'])
betting_df['My Preds Payout'] = np.where(betting_df['My Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
betting_df['Naive Preds Payout'] = np.where(betting_df['Naive Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])

In [18]:
np.sum(betting_df['My Preds Payout']), np.sum(betting_df['Naive Preds Payout'])

(-589.9499790537835, -618.9233628737145)

In [19]:
model_preds = np.round(lrl1.predict(X_test)).astype('int')
naive_preds = preds ##as defined above
np.count_nonzero(model_preds.reshape(-1) - naive_preds) #Number of games where our model doesn't pick favorite

0

In [20]:
model_preds = np.round(lrl2.predict(X_test)).astype('int')
naive_preds = preds ##as defined above
np.count_nonzero(model_preds.reshape(-1) - naive_preds) #Number of games where our model doesn't pick favorite

8

In [21]:
betting_df = s2020b.copy()
betting_df['My Preds'] = model_preds
betting_df['Naive Preds'] = preds
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] < 0), -10000/betting_df['Home ML'], -100)
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] > 0), betting_df['Home ML'], betting_df['Home Pick Payout'])
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] < 0), -10000/betting_df['Away ML'], -100)
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] > 0), betting_df['Away ML'], betting_df['Away Pick Payout'])
betting_df['My Preds Payout'] = np.where(betting_df['My Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
betting_df['Naive Preds Payout'] = np.where(betting_df['Naive Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
np.sum(betting_df['My Preds Payout']), np.sum(betting_df['Naive Preds Payout'])

(-593.7022072177764, -618.9233628737145)

In [22]:
model_preds = np.round(ada.predict(X_test)).astype('int')
naive_preds = preds ##as defined above
np.count_nonzero(model_preds.reshape(-1) - naive_preds) #Number of games where our model doesn't pick favorite

84

In [23]:
betting_df = s2020b.copy()
betting_df['My Preds'] = model_preds
betting_df['Naive Preds'] = preds
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] < 0), -10000/betting_df['Home ML'], -100)
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] > 0), betting_df['Home ML'], betting_df['Home Pick Payout'])
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] < 0), -10000/betting_df['Away ML'], -100)
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] > 0), betting_df['Away ML'], betting_df['Away Pick Payout'])
betting_df['My Preds Payout'] = np.where(betting_df['My Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
betting_df['Naive Preds Payout'] = np.where(betting_df['Naive Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
np.sum(betting_df['My Preds Payout']), np.sum(betting_df['Naive Preds Payout'])

(346.42261014431915, -618.9233628737145)

In [24]:
model_preds = np.round(rf.predict(X_test)).astype('int')
naive_preds = preds ##as defined above
np.count_nonzero(model_preds.reshape(-1) - naive_preds) #Number of games where our model doesn't pick favorite

80

In [25]:
betting_df = s2020b.copy()
betting_df['My Preds'] = model_preds
betting_df['Naive Preds'] = preds
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] < 0), -10000/betting_df['Home ML'], -100)
betting_df['Home Pick Payout'] = np.where((betting_df['Home Win'] == 1) & (betting_df['Home ML'] > 0), betting_df['Home ML'], betting_df['Home Pick Payout'])
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] < 0), -10000/betting_df['Away ML'], -100)
betting_df['Away Pick Payout'] = np.where((betting_df['Home Win'] == 0) & (betting_df['Away ML'] > 0), betting_df['Away ML'], betting_df['Away Pick Payout'])
betting_df['My Preds Payout'] = np.where(betting_df['My Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
betting_df['Naive Preds Payout'] = np.where(betting_df['Naive Preds'] == 1, betting_df['Home Pick Payout'], betting_df['Away Pick Payout'])
np.sum(betting_df['My Preds Payout']), np.sum(betting_df['Naive Preds Payout'])

(1621.2299133958174, -618.9233628737145)

In [35]:
1510.99267630983/25600

0.05902315141835273