In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime
import pickle
import nltk

### Set seed.

In [2]:
seed_value = 273958
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
import numpy as np
np.random.seed(seed_value)
import tensorflow as tf
tf.random.set_seed(seed_value)

In [3]:
import sklearn
import matplotlib.pyplot as plt
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.layers import *
from keras.regularizers import l1,l2, L1L2

### Process data.

In [4]:
# load in tweets
with open('../data/pickled_tweets/home_2019_3daysback_nolikecriterion.pkl', 'rb') as f:
    home_2019_tweets = pickle.load(f)
with open('../data/pickled_tweets/away_2019_3daysback_nolikecriterion.pkl', 'rb') as f:
    away_2019_tweets = pickle.load(f)
with open('../data/pickled_tweets/home_2020_3daysback_nolikecriterion.pkl', 'rb') as f:
    home_2020_tweets = pickle.load(f)
with open('../data/pickled_tweets/away_2020_3daysback_nolikecriterion.pkl', 'rb') as f:
    away_2020_tweets = pickle.load(f)

# load in nfl data
s2020 = pd.read_csv('../data/season_data/2020_all_data.csv', index_col=0)
s2019 = pd.read_csv('../data/season_data/2019_all_data.csv', index_col=0)

# load in nfl data
s2020b = pd.read_csv('../data/season_data/2020_with_betting.csv', index_col=0)
s2019b = pd.read_csv('../data/season_data/2019_with_betting.csv', index_col=0)

In [5]:
sw = stopwords.words('english')

# lemmatize function
def lemmatize(sentence, include_stopwords=False):
    if include_stopwords:
        return [WordNetLemmatizer().lemmatize(word) for word in sentence]
    return [WordNetLemmatizer().lemmatize(word) for word in sentence if word not in sw]

# preprocess the tweets - remove punctuation and lemmatize
def preprocess(tweets):
    for i in range(len(tweets)):
        for j in range(len(tweets[i])):
            tweets[i][j] = re.sub('[^a-zA-Z]',' ',tweets[i][j]).split()
            tweets[i][j] = lemmatize(tweets[i][j])

preprocess(home_2019_tweets)
preprocess(away_2019_tweets)
preprocess(home_2020_tweets)
preprocess(away_2020_tweets)

In [6]:
from collections import Counter
def vectorize_list(list_of_tweets, corpus):
    num_tweets = len(list_of_tweets)
    lol = [nltk.bigrams(x) for x in list_of_tweets]
    with_repeats = [item for sublist in lol for item in sublist]
    counts = dict(Counter(with_repeats))
    to_return = []
    for key in corpus:
        num = np.log(1+counts[key]) if key in counts else 0
        to_return.append(num)
    return to_return

In [7]:
# count bigrams
def count_bigrams(tweet,corpus):
    bigramized_tweet = nltk.bigrams(tweet)
    for bigram in bigramized_tweet:
        if bigram in corpus:
            corpus[bigram] += 1
        else:
            corpus[bigram] = 1

In [8]:
#Create the home/away corpus
home_corpus = {}
away_corpus = {}
for tweets in home_2019_tweets:
    for tw in tweets:
        count_bigrams(tw,home_corpus)

for tweets in away_2019_tweets:
    for tw in tweets:
        count_bigrams(tw,away_corpus)

In [9]:
# get number of tweets for all home and away teams
n_home_tweets = sum([len(game_tweets) for game_tweets in home_2019_tweets])
n_away_tweets = sum([len(game_tweets) for game_tweets in away_2019_tweets])

# get the unigrams that appear in at least 0.1% of home/away tweets
home_top_grams = [word for word in home_corpus if home_corpus[word] > n_home_tweets*0.0001]
away_top_grams = [word for word in away_corpus if away_corpus[word] > n_away_tweets*0.0001]

In [10]:
# TRAIN SET
#Create lists of vectors for home, away games
num_h_tweets = [vectorize_list(game, home_top_grams) for game in home_2019_tweets]
num_a_tweets = [vectorize_list(game, away_top_grams) for game in away_2019_tweets]

#Turn into arrays
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)

#Add in money lines
home_ml = s2019b['Home ML'].values.reshape(-1,1)
away_ml = s2019b['Away ML'].values.reshape(-1,1)

#Concatenate home, away to form input matrix.
X_train = np.concatenate([home_ml, home_vecs, away_ml, away_vecs], axis=1)

In [11]:
# TEST SET
#Create lists of vectors for home, away games
num_h_tweets = [vectorize_list(game, home_top_grams) for game in home_2020_tweets]
num_a_tweets = [vectorize_list(game, away_top_grams) for game in away_2020_tweets]

#Turn into arrays
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)

#Add in money lines
home_ml = s2020b['Home ML'].values.reshape(-1,1)
away_ml = s2020b['Away ML'].values.reshape(-1,1)

#Concatenate home, away to form input matrix.
X_test = np.concatenate([home_ml, home_vecs, away_ml, away_vecs], axis=1)

In [12]:
# TRAIN and TEST outcomes
Y_train = np.array(s2019b["Home Win"])
Y_test = np.array(s2020b["Home Win"])

### Neural Net Prediction - no dimensionality reduction.

In [13]:
reg=1e-5
model = Sequential()
model.add(Dense(1024, activation='tanh', kernel_regularizer=L1L2(reg), input_dim=X_train.shape[1]))
#model.add(Dropout(.1))
model.add(Dense(512, activation='tanh',kernel_regularizer=L1L2(reg)))
#model.add(Dropout(.1))
model.add(Dense(256, activation='tanh',kernel_regularizer=L1L2(reg)))
#model.add(Dropout(.1))
model.add(Dense(128, activation='tanh',kernel_regularizer=L1L2(reg)))
#model.add(Dropout(.1))
model.add(Dense(64, activation='tanh',kernel_regularizer=L1L2(reg)))
#model.add(Dropout(.1))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=50, verbose=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f81fa5b4370>

#### Out of Sample Prediction Raw Accuracy

In [14]:
model.evaluate(X_test, Y_test)



[2.132178544998169, 0.703125]

#### Baseline Accuracy By Betting Favorite

In [15]:
preds = np.where(X_test[:,0] < 0, 1, 0)
1-np.count_nonzero(preds-Y_test)/len(Y_test)

0.68359375

### Compute Betting Gains using Naive Predictions (Favorite) and Our Model's Predictions

In [16]:
model_preds = np.round(model.predict(X_test)).astype('int')
naive_preds = preds ##as defined above
np.count_nonzero(model_preds.reshape(-1) - naive_preds) #Number of games where our model doesn't pick favorite

31

In [17]:
s2020b['My Preds'] = model_preds
s2020b['Naive Preds'] = preds
s2020b['Home Pick Payout'] = np.where((s2020b['Home Win'] == 1) & (s2020b['Home ML'] < 0), -10000/s2020b['Home ML'], -100)
s2020b['Home Pick Payout'] = np.where((s2020b['Home Win'] == 1) & (s2020b['Home ML'] > 0), s2020b['Home ML'], s2020b['Home Pick Payout'])
s2020b['Away Pick Payout'] = np.where((s2020b['Home Win'] == 0) & (s2020b['Away ML'] < 0), -10000/s2020b['Away ML'], -100)
s2020b['Away Pick Payout'] = np.where((s2020b['Home Win'] == 0) & (s2020b['Away ML'] > 0), s2020b['Away ML'], s2020b['Away Pick Payout'])
s2020b['My Preds Payout'] = np.where(s2020b['My Preds'] == 1, s2020b['Home Pick Payout'], s2020b['Away Pick Payout'])
s2020b['Naive Preds Payout'] = np.where(s2020b['Naive Preds'] == 1, s2020b['Home Pick Payout'], s2020b['Away Pick Payout'])


In [18]:
s2020b

Unnamed: 0,Date,Time,Datetime,Home,Away,Home Win,Home Score,Away Score,Home ML,Away ML,My Preds,Naive Preds,Home Pick Payout,Away Pick Payout,My Preds Payout,Naive Preds Payout
0,2020-09-10,8:20PM,2020-09-10 20:20:00,Kansas City Chiefs,Houston Texans,1,34,20,-450,375,1,1,22.222222,-100.000000,22.222222,22.222222
1,2020-09-13,1:00PM,2020-09-13 13:00:00,Atlanta Falcons,Seattle Seahawks,0,25,38,-115,-105,0,1,-100.000000,95.238095,95.238095,-100.000000
2,2020-09-13,1:00PM,2020-09-13 13:00:00,Buffalo Bills,New York Jets,1,27,17,-300,250,1,1,33.333333,-100.000000,33.333333,33.333333
3,2020-09-13,1:00PM,2020-09-13 13:00:00,Washington Football Team,Philadelphia Eagles,1,27,17,195,-230,0,0,195.000000,-100.000000,-100.000000,-100.000000
4,2020-09-13,1:00PM,2020-09-13 13:00:00,Minnesota Vikings,Green Bay Packers,0,34,43,-125,105,0,1,-100.000000,105.000000,105.000000,-100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,2021-01-03,4:25PM,2021-01-03 16:25:00,Chicago Bears,Green Bay Packers,0,16,35,180,-210,0,0,-100.000000,47.619048,47.619048,47.619048
252,2021-01-03,4:25PM,2021-01-03 16:25:00,Kansas City Chiefs,Los Angeles Chargers,0,21,38,250,-300,0,0,-100.000000,33.333333,33.333333,33.333333
253,2021-01-03,4:25PM,2021-01-03 16:25:00,Denver Broncos,Las Vegas Raiders,0,31,32,130,-150,0,0,-100.000000,66.666667,66.666667,66.666667
254,2021-01-03,4:25PM,2021-01-03 16:25:00,Houston Texans,Tennessee Titans,0,38,41,280,-340,0,0,-100.000000,29.411765,29.411765,29.411765


In [19]:
np.sum(s2020b['My Preds Payout'])

1510.99267630983

In [20]:
np.sum(s2020b['Naive Preds Payout'])

-618.9233628737145