In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime
import pickle
import nltk
import sklearn
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from keras.regularizers import l1,l2, L1L2

In [2]:
seed_value = 123
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
import numpy as np
np.random.seed(seed_value)
import tensorflow as tf
tf.random.set_seed(seed_value)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.layers import *

In [4]:
# load in tweets
with open('data/pickled_tweets/home_2020_3daysback_nolikecriterion.pkl', 'rb') as f:
    home_tweets = pickle.load(f)
with open('data/pickled_tweets/away_2020_3daysback_nolikecriterion.pkl', 'rb') as f:
    away_tweets = pickle.load(f)

In [5]:
season = pd.read_csv('data/season_data/2020_all_data.csv', index_col=0)

In [6]:
# get bigrams from the list of tweets
def count_bigrams(tweet, corpus):
    words = tweet.split(" ")
    #words = [stemmer.stem(word) for word in words]
    bigrams = nltk.bigrams(words)
    for bg in bigrams:
        if bg in corpus:
            corpus[bg] += 1
        else:
            corpus[bg] = 1

In [7]:
train_test_split = int(np.floor(0.75*len(home_tweets)))

In [8]:
train_test_split

192

In [9]:
#Create the home/away corpus
home_corpus = {}
away_corpus = {}
for tweets in home_tweets[:train_test_split]:
    for tw in tweets:
        count_bigrams(tw,home_corpus)

for tweets in away_tweets[:train_test_split]:
    for tw in tweets:
        count_bigrams(tw,away_corpus)

In [10]:
stopwords = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']

In [11]:
# remove words with <= 2 characters, stopwords, hashtags
'''
for key in list(home_corpus.keys()):
    if len(key[0]) <= 2 or len(key[1]) <= 2:
        del home_corpus[key]
    elif key[0] in stopwords or key[1] in stopwords:
        del home_corpus[key]
    elif key[0][0] == '#' or key[1][0] == '#':
        del home_corpus[key]

for key in list(away_corpus.keys()):
    if len(key[0]) <= 2 or len(key[1]) <= 2:
        del away_corpus[key]
    elif key[0] in stopwords or key[1] in stopwords:
        del away_corpus[key]
    elif key[0][0] == '#' or key[1][0] == '#':
        del away_corpus[key]
'''

"\nfor key in list(home_corpus.keys()):\n    if len(key[0]) <= 2 or len(key[1]) <= 2:\n        del home_corpus[key]\n    elif key[0] in stopwords or key[1] in stopwords:\n        del home_corpus[key]\n    elif key[0][0] == '#' or key[1][0] == '#':\n        del home_corpus[key]\n\nfor key in list(away_corpus.keys()):\n    if len(key[0]) <= 2 or len(key[1]) <= 2:\n        del away_corpus[key]\n    elif key[0] in stopwords or key[1] in stopwords:\n        del away_corpus[key]\n    elif key[0][0] == '#' or key[1][0] == '#':\n        del away_corpus[key]\n"

In [12]:
#Turns a list of tweets (for one team, for one game)
#into a vector using the top_grams
from collections import Counter
def vectorize_log(list_of_tweets, corpus):
    num_tweets = len(list_of_tweets)
    lol = [nltk.bigrams(x.split()) for x in list_of_tweets]
    with_repeats = [item for sublist in lol for item in sublist]
    counts = dict(Counter(with_repeats))
    #print(counts)
    to_return = []
    for key in corpus:
        num = np.log(1+counts[key]) if key in counts else 0
        to_return.append(num)
    return to_return

In [13]:
from collections import Counter
def vectorize_normalize(list_of_tweets, corpus):
    num_tweets = len(list_of_tweets)
    lol = [x.split() for x in list_of_tweets]
    with_repeats = [item for sublist in lol for item in sublist]
    counts = dict(Counter(with_repeats))
    to_return = []
    for key in corpus:
        num = counts[key]/num_tweets if key in counts else 0
        to_return.append(num)
    return to_return

In [14]:
# get number of tweets for all home and away teams
n_home_tweets = sum([len(game_tweets) for game_tweets in home_tweets])
n_away_tweets = sum([len(game_tweets) for game_tweets in away_tweets])

# get the unigrams that appear in at least 0.1% of home/away tweets
home_top_grams = [word for word in home_corpus if home_corpus[word] > n_home_tweets*0.001]
away_top_grams = [word for word in away_corpus if away_corpus[word] > n_away_tweets*0.001]

In [15]:
# TRAIN SET
#Create lists of vectors for home, away games
num_h_tweets = [vectorize_log(game, home_top_grams) for game in home_tweets[:train_test_split]]
num_a_tweets = [vectorize_log(game, away_top_grams) for game in away_tweets[:train_test_split]]

#Turn into arrays
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)

#Concatenate home, away to form input matrix.
X_train = np.concatenate([home_vecs, away_vecs], axis=1)

# TEST SET
#Create lists of vectors for home, away games
num_h_tweets = [vectorize_log(game, home_top_grams) for game in home_tweets[train_test_split:]]
num_a_tweets = [vectorize_log(game, away_top_grams) for game in away_tweets[train_test_split:]]

#Turn into arrays
home_vecs = np.array(num_h_tweets)
away_vecs = np.array(num_a_tweets)

#Concatenate home, away to form input matrix.
X_test = np.concatenate([home_vecs, away_vecs], axis=1)

In [16]:
Y_train = np.array(season["Home Win"])[:train_test_split]
Y_test = np.array(season["Home Win"])[train_test_split:]

In [17]:
X_train.shape, Y_train.shape

((192, 1172), (192,))

In [18]:
X_test.shape, Y_test.shape

((64, 1172), (64,))

In [19]:
model = Sequential()
model.add(Dense(50, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=50, verbose=False)

<tensorflow.python.keras.callbacks.History at 0x7fa211cf9d90>

In [20]:
model.evaluate(X_test, Y_test)



[1.2428220510482788, 0.671875]

In [21]:
model = Sequential()
model.add(Dense(128, activation='linear', kernel_regularizer=l1(.001), input_dim=X_train.shape[1]))
model.add(LeakyReLU())
#model.add(Dropout(.5))
model.add(Dense(64, activation='linear',kernel_regularizer=l1(.001)))
model.add(LeakyReLU())
#model.add(Dropout(.5))
model.add(Dense(64, activation='linear',kernel_regularizer=l1(.001)))
model.add(LeakyReLU())
#model.add(Dropout(.5))
model.add(Dense(64, activation='linear',kernel_regularizer=l1(.001)))
model.add(LeakyReLU())
#model.add(Dropout(.5))
model.add(Dense(32, activation='linear',kernel_regularizer=l1(.001)))
model.add(LeakyReLU())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=50, verbose=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fa2113f0a60>

In [22]:
model.evaluate(X_test, Y_test)



[2.4141204357147217, 0.6875]