In [1]:
import datetime
import pickle
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)
# NOTE: pickle.load can execute arbitrary code; only use it on files you produced yourself.
home_tweets = _load_pickle('data/pickled_tweets_corpus/4seasons_home_bigrams.pkl')
away_tweets = _load_pickle('data/pickled_tweets_corpus/4seasons_away_bigrams.pkl')
corpus = _load_pickle('data/pickled_tweets_corpus/4seasons_corpus_bigrams.pkl')

In [3]:
# Number of top-level entries in home_tweets; each entry is later vectorized as one game.
len(home_tweets)

1024

In [4]:
# Peek at the first tweet of the first entry to see what the stored text looks like.
home_tweets[0][0]

'looks ready for a new #patriots season to kick off tomorrow night. #footballisback #gopats https://t.co/rf9zu37twj'

Define a set of stopwords (very common words), taken from the NLTK website.

In [5]:
# Common English words to ignore. Stored as a set because the only use is
# membership testing (`word in stopwords`), which a set answers in constant time.
stopwords = {
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about',
    'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be',
    'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself',
    'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each',
    'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his',
    'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this',
    'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to',
    'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them',
    'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves',
    'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
    'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
    'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't',
    'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
    'further', 'was', 'here', 'than',
}

Delete from the corpus any bigram in which either word is 2 letters or fewer, is a stopword, or is a hashtag.

In [6]:
# Work on a copy so the loaded corpus stays untouched.
revised_corpus = corpus.copy()
# Loop over a snapshot of the keys so entries can be removed while iterating.
for bigram in list(revised_corpus):
    # Drop the bigram when either word is too short, a stopword, or a hashtag.
    # The checks short-circuit in this order, so the hashtag test only ever
    # indexes words that are longer than 2 characters.
    if (
        len(bigram[0]) <= 2 or len(bigram[1]) <= 2
        or bigram[0] in stopwords or bigram[1] in stopwords
        or bigram[0][0] == '#' or bigram[1][0] == '#'
    ):
        del revised_corpus[bigram]

In [7]:
# Order the remaining bigrams from highest to lowest count (ties keep their
# corpus order) and keep the first 1000 as the feature vocabulary.
ranked_bigrams = sorted(revised_corpus, key=revised_corpus.get, reverse=True)
top_grams = ranked_bigrams[:1000]

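Function that vectorizes a list of tweets given a corpus: for each bigram in the corpus, it returns how many times the bigram occurs across the tweets divided by the number of tweets.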

In [8]:
def vectorize_list_bigrams(list_of_tweets, corpus):
    """Turn a list of tweets into a vector of per-tweet bigram frequencies.

    Each tweet is split on single spaces and every pair of adjacent tokens
    counts as one bigram occurrence (repeats inside a tweet all count).

    Parameters
    ----------
    list_of_tweets : list of str
        Tweets to vectorize.
    corpus : iterable of tuple
        Bigrams, as ``(first_word, second_word)`` tuples, that define the
        positions of the returned vector.

    Returns
    -------
    list
        One entry per bigram in ``corpus``, in iteration order: the total
        number of occurrences across all tweets divided by the number of
        tweets, or the int ``0`` when the bigram never occurs.
    """
    num_tweets = len(list_of_tweets)
    bigram_counts = Counter()
    for tweet in list_of_tweets:
        # Split on a single space (not arbitrary whitespace) so the tokens,
        # including empty ones from repeated spaces, are unchanged.
        tokens = tweet.split(" ")
        # Adjacent-token pairs: the same tuples nltk.bigrams(tokens) yields.
        bigram_counts.update(zip(tokens, tokens[1:]))
    # The membership guard also means an empty tweet list never divides by zero.
    return [
        bigram_counts[bigram] / num_tweets if bigram in bigram_counts else 0
        for bigram in corpus
    ]

In [9]:
# Quick check of the vectorizer on the first home entry with two hand-picked bigrams.
vectorize_list_bigrams(home_tweets[0], [('to', 'the'), ('in', 'the')])

[0, 0]

Vectorize each home and each away game.

In [10]:
def _vectorize_games(games):
    """Vectorize every game's tweet list against the shared ``top_grams`` vocabulary."""
    return [vectorize_list_bigrams(tweets, top_grams) for tweets in games]
num_h_tweets = _vectorize_games(home_tweets)
num_a_tweets = _vectorize_games(away_tweets)

Concatenate the home and away vectors, as in the papers, and use the result to produce the X and Y that we fit our models on.

In [11]:
# One row per game: the home-side features followed by the away-side features.
home_vecs, away_vecs = np.array(num_h_tweets), np.array(num_a_tweets)
X = np.concatenate((home_vecs, away_vecs), axis=1)

In [12]:
# NOTE(review): the labels come from a separately saved array; this assumes its
# rows are in the same game order as home_tweets / away_tweets - confirm.
labels_path = 'data/preprocessed/Y_3000_uni_no2_4seasons.npy'
Y = np.load(labels_path)

In [13]:
# Shape check on the concatenated home + away feature matrix.
X.shape

(1024, 2000)

In [14]:
# Shape check on the loaded labels; the length should match the row count of X.
Y.shape

(1024,)

In [15]:
# Unshuffled split: the first 768 rows train the models, the remaining rows test them.
n_train = 768
X_train, X_test = X[:n_train], X[n_train:]
Y_train, Y_test = Y[:n_train], Y[n_train:]

In [16]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; fit() returns the fitted estimator.
lr = LogisticRegression().fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
lr.score(X_test, Y_test)

0.5

In [17]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 1000 boosting rounds. The seed is fixed so the fitted ensemble,
# and therefore the score, is the same on every run.
ada = AdaBoostClassifier(n_estimators=1000, random_state=42)
ada.fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
ada.score(X_test, Y_test)

0.4765625

In [18]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree. Features are randomly permuted at each split,
# so the seed is fixed to make the fitted tree and its score repeatable.
dt = DecisionTreeClassifier(max_depth=20, random_state=42).fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
dt.score(X_test, Y_test)

0.54296875

In [19]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees. Bootstrapping and feature sampling are random,
# so the seed is fixed to make the score repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
rf.score(X_test, Y_test)

0.44140625

In [20]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with up to 1000 training iterations. Weight
# initialisation and batch shuffling are random, so the seed is fixed to make
# the score repeatable.
mlp = MLPClassifier(max_iter=1000, random_state=42)
mlp.fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
mlp.score(X_test, Y_test)

0.48828125

In [21]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
gnb.score(X_test, Y_test)

0.54296875