# Import libraries and model

In [1]:
import gc
import re
from math import ceil
from os import listdir, makedirs

import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used later by preprocess_text
# (the English stop-word list and the data behind WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\berge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing and feature extraction

## Preprocessing training set

In [None]:
# Read all training files and concatenate them into one dataframe

def load_data_frame(dirName):
    """Read every CSV file in ``dirName`` and concatenate them into one dataframe.

    Files are visited in sorted filename order so that the MatchID assigned to
    each file is reproducible (``os.listdir`` returns entries in arbitrary order).

    For each file:
      * the ``Timestamp`` column is dropped,
      * ``ID`` is built as ``"<original MatchID>_<PeriodID>"``,
      * ``MatchID`` is overwritten with a running counter (as a string), so
        match IDs are 0, 1, 2, ... with no missing values.

    Parameters
    ----------
    dirName : str
        Folder containing one CSV file per match.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no file to load.
    """
    li = []
    i = 0
    for filename in sorted(listdir(dirName)):
        if filename != '.ipynb_checkpoints':
            print(filename)
            df = pd.read_csv(dirName + "/" + filename)
            # Drop unused column(s)
            df.drop(columns=['Timestamp'], inplace=True)
            # Build ID from the ORIGINAL MatchID before it is renumbered below
            df['ID'] = df['MatchID'].astype(str) + '_' + df['PeriodID'].astype(str)
            # Makes sure that the match IDs are ordered from 0,1,2... with no missing values
            df['MatchID'] = str(i)
            i += 1
            li.append(df)
    if not li:
        # pd.concat([]) would raise a less helpful "No objects to concatenate"
        raise ValueError("No data files found in the folder " + dirName)
    df = pd.concat(li, ignore_index=True)
    print("Loaded the dataframe from the folder " + dirName + "!")
    return df

In [3]:
df = load_data_frame("train_tweets")

ArgentinaBelgium72.csv
ArgentinaGermanyFinal77.csv
AustraliaNetherlands29.csv
AustraliaSpain34.csv
BelgiumSouthKorea59.csv
CameroonBrazil36.csv
FranceGermany70.csv
FranceNigeria66.csv
GermanyAlgeria67.csv
GermanyBrazil74.csv
GermanyUSA57.csv
HondurasSwitzerland54.csv
MexicoCroatia37.csv
NetherlandsChile35.csv
PortugalGhana58.csv
USASlovenia2010.csv
Loaded the dataframe from the folder train_tweets!


In [4]:
# Preprocessing of tweet
# Lazily-built caches shared by every preprocess_text call. Building the
# stop-word set and the lemmatizer once (instead of once per tweet) removes
# loop-invariant work when the function is applied to a whole column.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """Normalise one tweet and return it as a single space-separated string.

    Steps, in order: lowercase, strip every character that is neither a word
    character nor whitespace (underscores are word characters and are kept),
    strip digits, split on whitespace, drop English stop words, lemmatize
    each remaining word.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned words joined by single spaces (may be empty).
    """
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords, then lemmatize what is left
    words = [_LEMMATIZER.lemmatize(word) for word in words if word not in _STOP_WORDS]
    return ' '.join(words)

def preprocess_data_frame(df):
    """Clean the ``Tweet`` column of ``df`` in place using ``preprocess_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Tweet`` column of strings; the column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) ``df`` object, so the call can be chained or
        displayed. Callers that ignore the return value are unaffected.
    """
    df['Tweet'] = df['Tweet'].apply(preprocess_text)
    return df

In [5]:
# Apply preprocessing to each tweet
preprocess_data_frame(df)
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,Tweet
0,0_0,0,0,0,rt woridcup argentina v belgium win httptcoleu...
1,0_0,0,0,0,elijahman_ time focus belgium winning world cup
2,0_0,0,0,0,rt fifaworldcup global stadium joinin worldcup...
3,0_0,0,0,0,rt catholicnewssvc popefrancis uhoh argentina ...
4,0_0,0,0,0,rt soccerdotcom score v bel well award messisi...
...,...,...,...,...,...
5056045,15_129,15,129,0,rt nytimes fifa world cup final score u sloven...
5056046,15_129,15,129,0,ugh shouldve usa worldcup
5056047,15_129,15,129,0,rt jaclynkeough ha rt someecards id rather die...
5056048,15_129,15,129,0,rt gustavaulia many surprise worldcup timewoww...


## Tweet Embeddings on training set

In [6]:
# Get vector tweet embeddings
# TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
#   embeddings to retain more information
#   -> can try more complex functions here
#   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the average word vector of a tweet.

    Parameters
    ----------
    tweet : str
        Preprocessed tweet; it is tokenized by whitespace.
    model : mapping-like
        Object supporting ``word in model`` and ``model[word]`` (returning a
        1-D vector of length ``vector_size``).
    vector_size : int, default 200
        Length of the zero vector returned when no word is in the vocabulary.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the in-vocabulary words, or a
        float32 zero vector when there are none.
    """
    word_vectors = [model[word] for word in tweet.split() if word in model]
    if not word_vectors:
        # float32 zeros: a float64 fallback row would force the stacked
        # embedding matrix to be upcast to float64 when the word vectors are
        # float32 (presumably the case for the gensim GloVe model - confirm).
        # Zeros are exact in either precision, so nothing is lost.
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(word_vectors, axis=0)

def get_tweet_embeddings(df, vector_size):
    """Build one average-embedding row per tweet in ``df['Tweet']``.

    Uses the module-level ``embeddings_model`` and ``get_avg_embedding``.

    The output matrix is preallocated and filled row by row instead of
    stacking a Python list of per-tweet arrays: the list plus the stacked copy
    roughly doubles peak memory, and stacking an empty list raises. The matrix
    is float32 (NOTE(review): assumes the embedding model's vectors are
    float32, as gensim's GloVe vectors are expected to be - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Tweet`` column of preprocessed strings.
    vector_size : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    pandas.DataFrame
        ``len(df)`` rows by ``vector_size`` columns (named 0..vector_size-1),
        sharing ``df``'s index so a later ``pd.concat(..., axis=1)`` lines the
        rows up even if ``df`` does not have a default index.
    """
    tweet_vectors = np.zeros((len(df), vector_size), dtype=np.float32)
    for row, tweet in enumerate(df['Tweet']):
        tweet_vectors[row] = get_avg_embedding(tweet, embeddings_model, vector_size)
    tweet_df = pd.DataFrame(tweet_vectors, index=df.index)
    print("Created vector tweet embeddings!")
    return tweet_df

In [7]:
# Crashes after using all available RAM :( on google colab

# Obtain vector tweet embeddings
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_df = get_tweet_embeddings(df, vector_size)
tweet_df.head()

Created vector tweet embeddings!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.370323,0.5317,-0.057301,0.001945,-0.177691,-0.039466,-0.318336,0.049636,0.338458,-0.102866,...,-0.320222,-0.201019,-0.065383,-0.042177,-0.264181,-0.016805,0.228576,-0.104629,-0.364003,0.27396
1,0.113964,0.179678,0.092712,-0.179685,-0.10961,0.123447,0.213493,-0.063245,0.183947,-0.246134,...,-0.133157,-0.330765,-0.093205,0.016202,0.008341,-0.00491,0.089336,0.079373,0.045625,0.062633
2,-0.048559,0.30201,0.049827,-0.275527,0.148541,-0.116081,-0.253812,-0.134411,0.196322,-0.021833,...,-0.047611,-0.088304,-0.101538,-0.114711,-0.299336,0.137398,0.111959,0.274555,-0.211163,0.208941
3,0.201836,0.249383,-0.066594,-0.073885,-0.133152,-0.033296,0.059994,-0.024996,0.046238,-0.053954,...,-0.148899,-0.085644,-0.126253,0.012219,-0.313125,-0.060006,0.204233,0.000428,-0.298889,0.278378
4,0.156795,0.375538,-0.025449,-0.044731,0.036281,0.043673,0.047919,0.148287,0.16369,0.090444,...,-0.238058,-0.065272,0.043676,0.039759,0.002273,0.056147,0.030378,-0.064882,-0.192816,0.016506


In [8]:
# No need for Tweet column since we have its corresponding vector embedding
df.drop(columns=['Tweet'], inplace=True)

# Attach the vectors into the original dataframe
df = pd.concat([df, tweet_df], axis=1)

# By now should have df with columns: ID, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,0_0,0,0,0,0.370323,0.5317,-0.057301,0.001945,-0.177691,-0.039466,...,-0.320222,-0.201019,-0.065383,-0.042177,-0.264181,-0.016805,0.228576,-0.104629,-0.364003,0.27396
1,0_0,0,0,0,0.113964,0.179678,0.092712,-0.179685,-0.10961,0.123447,...,-0.133157,-0.330765,-0.093205,0.016202,0.008341,-0.00491,0.089336,0.079373,0.045625,0.062633
2,0_0,0,0,0,-0.048559,0.30201,0.049827,-0.275527,0.148541,-0.116081,...,-0.047611,-0.088304,-0.101538,-0.114711,-0.299336,0.137398,0.111959,0.274555,-0.211163,0.208941
3,0_0,0,0,0,0.201836,0.249383,-0.066594,-0.073885,-0.133152,-0.033296,...,-0.148899,-0.085644,-0.126253,0.012219,-0.313125,-0.060006,0.204233,0.000428,-0.298889,0.278378
4,0_0,0,0,0,0.156795,0.375538,-0.025449,-0.044731,0.036281,0.043673,...,-0.238058,-0.065272,0.043676,0.039759,0.002273,0.056147,0.030378,-0.064882,-0.192816,0.016506


In [None]:
# Group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match.
# It decreases the size of data and makes it easier to fit into LSTM model
def group_data_frame_by_periods(df):
    """Collapse the tweets of each (MatchID, PeriodID, ID) group into one row.

    Every remaining column is replaced by its mean over the group, so each
    period of each match ends up with a single averaged row. ``MatchID`` and
    ``PeriodID`` are cast to int before sorting so the ordering is numeric,
    and the result gets a fresh 0..n-1 index. The ``ID`` column is kept.
    """
    key_columns = ['MatchID', 'PeriodID', 'ID']
    sort_columns = ['MatchID', 'PeriodID']

    averaged = df.groupby(key_columns).mean().reset_index()
    # Ids must be integers before sorting, otherwise they would not order numerically
    averaged = averaged.astype({'MatchID': int, 'PeriodID': int})
    ordered = averaged.sort_values(by=sort_columns).reset_index(drop=True)

    print("Grouped dataframe by periods!")
    return ordered


In [10]:
df = group_data_frame_by_periods(df)
df

Grouped dataframe by periods!


Unnamed: 0,MatchID,PeriodID,EventType,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.0,0.122099,0.259289,0.021811,-0.091114,-0.020353,0.027769,0.110603,...,-0.191140,-0.069440,0.019676,-0.021974,-0.065337,-0.005958,0.155431,0.028152,-0.102278,0.169321
1,0,1,0.0,0.118798,0.257246,0.022628,-0.091629,-0.030559,0.027826,0.097386,...,-0.196974,-0.067863,0.016041,-0.022768,-0.066973,-0.010289,0.156989,0.027647,-0.104473,0.168100
2,0,2,0.0,0.120084,0.244924,0.021755,-0.087242,-0.043526,0.036788,0.112539,...,-0.185856,-0.067514,0.009049,-0.031775,-0.063545,-0.013830,0.166954,0.020025,-0.102942,0.172742
3,0,3,0.0,0.113977,0.246675,0.032208,-0.093585,-0.039601,0.042192,0.097110,...,-0.178769,-0.069509,-0.006021,-0.024460,-0.073347,-0.017333,0.173202,0.023651,-0.108952,0.177780
4,0,4,0.0,0.118590,0.251655,0.035730,-0.097995,-0.043148,0.033088,0.094264,...,-0.182203,-0.071062,-0.001199,-0.028591,-0.081255,-0.013600,0.180348,0.019224,-0.112693,0.181622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,125,1.0,0.137326,0.245735,0.130649,-0.154533,0.011166,0.079916,0.216234,...,-0.199499,-0.126368,0.072447,0.001123,0.006951,0.044991,0.196317,0.111020,-0.054185,0.069251
2133,15,126,1.0,0.141557,0.251345,0.128418,-0.155091,0.014960,0.073601,0.216967,...,-0.205277,-0.133896,0.065923,0.001199,0.000703,0.051376,0.194330,0.105924,-0.053589,0.075656
2134,15,127,1.0,0.133384,0.244087,0.125350,-0.154126,0.010289,0.076279,0.210966,...,-0.201167,-0.126290,0.062471,0.006050,-0.008161,0.044420,0.194046,0.102683,-0.049700,0.064941
2135,15,128,1.0,0.134215,0.239541,0.128161,-0.155323,0.019753,0.076201,0.216787,...,-0.202252,-0.128437,0.064381,0.003787,-0.005455,0.053721,0.186922,0.113238,-0.053236,0.063437


## Preprocessing evaluation set

In [11]:
df_eval = load_data_frame("eval_tweets")
df_eval

GermanyGhana32.csv
GermanySerbia2010.csv
GreeceIvoryCoast44.csv
NetherlandsMexico64.csv
Loaded the dataframe from the folder eval_tweets!


Unnamed: 0,ID,MatchID,PeriodID,Tweet
0,0_0,0,0,I Finally get to see Germany play\n#GER 🇩🇪⚽🏆
1,0_0,0,0,RT @Wor1dCup2014: If Any of the Boateng Brothe...
2,0_0,0,0,Fascinated for this #GERvsGHA match. This will...
3,0_0,0,0,: #GER and #GHA in a few.
4,0_0,0,0,BOATENG GRUDGE MATCH: 21/2 for Jermaine to sco...
...,...,...,...,...
1072923,3_125,3,125,Dutch deserve to be in last 8.Keep their nerve...
1072924,3_125,3,125,RT @GeniusFootball: RETWEET if you think #MEX ...
1072925,3_125,3,125,"Hold your head high Mexico, played beautifully..."
1072926,3_125,3,125,RT @TheWorIdCup: Mexico fans right now... http...


In [12]:
preprocess_data_frame(df_eval)
df_eval

Unnamed: 0,ID,MatchID,PeriodID,Tweet
0,0_0,0,0,finally get see germany play ger
1,0_0,0,0,rt wordcup boateng brother score today well gi...
2,0_0,0,0,fascinated gervsgha match tell u lot chance us...
3,0_0,0,0,ger gha
4,0_0,0,0,boateng grudge match jermaine score ger kevinp...
...,...,...,...,...
1072923,3_125,3,125,dutch deserve last keep nerve till end turnove...
1072924,3_125,3,125,rt geniusfootball retweet think mex deserved w...
1072925,3_125,3,125,hold head high mexico played beautifully last ...
1072926,3_125,3,125,rt theworidcup mexico fan right httptcozfmwxiatw


## Tweet Embeddings on evaluation set

In [13]:
tweet_df_eval = get_tweet_embeddings(df_eval, vector_size)
tweet_df_eval.head()

Created vector tweet embeddings!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.183428,0.351652,-0.059248,-0.182003,0.081787,0.131392,0.513264,0.108436,0.235608,0.015548,...,-0.126687,-0.086776,0.224327,-0.2981,-0.039604,-0.170743,0.212588,-0.165797,-0.058123,0.24174
1,0.252394,0.291525,-0.054652,0.019181,0.082992,-0.067042,0.106594,0.06285,0.114198,0.168639,...,-0.273192,-0.043879,0.165205,0.01279,-0.035947,-0.023603,0.090462,0.079817,-0.096872,0.275643
2,0.133759,0.340722,-0.202986,-0.155269,-0.05257,0.217507,0.175424,0.069934,0.147071,0.122774,...,-0.041979,0.041901,0.099874,-0.158076,0.03502,0.1587,0.197988,0.048215,0.172673,0.004555
3,0.132508,0.662775,0.1056,-0.12087,0.362255,-0.073177,-0.708465,0.064674,0.255179,0.3855,...,0.14236,-0.204735,0.4138,-0.16136,-0.254585,0.11611,0.463215,0.00453,-0.096614,0.37292
4,0.349205,0.474341,-0.059732,-0.133805,0.189881,0.259303,0.060578,0.032346,-0.050215,0.255715,...,-0.366065,0.063324,0.365045,-0.195428,-0.176695,-0.036868,0.357725,-0.104471,0.130819,0.210772


In [14]:
# No need for Tweet column since we have its corresponding vector embedding
df_eval.drop(columns=['Tweet'], inplace=True)

# Attach the vectors into the original dataframe
df_eval = pd.concat([df_eval, tweet_df_eval], axis=1)

# By now df_eval should have the columns: ID, MatchID, PeriodID and the tweet vector (200 columns).
# Unlike the training set, the evaluation set has no EventType column.
df_eval.head()

Unnamed: 0,ID,MatchID,PeriodID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0_0,0,0,0.183428,0.351652,-0.059248,-0.182003,0.081787,0.131392,0.513264,...,-0.126687,-0.086776,0.224327,-0.2981,-0.039604,-0.170743,0.212588,-0.165797,-0.058123,0.24174
1,0_0,0,0,0.252394,0.291525,-0.054652,0.019181,0.082992,-0.067042,0.106594,...,-0.273192,-0.043879,0.165205,0.01279,-0.035947,-0.023603,0.090462,0.079817,-0.096872,0.275643
2,0_0,0,0,0.133759,0.340722,-0.202986,-0.155269,-0.05257,0.217507,0.175424,...,-0.041979,0.041901,0.099874,-0.158076,0.03502,0.1587,0.197988,0.048215,0.172673,0.004555
3,0_0,0,0,0.132508,0.662775,0.1056,-0.12087,0.362255,-0.073177,-0.708465,...,0.14236,-0.204735,0.4138,-0.16136,-0.254585,0.11611,0.463215,0.00453,-0.096614,0.37292
4,0_0,0,0,0.349205,0.474341,-0.059732,-0.133805,0.189881,0.259303,0.060578,...,-0.366065,0.063324,0.365045,-0.195428,-0.176695,-0.036868,0.357725,-0.104471,0.130819,0.210772


In [15]:
df_eval = group_data_frame_by_periods(df_eval)
df_eval

Grouped dataframe by periods!


Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.131890,0.060724,...,-0.184701,-0.101496,0.081690,-0.004283,-0.057534,0.006105,0.171110,0.025386,-0.039204,0.189157
1,0,1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,0.069230,...,-0.193451,-0.098110,0.085782,-0.014780,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,0,2,0.145923,0.240633,0.057680,-0.104799,0.108712,0.009395,0.081510,0.103349,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,0,3,0.160460,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,0.074434,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,0,4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,0.074091,...,-0.178722,-0.094382,0.093599,-0.025704,-0.076860,0.015346,0.189978,0.027995,-0.040230,0.188706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,3,121,0.092493,0.205150,-0.011978,-0.058293,-0.024912,0.049651,0.203746,0.023840,...,-0.108902,-0.080190,0.005660,-0.020262,-0.005344,-0.057583,0.163659,0.081904,-0.119019,0.148621
512,3,122,0.090382,0.211368,-0.001556,-0.053755,-0.037618,0.063667,0.179914,0.025596,...,-0.121352,-0.086303,0.021765,-0.030234,-0.001703,-0.059899,0.174141,0.070798,-0.118152,0.146306
513,3,123,0.097765,0.220651,-0.004684,-0.051815,-0.031970,0.068565,0.204702,0.041446,...,-0.127275,-0.085487,0.020696,-0.026821,0.006977,-0.059209,0.155737,0.065237,-0.120275,0.141325
514,3,124,0.102277,0.236737,-0.015062,-0.054473,-0.028977,0.062958,0.204928,0.044631,...,-0.127073,-0.090737,0.014584,-0.023237,0.002420,-0.063367,0.148509,0.070196,-0.118347,0.136021


## Saving preprocessed data frames

In [16]:
# Save processed data in a file for reusability
# Create the output folder first: to_feather does not create missing
# directories, so on a fresh checkout the save would otherwise fail.
makedirs("processed_data", exist_ok=True)
df.to_feather("processed_data/train_tweets_processed.feather")
print("Training data frame saved!")
df_eval.to_feather("processed_data/eval_tweets_processed.feather")
print("Evaluation data frame saved!")

# Remove reference to the original DataFrame
del df  
del df_eval
gc.collect()  # force garbage collection to free up memory

  table = Table.from_pandas(df, preserve_index=preserve_index)


Training data frame saved!
Evaluation data frame saved!


0

# Model training

In [17]:
df = pd.read_feather("processed_data/train_tweets_processed.feather")
df

Unnamed: 0,MatchID,PeriodID,EventType,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.0,0.122099,0.259289,0.021811,-0.091114,-0.020353,0.027769,0.110603,...,-0.191140,-0.069440,0.019676,-0.021974,-0.065337,-0.005958,0.155431,0.028152,-0.102278,0.169321
1,0,1,0.0,0.118798,0.257246,0.022628,-0.091629,-0.030559,0.027826,0.097386,...,-0.196974,-0.067863,0.016041,-0.022768,-0.066973,-0.010289,0.156989,0.027647,-0.104473,0.168100
2,0,2,0.0,0.120084,0.244924,0.021755,-0.087242,-0.043526,0.036788,0.112539,...,-0.185856,-0.067514,0.009049,-0.031775,-0.063545,-0.013830,0.166954,0.020025,-0.102942,0.172742
3,0,3,0.0,0.113977,0.246675,0.032208,-0.093585,-0.039601,0.042192,0.097110,...,-0.178769,-0.069509,-0.006021,-0.024460,-0.073347,-0.017333,0.173202,0.023651,-0.108952,0.177780
4,0,4,0.0,0.118590,0.251655,0.035730,-0.097995,-0.043148,0.033088,0.094264,...,-0.182203,-0.071062,-0.001199,-0.028591,-0.081255,-0.013600,0.180348,0.019224,-0.112693,0.181622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,125,1.0,0.137326,0.245735,0.130649,-0.154533,0.011166,0.079916,0.216234,...,-0.199499,-0.126368,0.072447,0.001123,0.006951,0.044991,0.196317,0.111020,-0.054185,0.069251
2133,15,126,1.0,0.141557,0.251345,0.128418,-0.155091,0.014960,0.073601,0.216967,...,-0.205277,-0.133896,0.065923,0.001199,0.000703,0.051376,0.194330,0.105924,-0.053589,0.075656
2134,15,127,1.0,0.133384,0.244087,0.125350,-0.154126,0.010289,0.076279,0.210966,...,-0.201167,-0.126290,0.062471,0.006050,-0.008161,0.044420,0.194046,0.102683,-0.049700,0.064941
2135,15,128,1.0,0.134215,0.239541,0.128161,-0.155323,0.019753,0.076201,0.216787,...,-0.202252,-0.128437,0.064381,0.003787,-0.005455,0.053721,0.186922,0.113238,-0.053236,0.063437


## Separate Train and Test data

In [18]:
# Train on the first 13 of the 16 matches (ceil(16 * 0.8) = ceil(12.8) = 13);
# the test data is the remaining 3 matches.
# Before submitting on Kaggle we should train on the full dataset, i.e. all 16 matches.
train_percentage = 0.8
unique_match_ids = df['MatchID'].unique()
print(unique_match_ids)
num_matches_training = int(ceil(len(unique_match_ids)*train_percentage))
print(num_matches_training)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
13


In [19]:
# All matches with MatchID >= num_matches_training go to the test set.
# row_index is the position of the first test row. This relies on df being
# sorted by MatchID, which is how group_data_frame_by_periods leaves it.
# When no row belongs to the test set (e.g. train_percentage = 1.0),
# row_index is len(df), so everything goes to training. The previous
# `(mask).idxmax()` returned 0 in that case and left the training set empty.
is_test_row = (df['MatchID'] >= num_matches_training).to_numpy()
row_index = int(is_test_row.argmax()) if is_test_row.any() else len(df)

# Positional slicing (iloc) so the split does not depend on the index labels
df_X_train = df.iloc[:row_index].copy()
df_X_test = df.iloc[row_index:].copy()


In [20]:
df_y_train = df_X_train['EventType']
df_y_train

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1742    1.0
1743    1.0
1744    1.0
1745    1.0
1746    1.0
Name: EventType, Length: 1747, dtype: float64

In [21]:
df_y_test = df_X_test['EventType']
df_y_test.reset_index(drop=True, inplace=True)
df_y_test

0      0.0
1      0.0
2      1.0
3      1.0
4      1.0
      ... 
385    1.0
386    1.0
387    1.0
388    1.0
389    0.0
Name: EventType, Length: 390, dtype: float64

In [22]:
df_X_train.drop(['EventType'], axis=1, inplace=True)
df_X_train

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.122099,0.259289,0.021811,-0.091114,-0.020353,0.027769,0.110603,0.024921,...,-0.191140,-0.069440,0.019676,-0.021974,-0.065337,-0.005958,0.155431,0.028152,-0.102278,0.169321
1,0,1,0.118798,0.257246,0.022628,-0.091629,-0.030559,0.027826,0.097386,0.022053,...,-0.196974,-0.067863,0.016041,-0.022768,-0.066973,-0.010289,0.156989,0.027647,-0.104473,0.168100
2,0,2,0.120084,0.244924,0.021755,-0.087242,-0.043526,0.036788,0.112539,0.011313,...,-0.185856,-0.067514,0.009049,-0.031775,-0.063545,-0.013830,0.166954,0.020025,-0.102942,0.172742
3,0,3,0.113977,0.246675,0.032208,-0.093585,-0.039601,0.042192,0.097110,0.006285,...,-0.178769,-0.069509,-0.006021,-0.024460,-0.073347,-0.017333,0.173202,0.023651,-0.108952,0.177780
4,0,4,0.118590,0.251655,0.035730,-0.097995,-0.043148,0.033088,0.094264,0.009966,...,-0.182203,-0.071062,-0.001199,-0.028591,-0.081255,-0.013600,0.180348,0.019224,-0.112693,0.181622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1742,12,125,0.041150,0.267610,0.052960,-0.197763,0.104450,0.025712,-0.099700,-0.024117,...,-0.058006,-0.074136,-0.033716,0.011847,-0.041860,0.024191,0.167323,0.122392,-0.054058,0.158443
1743,12,126,0.044753,0.266801,0.058579,-0.202625,0.093912,0.024470,-0.069458,-0.022602,...,-0.050856,-0.076830,-0.019953,0.005234,-0.050930,0.036127,0.161945,0.129341,-0.063787,0.172965
1744,12,127,0.048542,0.271988,0.051762,-0.197929,0.092518,0.020690,-0.068634,-0.021649,...,-0.054085,-0.077008,-0.020153,0.008492,-0.052610,0.036094,0.162711,0.131641,-0.069869,0.174006
1745,12,128,0.051801,0.269485,0.051750,-0.198183,0.088989,0.019420,-0.066234,-0.022540,...,-0.049065,-0.073360,-0.026281,0.006967,-0.056533,0.039534,0.158736,0.124636,-0.077828,0.173008


In [23]:
df_X_test.drop(['EventType'], axis=1, inplace=True)
df_X_test.reset_index(drop=True, inplace=True)
df_X_test

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,13,0,0.158481,0.257868,-0.022338,-0.093147,-0.002405,-0.030854,-0.053274,0.096762,...,-0.180573,-0.037701,0.020883,0.033740,-0.065151,-0.008039,0.170601,0.045094,-0.108414,0.123681
1,13,1,0.155648,0.253481,-0.016269,-0.097016,-0.005047,-0.027279,-0.054086,0.096198,...,-0.181899,-0.035467,0.007620,0.041171,-0.062731,-0.005319,0.166944,0.046265,-0.110438,0.122801
2,13,2,0.151312,0.250384,-0.014702,-0.093926,-0.006961,-0.018035,-0.050121,0.088727,...,-0.182366,-0.030688,0.019412,0.037483,-0.067257,-0.002014,0.174397,0.043693,-0.105461,0.117063
3,13,3,0.155171,0.225126,-0.012214,-0.098732,-0.018331,0.003354,-0.030645,0.074597,...,-0.179215,-0.015111,0.020229,0.038932,-0.073913,-0.019716,0.184073,0.042528,-0.106684,0.117708
4,13,4,0.142500,0.219101,-0.000908,-0.106087,-0.024891,0.008740,-0.063446,0.073259,...,-0.174396,-0.017677,0.015991,0.036385,-0.078154,-0.008176,0.196789,0.049799,-0.107582,0.124269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,15,125,0.137326,0.245735,0.130649,-0.154533,0.011166,0.079916,0.216234,0.088117,...,-0.199499,-0.126368,0.072447,0.001123,0.006951,0.044991,0.196317,0.111020,-0.054185,0.069251
386,15,126,0.141557,0.251345,0.128418,-0.155091,0.014960,0.073601,0.216967,0.096685,...,-0.205277,-0.133896,0.065923,0.001199,0.000703,0.051376,0.194330,0.105924,-0.053589,0.075656
387,15,127,0.133384,0.244087,0.125350,-0.154126,0.010289,0.076279,0.210966,0.090339,...,-0.201167,-0.126290,0.062471,0.006050,-0.008161,0.044420,0.194046,0.102683,-0.049700,0.064941
388,15,128,0.134215,0.239541,0.128161,-0.155323,0.019753,0.076201,0.216787,0.091133,...,-0.202252,-0.128437,0.064381,0.003787,-0.005455,0.053721,0.186922,0.113238,-0.053236,0.063437


Now `df_X_train` and `df_X_test` should have columns `MatchID`, `PeriodID`, `tweet_vector`. `tweet_vector` is just 200 columns.

`df_y_train` and `df_y_test` should have 1 column, `EventType`

The rows of each match are grouped together (adjacent), and within each match the period IDs are ordered chronologically.

In [24]:
# Now we have df_X_train, df_X_test, df_y_train, df_y_test
# We no longer need df so we should free up the memory
del df  # Remove reference to the original DataFrame
gc.collect()  # Force garbage collection to free up memory

0

In [25]:
max_periods = df_X_train.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods
# As we can see not every match has the same number of periods!

Unnamed: 0,MatchID,PeriodID
0,0,129
1,1,179
2,2,96
3,3,129
4,4,129
5,5,129
6,6,129
7,7,129
8,8,169
9,9,129


In [26]:
max_periods = df_X_test.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods

Unnamed: 0,MatchID,PeriodID
0,13,129
1,14,129
2,15,129


## Format data for PyTorch LSTM

In [27]:
# input tensor for a PyTorch LSTM should have the shape of (when setting batch_first=True)
# (batch_size, seq_len, num_features) when using the batch_first=True parameter
# batch_size is number of sequences processed at once

# TRY WITHOUT SLIDING WINDOW APPROACH
#    which would mean batch size = number of matches
#    much easier to format for LSTM as 3D tensor
#    dimension of 3D tensor with batch_first=True:(batch_size = num_matches, seq_len = num_periods, num _features = 200)
#    (match_id, period_id, num_features=200)
#     not every match has the same number of periods!, so seq_len can vary between different matches
#     fix: pad with zeroes
# we want X_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector


In [None]:
def convert_df_to_3D_tensor(df_X, df_y):
    """Convert per-period rows into zero-padded arrays shaped for an LSTM.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Must contain ``MatchID`` and ``PeriodID``; may contain ``ID``. Every
        other column is treated as a feature (the tweet-vector columns).
        Feature columns are selected by name, so the function works whether
        or not ``ID`` is present and for any embedding size.
    df_y : pandas.Series or array-like
        One label (EventType) per row of ``df_X``, aligned on ``df_X``'s index.

    Returns
    -------
    tensor_X : numpy.ndarray, shape (num_matches, max_period + 1, num_features)
        ``tensor_X[m, p]`` is the feature vector of period ``p`` of the m-th
        match; periods a match does not have stay all-zero (padding).
    tensor_y : numpy.ndarray, shape (num_matches, max_period + 1)
        Matching labels, zero where padded.

    The first axis follows the order in which match IDs first appear in
    ``df_X``, so it starts at 0 even if the first MatchID is not 0.

    Raises
    ------
    ValueError
        If ``df_X`` has no rows.
    """
    if len(df_X) == 0:
        raise ValueError("df_X is empty; there is nothing to convert.")

    non_feature_columns = ('ID', 'MatchID', 'PeriodID')
    feature_columns = [col for col in df_X.columns if col not in non_feature_columns]

    # Position of each row's match along axis 0 (order of first appearance)
    match_index, unique_matches = pd.factorize(df_X['MatchID'])
    period_index = df_X['PeriodID'].to_numpy().astype(int)

    num_matches = len(unique_matches)
    # Largest PeriodID across all matches fixes the padded sequence length
    seq_len = int(period_index.max()) + 1

    tensor_X = np.zeros((num_matches, seq_len, len(feature_columns)))
    tensor_y = np.zeros((num_matches, seq_len))
    print(tensor_X.shape)
    print(tensor_y.shape)

    # Scatter all rows at once; anything not assigned here remains zero padding
    tensor_X[match_index, period_index, :] = df_X[feature_columns].to_numpy(dtype=float)
    # Labels are looked up by df_X's index labels, as the per-row loop did
    tensor_y[match_index, period_index] = pd.Series(df_y).loc[df_X.index].to_numpy(dtype=float)

    return tensor_X, tensor_y


X_train_tensor, y_train_tensor = convert_df_to_3D_tensor(df_X_train, df_y_train)
# X_train_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector
# y_train_tensor[match_id][period_id] to return corresponding EventType (1 or 0)
# match_id index starts at 0 even if first match in df doesnt have match id 0

(13, 180, 200)
(13, 180)


In [29]:
# SCALE DATA? minmaxscaler for example!
# SCALING MIGHT BE UNNECESSARY SINCE OUTPUT OF GLOVE TWEET 200 IS ALREADY SCALED BETWEEN -1 AND 1
#scaler = MinMaxScaler()
#tensor = scaler.fit_transform(tensor)

# CONVERT TO PYTORCH TENSOR
X_train_tensor = torch.tensor(X_train_tensor, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_tensor, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)
# X_train_tensor, y_train_tensor are now pytorch tensors

torch.Size([13, 180, 200])
torch.Size([13, 180])


## LSTM Model

In [30]:
# TODO VERIFY ITS CORRECT + MAKE MORE SOPHISTICATED
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a per-time-step linear layer and a sigmoid.

    The input is batch-first: (batch, seq_len, input_size). The output has
    one value in (0, 1) for every time step: (batch, seq_len).
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate,
        )
        # One output unit per time step
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Only the per-step hidden states are used; the final (h, c) state is discarded
        hidden_states = self.lstm(x)[0]
        probabilities = self.sigmoid(self.fc(hidden_states))
        # Drop the trailing singleton dimension: (batch, seq_len, 1) -> (batch, seq_len)
        return probabilities.squeeze(-1)

#TODOOOOOOOOOO torch.nn.utils.rnn.pack_padded_sequence. This allows the model to ignore the padded values during computation.

## Train model

In [31]:
# Hyperparameters of the model and of the optimisation
hidden_size = 500 # can tune
num_layers = 4 # can tune
dropout_rate = 0.2 # can tune
num_epochs = 500 # can tune
lr = 0.001 # can tune

# input_size=200 is the length of one tweet-embedding vector
model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # Great for binary classification
# print(f"Shape of X_train_tensor: {X_train_tensor.shape}")
# Full-batch training: every epoch is one optimizer step on the whole X_train_tensor
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    # print(f"shape of outputs: {outputs.shape}")
    
    # NOTE(review): the loss is taken over every (match, period) position, including the
    # zero-padded periods that convert_df_to_3D_tensor adds (features 0, label 0)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    # Report the training loss every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on training data)")

Epoch [0/500], Loss: 0.6916
Epoch [10/500], Loss: 0.5735
Epoch [20/500], Loss: 0.5436
Epoch [30/500], Loss: 0.5383
Epoch [40/500], Loss: 0.5433
Epoch [50/500], Loss: 0.5353
Epoch [60/500], Loss: 0.5286
Epoch [70/500], Loss: 0.5242
Epoch [80/500], Loss: 0.5199
Epoch [90/500], Loss: 0.5866
Epoch [100/500], Loss: 0.5261
Epoch [110/500], Loss: 0.5233
Epoch [120/500], Loss: 0.5200
Epoch [130/500], Loss: 0.5162
Epoch [140/500], Loss: 0.5012
Epoch [150/500], Loss: 0.4915
Epoch [160/500], Loss: 0.4818
Epoch [170/500], Loss: 0.4714
Epoch [180/500], Loss: 0.4553
Epoch [190/500], Loss: 0.5731
Epoch [200/500], Loss: 0.5084
Epoch [210/500], Loss: 0.6469
Epoch [220/500], Loss: 0.5353
Epoch [230/500], Loss: 0.5234
Epoch [240/500], Loss: 0.5164
Epoch [250/500], Loss: 0.5092
Epoch [260/500], Loss: 0.5051
Epoch [270/500], Loss: 0.5102
Epoch [280/500], Loss: 0.5030
Epoch [290/500], Loss: 0.4688
Epoch [300/500], Loss: 0.4544
Epoch [310/500], Loss: 0.4416
Epoch [320/500], Loss: 0.4218
Epoch [330/500], Loss

## Evaluate on test data

In [32]:
# Convert df_X_test and df_y_test to correct format/dimensions
X_test_tensor, y_test_tensor = convert_df_to_3D_tensor(df_X_test, df_y_test)
# CONVERT TO PYTORCH TENSOR
X_test_tensor = torch.tensor(X_test_tensor, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_tensor, dtype=torch.float32)

(3, 130, 200)
(3, 130)


In [33]:
model.eval()

with torch.no_grad():
    predictions = model(X_test_tensor)

# Predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output
#print(predictions)

# This converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0
predicted_classes = (predictions > 0.5).float() # 0.5 is threshold
 

In [34]:
# Performance metrics

loss = criterion(predictions, y_test_tensor) # use predictions for loss calculation

print(f"Binary Cross-Entropy Loss: {loss.item():.4f}")

def accuracy(y_true, y_pred):
    """Return the percentage (0-100) of positions where both tensors are equal.

    Raises ValueError when the two tensors differ in dtype or in shape.
    """
    same_dtype = y_true.dtype == y_pred.dtype
    same_shape = y_true.shape == y_pred.shape
    if not (same_dtype and same_shape):
        raise ValueError("Inputs do not have same type or shape!")
    n_correct = (y_true == y_pred).sum().item()
    n_total = y_true.numel()
    return n_correct / n_total * 100
# NOTE(review): this rebinds the name `accuracy` from the function to a float, so the
# function cannot be called again afterwards unless its definition is re-run.
accuracy = accuracy(y_test_tensor, predicted_classes)


print(f"Accuracy: {accuracy:.4f}")

#print(y_test_tensor.shape)
#print(predicted_classes.shape)


# TODO: Visualize actual vs. predicted classes, e.g. with matplotlib
# (this would need `import matplotlib.pyplot as plt` in the imports cell).

Binary Cross-Entropy Loss: 0.9916
Accuracy: 65.6410


# Application of the model (Kaggle submission)

## Retrain the model on the entire labelled dataset

In [35]:
# Merge the train and held-out splits so the final model can be retrained on
# every labelled match before predicting the evaluation tweets
df_X = pd.concat((df_X_train, df_X_test), axis=0, ignore_index=True)
df_y = pd.concat((df_y_train, df_y_test), axis=0, ignore_index=True)

# Sanity check: which matches are present, and do features and labels line up?
for summary in (df_X['MatchID'].unique(), df_X.shape, df_y.shape):
    print(summary)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
(2137, 202)
(2137,)


In [36]:
# Inspect the combined feature frame (MatchID, PeriodID and the feature columns 0..199)
df_X

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.122099,0.259289,0.021811,-0.091114,-0.020353,0.027769,0.110603,0.024921,...,-0.191140,-0.069440,0.019676,-0.021974,-0.065337,-0.005958,0.155431,0.028152,-0.102278,0.169321
1,0,1,0.118798,0.257246,0.022628,-0.091629,-0.030559,0.027826,0.097386,0.022053,...,-0.196974,-0.067863,0.016041,-0.022768,-0.066973,-0.010289,0.156989,0.027647,-0.104473,0.168100
2,0,2,0.120084,0.244924,0.021755,-0.087242,-0.043526,0.036788,0.112539,0.011313,...,-0.185856,-0.067514,0.009049,-0.031775,-0.063545,-0.013830,0.166954,0.020025,-0.102942,0.172742
3,0,3,0.113977,0.246675,0.032208,-0.093585,-0.039601,0.042192,0.097110,0.006285,...,-0.178769,-0.069509,-0.006021,-0.024460,-0.073347,-0.017333,0.173202,0.023651,-0.108952,0.177780
4,0,4,0.118590,0.251655,0.035730,-0.097995,-0.043148,0.033088,0.094264,0.009966,...,-0.182203,-0.071062,-0.001199,-0.028591,-0.081255,-0.013600,0.180348,0.019224,-0.112693,0.181622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,125,0.137326,0.245735,0.130649,-0.154533,0.011166,0.079916,0.216234,0.088117,...,-0.199499,-0.126368,0.072447,0.001123,0.006951,0.044991,0.196317,0.111020,-0.054185,0.069251
2133,15,126,0.141557,0.251345,0.128418,-0.155091,0.014960,0.073601,0.216967,0.096685,...,-0.205277,-0.133896,0.065923,0.001199,0.000703,0.051376,0.194330,0.105924,-0.053589,0.075656
2134,15,127,0.133384,0.244087,0.125350,-0.154126,0.010289,0.076279,0.210966,0.090339,...,-0.201167,-0.126290,0.062471,0.006050,-0.008161,0.044420,0.194046,0.102683,-0.049700,0.064941
2135,15,128,0.134215,0.239541,0.128161,-0.155323,0.019753,0.076201,0.216787,0.091133,...,-0.202252,-0.128437,0.064381,0.003787,-0.005455,0.053721,0.186922,0.113238,-0.053236,0.063437


In [37]:
# Inspect the combined EventType labels (one value per row of df_X)
df_y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
2132    1.0
2133    1.0
2134    1.0
2135    1.0
2136    0.0
Name: EventType, Length: 2137, dtype: float64

In [38]:
# Reshape the combined frame (df_X) and labels (df_y) into per-match arrays,
# then wrap both results as float32 PyTorch tensors
X_tensor, y_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in convert_df_to_3D_tensor(df_X, df_y)
)

(16, 180, 200)
(16, 180)


In [39]:
# Retrain from scratch on all 16 matches, reusing the hyper-parameters set earlier
model = LSTMModel(
    input_size=200,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
)
criterion = nn.BCELoss()  # binary cross-entropy for the binary EventType target
optimizer = optim.Adam(model.parameters(), lr=lr)

# Full-batch training: each epoch is a single forward/backward pass over X_tensor
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on all 16 matches)")

Epoch [0/500], Loss: 0.6965
Epoch [10/500], Loss: 0.5755
Epoch [20/500], Loss: 0.5552
Epoch [30/500], Loss: 0.5246
Epoch [40/500], Loss: 0.5232
Epoch [50/500], Loss: 0.5094
Epoch [60/500], Loss: 0.5008
Epoch [70/500], Loss: 0.4984
Epoch [80/500], Loss: 0.4993
Epoch [90/500], Loss: 0.5616
Epoch [100/500], Loss: 0.5390
Epoch [110/500], Loss: 0.5225
Epoch [120/500], Loss: 0.5177
Epoch [130/500], Loss: 0.5079
Epoch [140/500], Loss: 0.5330
Epoch [150/500], Loss: 0.5255
Epoch [160/500], Loss: 0.5208
Epoch [170/500], Loss: 0.5165
Epoch [180/500], Loss: 0.5107
Epoch [190/500], Loss: 0.4915
Epoch [200/500], Loss: 0.5078
Epoch [210/500], Loss: 0.4902
Epoch [220/500], Loss: 0.4693
Epoch [230/500], Loss: 0.4708
Epoch [240/500], Loss: 0.4568
Epoch [250/500], Loss: 0.4466
Epoch [260/500], Loss: 0.4393
Epoch [270/500], Loss: 0.4283
Epoch [280/500], Loss: 0.4159
Epoch [290/500], Loss: 0.4184
Epoch [300/500], Loss: 0.3966
Epoch [310/500], Loss: 0.3801
Epoch [320/500], Loss: 0.3890
Epoch [330/500], Loss

## Model prediction on the evaluation set

In [40]:
# Load the preprocessed evaluation tweets from the feather file and display them
df_eval = pd.read_feather("processed_data/eval_tweets_processed.feather")
df_eval

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.131890,0.060724,...,-0.184701,-0.101496,0.081690,-0.004283,-0.057534,0.006105,0.171110,0.025386,-0.039204,0.189157
1,0,1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,0.069230,...,-0.193451,-0.098110,0.085782,-0.014780,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,0,2,0.145923,0.240633,0.057680,-0.104799,0.108712,0.009395,0.081510,0.103349,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,0,3,0.160460,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,0.074434,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,0,4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,0.074091,...,-0.178722,-0.094382,0.093599,-0.025704,-0.076860,0.015346,0.189978,0.027995,-0.040230,0.188706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,3,121,0.092493,0.205150,-0.011978,-0.058293,-0.024912,0.049651,0.203746,0.023840,...,-0.108902,-0.080190,0.005660,-0.020262,-0.005344,-0.057583,0.163659,0.081904,-0.119019,0.148621
512,3,122,0.090382,0.211368,-0.001556,-0.053755,-0.037618,0.063667,0.179914,0.025596,...,-0.121352,-0.086303,0.021765,-0.030234,-0.001703,-0.059899,0.174141,0.070798,-0.118152,0.146306
513,3,123,0.097765,0.220651,-0.004684,-0.051815,-0.031970,0.068565,0.204702,0.041446,...,-0.127275,-0.085487,0.020696,-0.026821,0.006977,-0.059209,0.155737,0.065237,-0.120275,0.141325
514,3,124,0.102277,0.236737,-0.015062,-0.054473,-0.028977,0.062958,0.204928,0.044631,...,-0.127073,-0.090737,0.014584,-0.023237,0.002420,-0.063367,0.148509,0.070196,-0.118347,0.136021


In [41]:
# The evaluation set has no EventType column: we can only make predictions for
# the Kaggle submission, without being able to measure their accuracy.

# convert_df_to_3D_tensor is called with a label Series, so build a placeholder
# of zeros with one entry per row of df_eval. It exists only to make that call
# work; neither it nor the label array returned for it is used.
# NOTE(review): this rebinds df_y, which previously held the real training
# labels; re-run the earlier cells before retraining.

# df_y has no real meaning here, it is only for ease of coding!
df_y = pd.Series(0, index=df_eval.index)
df_y


0      0
1      0
2      0
3      0
4      0
      ..
511    0
512    0
513    0
514    0
515    0
Length: 516, dtype: int64

In [42]:
# Only the features matter here; the label array built from the placeholder
# df_y is discarded
X_eval_array, _ = convert_df_to_3D_tensor(df_eval, df_y)
# Wrap the feature array as a float32 PyTorch tensor
X_eval_tensor = torch.tensor(X_eval_array, dtype=torch.float32)
X_eval_tensor

(4, 130, 200)
(4, 130)


tensor([[[ 0.1589,  0.2648,  0.0580,  ...,  0.0254, -0.0392,  0.1892],
         [ 0.1563,  0.2714,  0.0593,  ...,  0.0287, -0.0375,  0.1897],
         [ 0.1459,  0.2406,  0.0577,  ...,  0.1053, -0.0230,  0.2024],
         ...,
         [ 0.1209,  0.2895,  0.1666,  ...,  0.1049, -0.0291,  0.0956],
         [ 0.1459,  0.2925,  0.1178,  ...,  0.0700, -0.0339,  0.1269],
         [ 0.1424,  0.2870,  0.1126,  ...,  0.0668, -0.0390,  0.1218]],

        [[ 0.0911,  0.2609,  0.0674,  ...,  0.1012, -0.0218,  0.0741],
         [ 0.0741,  0.2192,  0.1009,  ...,  0.1315, -0.0232,  0.0677],
         [ 0.0641,  0.2537,  0.0954,  ...,  0.0894, -0.0264,  0.0502],
         ...,
         [ 0.0968,  0.2205,  0.1096,  ...,  0.1026, -0.0453,  0.0523],
         [ 0.0904,  0.2277,  0.1050,  ...,  0.1223, -0.0478,  0.0635],
         [ 0.0932,  0.2114,  0.1104,  ...,  0.1117, -0.0403,  0.0537]],

        [[ 0.0159,  0.1494,  0.1740,  ...,  0.0662,  0.0571,  0.0319],
         [-0.0012,  0.1338,  0.1826,  ...,  0

In [43]:
# Switch the retrained network to evaluation mode before predicting
model.eval()

# No gradients are needed for inference
with torch.no_grad():
    predictions = model(X_eval_tensor)

# Per the original author's note, the model's forward pass ends in a sigmoid,
# so every prediction lies between 0 and 1.
# Threshold at 0.5: the comparison yields a boolean tensor of the same shape,
# and .float() maps True -> 1.0 and False -> 0.0
predicted_classes = predictions.gt(0.5).float()
print(predicted_classes.shape)
predicted_classes


torch.Size([4, 130])


tensor([[0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
         1., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0.,
         0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
  

In [46]:
# Quick look at the first rows of the evaluation frame
df_eval.head()

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.13189,0.060724,...,-0.184701,-0.101496,0.08169,-0.004283,-0.057534,0.006105,0.17111,0.025386,-0.039204,0.189157
1,0,1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,0.06923,...,-0.193451,-0.09811,0.085782,-0.01478,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,0,2,0.145923,0.240633,0.05768,-0.104799,0.108712,0.009395,0.08151,0.103349,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,0,3,0.16046,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,0.074434,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,0,4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,0.074091,...,-0.178722,-0.094382,0.093599,-0.025704,-0.07686,0.015346,0.189978,0.027995,-0.04023,0.188706


In [None]:
# Build the Kaggle submission: one row per row of df_eval with its predicted EventType.
# TODO: confirm that the IDs written to the CSV carry the correct match IDs.
# NOTE: an earlier draft rebuilt the IDs from a hard-coded match order
# [6, 16, 9, 15]; the ID column of df_eval is used directly instead.
# NOTE(review): assumes df_eval contains an "ID" column - verify against the
# preprocessing step.

# predicted_classes is indexed as [match, period]. Looking the prediction up
# per row means duplicate period ids simply receive the same prediction.
match_idx = torch.as_tensor(df_eval["MatchID"].to_numpy(dtype="int64"))
period_idx = torch.as_tensor(df_eval["PeriodID"].to_numpy(dtype="int64"))
# One vectorized lookup replaces the row-by-row iterrows() loop
sorted_predictions = predicted_classes[match_idx, period_idx].tolist()

# MatchID/PeriodID are only needed to order the rows; the file keeps ID and EventType
prediction_tab = (
    df_eval[["ID", "MatchID", "PeriodID"]]
    .assign(EventType=sorted_predictions)
    .sort_values(by=['MatchID', 'PeriodID'])
    .drop(columns=["MatchID", 'PeriodID'])
)

prediction_tab.to_csv("Submission.csv", index=False)


In [None]:
# NOTES
# How we make sure that we:
# 1. Do NOT ignore the order of the tweets -> use an LSTM.
# 2. Treat each time period as RELATED to the football match it belongs to -> treat each match as one sequence
#    and train the LSTM on all sequences together, since PyTorch expects a batch of sequences.



# For the LSTM: each input sequence should consist of the tweets from one specific match, ordered by PeriodID.
#   Tweets from different matches are unrelated, but tweets from the same match are related sequentially (chronologically).
#   Structure the training data so that tweets are grouped by MatchID and ordered by PeriodID.