# Import libraries and model

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gc
import gensim.downloader as api

import torch
import torch.nn as nn

from math import ceil
import torch.optim as optim

import datetime

nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\berge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing and and feature extractions

## Preprocessing training set

In [2]:
# Read all training files and concatenate them into one dataframe

def load_data_frame(dirName):
    li = []
    i = 0
    for filename in listdir(dirName):
        if filename != '.ipynb_checkpoints':
            print(filename)
            df = pd.read_csv(dirName + "/" + filename)
            df.drop(columns=['Timestamp'], inplace=True)
            # Drop unused column(s)
            df['ID'] = df['MatchID'].astype(str) + '_' + df['PeriodID'].astype(str)
            #df['MatchID'] = str(i)
            # Makes sure that the match IDs are ordered from 0,1,2... with no missing values
            i+=1
            li.append(df)
    df = pd.concat(li, ignore_index=True)
    print("Loaded the dataframe from the folder " + dirName + "!")
    return df

In [3]:
df = load_data_frame("train_tweets")

ArgentinaBelgium72.csv
ArgentinaGermanyFinal77.csv
AustraliaNetherlands29.csv
AustraliaSpain34.csv
BelgiumSouthKorea59.csv
CameroonBrazil36.csv
FranceGermany70.csv
FranceNigeria66.csv
GermanyAlgeria67.csv
GermanyBrazil74.csv
GermanyUSA57.csv
HondurasSwitzerland54.csv
MexicoCroatia37.csv
NetherlandsChile35.csv
PortugalGhana58.csv
USASlovenia2010.csv
Loaded the dataframe from the folder train_tweets!


In [4]:
# Preprocessing of tweet
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

def preprocess_data_frame(df):
    df['Tweet'] = df['Tweet'].apply(preprocess_text)
    df

In [5]:
# Apply preprocessing to each tweet
preprocess_data_frame(df)
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,Tweet
0,11_0,11,0,0,rt woridcup argentina v belgium win httptcoleu...
1,11_0,11,0,0,elijahman_ time focus belgium winning world cup
2,11_0,11,0,0,rt fifaworldcup global stadium joinin worldcup...
3,11_0,11,0,0,rt catholicnewssvc popefrancis uhoh argentina ...
4,11_0,11,0,0,rt soccerdotcom score v bel well award messisi...
...,...,...,...,...,...
5056045,18_129,18,129,0,rt nytimes fifa world cup final score u sloven...
5056046,18_129,18,129,0,ugh shouldve usa worldcup
5056047,18_129,18,129,0,rt jaclynkeough ha rt someecards id rather die...
5056048,18_129,18,129,0,rt gustavaulia many surprise worldcup timewoww...


## Tweet Embeddings on training set

In [6]:
# Get vector tweet embeddings
# TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
#   embeddings to retain more information
#   -> can try more complex functions here
#   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

def get_tweet_embeddings(df, vector_size):
    tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
    tweet_df = pd.DataFrame(tweet_vectors)
    print("Created vector tweet embeddings!")
    return tweet_df

In [7]:
# Crashes after using all available RAM :( on google colab

# Obtain vector tweet embeddings
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_df = get_tweet_embeddings(df, vector_size)
tweet_df.head()

Created vector tweet embeddings!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.370323,0.5317,-0.057301,0.001945,-0.177691,-0.039466,-0.318336,0.049636,0.338458,-0.102866,...,-0.320222,-0.201019,-0.065383,-0.042177,-0.264181,-0.016805,0.228576,-0.104629,-0.364003,0.27396
1,0.113964,0.179678,0.092712,-0.179685,-0.10961,0.123447,0.213493,-0.063245,0.183947,-0.246134,...,-0.133157,-0.330765,-0.093205,0.016202,0.008341,-0.00491,0.089336,0.079373,0.045625,0.062633
2,-0.048559,0.30201,0.049827,-0.275527,0.148541,-0.116081,-0.253812,-0.134411,0.196322,-0.021833,...,-0.047611,-0.088304,-0.101538,-0.114711,-0.299336,0.137398,0.111959,0.274555,-0.211163,0.208941
3,0.201836,0.249383,-0.066594,-0.073885,-0.133152,-0.033296,0.059994,-0.024996,0.046238,-0.053954,...,-0.148899,-0.085644,-0.126253,0.012219,-0.313125,-0.060006,0.204233,0.000428,-0.298889,0.278378
4,0.156795,0.375538,-0.025449,-0.044731,0.036281,0.043673,0.047919,0.148287,0.16369,0.090444,...,-0.238058,-0.065272,0.043676,0.039759,0.002273,0.056147,0.030378,-0.064882,-0.192816,0.016506


In [8]:
# No need for Tweet column since we have its corresponding vector embedding
df.drop(columns=['Tweet'], inplace=True)

# Attach the vectors into the original dataframe
df = pd.concat([df, tweet_df], axis=1)

# By now should have df with columns: ID, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,11_0,11,0,0,0.370323,0.5317,-0.057301,0.001945,-0.177691,-0.039466,...,-0.320222,-0.201019,-0.065383,-0.042177,-0.264181,-0.016805,0.228576,-0.104629,-0.364003,0.27396
1,11_0,11,0,0,0.113964,0.179678,0.092712,-0.179685,-0.10961,0.123447,...,-0.133157,-0.330765,-0.093205,0.016202,0.008341,-0.00491,0.089336,0.079373,0.045625,0.062633
2,11_0,11,0,0,-0.048559,0.30201,0.049827,-0.275527,0.148541,-0.116081,...,-0.047611,-0.088304,-0.101538,-0.114711,-0.299336,0.137398,0.111959,0.274555,-0.211163,0.208941
3,11_0,11,0,0,0.201836,0.249383,-0.066594,-0.073885,-0.133152,-0.033296,...,-0.148899,-0.085644,-0.126253,0.012219,-0.313125,-0.060006,0.204233,0.000428,-0.298889,0.278378
4,11_0,11,0,0,0.156795,0.375538,-0.025449,-0.044731,0.036281,0.043673,...,-0.238058,-0.065272,0.043676,0.039759,0.002273,0.056147,0.030378,-0.064882,-0.192816,0.016506


In [9]:
# Group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match.
# It decreases the size of data and makes it easier to fit into LSTM model
def group_data_frame_by_periods(df):
    df = df.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    #df.drop(columns=['ID'], inplace=True) 
    df['MatchID'] = df['MatchID'].astype(int)
    df['PeriodID'] = df['PeriodID'].astype(int)
    # need to convert to int before sorting
    df.sort_values(by=['MatchID', 'PeriodID'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Grouped dataframe by periods!")
    return df


In [10]:
df = group_data_frame_by_periods(df)
df

Grouped dataframe by periods!


Unnamed: 0,MatchID,PeriodID,ID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0_0,0.0,0.142700,0.273355,0.024052,-0.100410,-0.056623,0.047082,...,-0.095398,-0.058165,0.056072,-0.000830,-0.073885,-0.005746,0.174125,0.093547,-0.009831,0.145556
1,0,1,0_1,0.0,0.155795,0.274697,0.022707,-0.114188,-0.078044,0.046390,...,-0.094809,-0.053387,0.067755,-0.000557,-0.106459,0.007881,0.165998,0.100027,-0.008115,0.127290
2,0,2,0_2,0.0,0.149349,0.291958,0.035742,-0.123525,-0.054195,0.042744,...,-0.112641,-0.056553,0.073958,0.001823,-0.102801,0.003312,0.174950,0.081542,-0.008284,0.130799
3,0,3,0_3,0.0,0.160484,0.263250,0.003070,-0.158384,-0.055241,0.047692,...,-0.122675,-0.047729,0.064131,0.015966,-0.102371,0.001360,0.156838,0.095372,0.014287,0.130726
4,0,4,0_4,0.0,0.159678,0.268265,-0.034739,-0.143102,-0.086689,0.054459,...,-0.117430,-0.039636,0.030168,0.018995,-0.101448,0.020565,0.149877,0.092729,-0.011608,0.149763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,19,125,19_125,1.0,0.041150,0.267610,0.052960,-0.197763,0.104450,0.025712,...,-0.058006,-0.074136,-0.033716,0.011847,-0.041860,0.024191,0.167323,0.122392,-0.054058,0.158443
2133,19,126,19_126,1.0,0.044753,0.266801,0.058579,-0.202625,0.093912,0.024470,...,-0.050856,-0.076830,-0.019953,0.005234,-0.050930,0.036127,0.161945,0.129341,-0.063787,0.172965
2134,19,127,19_127,1.0,0.048542,0.271988,0.051762,-0.197929,0.092518,0.020690,...,-0.054085,-0.077008,-0.020153,0.008492,-0.052610,0.036094,0.162711,0.131641,-0.069869,0.174006
2135,19,128,19_128,1.0,0.051801,0.269485,0.051750,-0.198183,0.088989,0.019420,...,-0.049065,-0.073360,-0.026281,0.006967,-0.056533,0.039534,0.158736,0.124636,-0.077828,0.173008


## Preprocessing evaluation set

In [11]:
df_eval = load_data_frame("eval_tweets")
df_eval

GermanyGhana32.csv
GermanySerbia2010.csv
GreeceIvoryCoast44.csv
NetherlandsMexico64.csv
Loaded the dataframe from the folder eval_tweets!


Unnamed: 0,ID,MatchID,PeriodID,Tweet
0,6_0,6,0,I Finally get to see Germany play\n#GER 🇩🇪⚽🏆
1,6_0,6,0,RT @Wor1dCup2014: If Any of the Boateng Brothe...
2,6_0,6,0,Fascinated for this #GERvsGHA match. This will...
3,6_0,6,0,: #GER and #GHA in a few.
4,6_0,6,0,BOATENG GRUDGE MATCH: 21/2 for Jermaine to sco...
...,...,...,...,...
1072923,15_125,15,125,Dutch deserve to be in last 8.Keep their nerve...
1072924,15_125,15,125,RT @GeniusFootball: RETWEET if you think #MEX ...
1072925,15_125,15,125,"Hold your head high Mexico, played beautifully..."
1072926,15_125,15,125,RT @TheWorIdCup: Mexico fans right now... http...


In [12]:
preprocess_data_frame(df_eval)
df_eval

Unnamed: 0,ID,MatchID,PeriodID,Tweet
0,6_0,6,0,finally get see germany play ger
1,6_0,6,0,rt wordcup boateng brother score today well gi...
2,6_0,6,0,fascinated gervsgha match tell u lot chance us...
3,6_0,6,0,ger gha
4,6_0,6,0,boateng grudge match jermaine score ger kevinp...
...,...,...,...,...
1072923,15_125,15,125,dutch deserve last keep nerve till end turnove...
1072924,15_125,15,125,rt geniusfootball retweet think mex deserved w...
1072925,15_125,15,125,hold head high mexico played beautifully last ...
1072926,15_125,15,125,rt theworidcup mexico fan right httptcozfmwxiatw


## Tweet Embeddings on training set

In [13]:
tweet_df_eval = get_tweet_embeddings(df_eval, vector_size)
tweet_df_eval.head()

Created vector tweet embeddings!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.183428,0.351652,-0.059248,-0.182003,0.081787,0.131392,0.513264,0.108436,0.235608,0.015548,...,-0.126687,-0.086776,0.224327,-0.2981,-0.039604,-0.170743,0.212588,-0.165797,-0.058123,0.24174
1,0.252394,0.291525,-0.054652,0.019181,0.082992,-0.067042,0.106594,0.06285,0.114198,0.168639,...,-0.273192,-0.043879,0.165205,0.01279,-0.035947,-0.023603,0.090462,0.079817,-0.096872,0.275643
2,0.133759,0.340722,-0.202986,-0.155269,-0.05257,0.217507,0.175424,0.069934,0.147071,0.122774,...,-0.041979,0.041901,0.099874,-0.158076,0.03502,0.1587,0.197988,0.048215,0.172673,0.004555
3,0.132508,0.662775,0.1056,-0.12087,0.362255,-0.073177,-0.708465,0.064674,0.255179,0.3855,...,0.14236,-0.204735,0.4138,-0.16136,-0.254585,0.11611,0.463215,0.00453,-0.096614,0.37292
4,0.349205,0.474341,-0.059732,-0.133805,0.189881,0.259303,0.060578,0.032346,-0.050215,0.255715,...,-0.366065,0.063324,0.365045,-0.195428,-0.176695,-0.036868,0.357725,-0.104471,0.130819,0.210772


In [14]:
# No need for Tweet column since we have its corresponding vector embedding
df_eval.drop(columns=['Tweet'], inplace=True)

# Attach the vectors into the original dataframe
df_eval = pd.concat([df_eval, tweet_df_eval], axis=1)

# By now should have df with columns: ID, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df_eval.head()

Unnamed: 0,ID,MatchID,PeriodID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,6_0,6,0,0.183428,0.351652,-0.059248,-0.182003,0.081787,0.131392,0.513264,...,-0.126687,-0.086776,0.224327,-0.2981,-0.039604,-0.170743,0.212588,-0.165797,-0.058123,0.24174
1,6_0,6,0,0.252394,0.291525,-0.054652,0.019181,0.082992,-0.067042,0.106594,...,-0.273192,-0.043879,0.165205,0.01279,-0.035947,-0.023603,0.090462,0.079817,-0.096872,0.275643
2,6_0,6,0,0.133759,0.340722,-0.202986,-0.155269,-0.05257,0.217507,0.175424,...,-0.041979,0.041901,0.099874,-0.158076,0.03502,0.1587,0.197988,0.048215,0.172673,0.004555
3,6_0,6,0,0.132508,0.662775,0.1056,-0.12087,0.362255,-0.073177,-0.708465,...,0.14236,-0.204735,0.4138,-0.16136,-0.254585,0.11611,0.463215,0.00453,-0.096614,0.37292
4,6_0,6,0,0.349205,0.474341,-0.059732,-0.133805,0.189881,0.259303,0.060578,...,-0.366065,0.063324,0.365045,-0.195428,-0.176695,-0.036868,0.357725,-0.104471,0.130819,0.210772


In [15]:
df_eval = group_data_frame_by_periods(df_eval)
df_eval

Grouped dataframe by periods!


Unnamed: 0,MatchID,PeriodID,ID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,6,0,6_0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.131890,...,-0.184701,-0.101496,0.081690,-0.004283,-0.057534,0.006105,0.171110,0.025386,-0.039204,0.189157
1,6,1,6_1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,...,-0.193451,-0.098110,0.085782,-0.014780,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,6,2,6_2,0.145923,0.240633,0.057680,-0.104799,0.108712,0.009395,0.081510,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,6,3,6_3,0.160460,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,6,4,6_4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,...,-0.178722,-0.094382,0.093599,-0.025704,-0.076860,0.015346,0.189978,0.027995,-0.040230,0.188706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,16,125,16_125,0.090554,0.227452,0.107798,-0.111091,0.034236,0.063783,0.053129,...,-0.198507,-0.122853,0.024147,0.000304,-0.161041,0.092954,0.143728,0.113752,-0.049263,0.056543
512,16,126,16_126,0.086859,0.236306,0.107814,-0.125368,0.026619,0.063352,0.062930,...,-0.189677,-0.125140,0.027448,0.003508,-0.144993,0.096095,0.155616,0.101075,-0.042474,0.058476
513,16,127,16_127,0.096795,0.220505,0.109566,-0.119462,0.023341,0.063660,0.084856,...,-0.180416,-0.117967,0.033671,0.007132,-0.137807,0.088667,0.160295,0.102587,-0.045250,0.052347
514,16,128,16_128,0.090397,0.227700,0.104989,-0.123692,0.035497,0.059867,0.049117,...,-0.188227,-0.115212,0.019899,0.006688,-0.167376,0.096072,0.149536,0.122288,-0.047816,0.063510


## Saving preprocessed data frames

In [16]:
# Save processed data in a file for reusability
df.to_feather("processed_data/train_tweets_processed.feather")
print("Training data frame saved!")
df_eval.to_feather("processed_data/eval_tweets_processed.feather")
print("Evaluation data frame saved!")

# Remove reference to the original DataFrame
del df  
del df_eval
gc.collect()  # force garbage collection to free up memory

  table = Table.from_pandas(df, preserve_index=preserve_index)


Training data frame saved!
Evaluation data frame saved!


0

# Model training

In [17]:
df = pd.read_feather("processed_data/train_tweets_processed.feather")
df

Unnamed: 0,MatchID,PeriodID,ID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0_0,0.0,0.142700,0.273355,0.024052,-0.100410,-0.056623,0.047082,...,-0.095398,-0.058165,0.056072,-0.000830,-0.073885,-0.005746,0.174125,0.093547,-0.009831,0.145556
1,0,1,0_1,0.0,0.155795,0.274697,0.022707,-0.114188,-0.078044,0.046390,...,-0.094809,-0.053387,0.067755,-0.000557,-0.106459,0.007881,0.165998,0.100027,-0.008115,0.127290
2,0,2,0_2,0.0,0.149349,0.291958,0.035742,-0.123525,-0.054195,0.042744,...,-0.112641,-0.056553,0.073958,0.001823,-0.102801,0.003312,0.174950,0.081542,-0.008284,0.130799
3,0,3,0_3,0.0,0.160484,0.263250,0.003070,-0.158384,-0.055241,0.047692,...,-0.122675,-0.047729,0.064131,0.015966,-0.102371,0.001360,0.156838,0.095372,0.014287,0.130726
4,0,4,0_4,0.0,0.159678,0.268265,-0.034739,-0.143102,-0.086689,0.054459,...,-0.117430,-0.039636,0.030168,0.018995,-0.101448,0.020565,0.149877,0.092729,-0.011608,0.149763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,19,125,19_125,1.0,0.041150,0.267610,0.052960,-0.197763,0.104450,0.025712,...,-0.058006,-0.074136,-0.033716,0.011847,-0.041860,0.024191,0.167323,0.122392,-0.054058,0.158443
2133,19,126,19_126,1.0,0.044753,0.266801,0.058579,-0.202625,0.093912,0.024470,...,-0.050856,-0.076830,-0.019953,0.005234,-0.050930,0.036127,0.161945,0.129341,-0.063787,0.172965
2134,19,127,19_127,1.0,0.048542,0.271988,0.051762,-0.197929,0.092518,0.020690,...,-0.054085,-0.077008,-0.020153,0.008492,-0.052610,0.036094,0.162711,0.131641,-0.069869,0.174006
2135,19,128,19_128,1.0,0.051801,0.269485,0.051750,-0.198183,0.088989,0.019420,...,-0.049065,-0.073360,-0.026281,0.006967,-0.056533,0.039534,0.158736,0.124636,-0.077828,0.173008


## Separate Train and Test data

In [18]:
# Train on of the first 13 of 16 matches (16*0.8=12.8~=13)
# and the test data would be the last 3 matches. 
# Before submitting on Kaggle we should train on full dataset, so al 16 matches
train_percentage = 0.8
unique_match_ids = df['MatchID'].unique()
print(unique_match_ids)
num_matches_training = int(ceil(len(unique_match_ids)*train_percentage))
print(num_matches_training)

[ 0  1  2  3  4  5  7  8 10 11 12 13 14 17 18 19]
13


In [19]:
# All matches from num_matches_training and after will be in test test
# row_index is then the first row of the matches that will go to the test


row_index = (df['MatchID'] == num_matches_training).idxmax()
#row_index = df[df['MatchID'] == target_match_id].first_valid_index()
df_X_train = df[:row_index].copy()
df_X_test = df[row_index:].copy()


In [20]:
df_y_train = df_X_train['EventType']
df_y_train

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1482    1.0
1483    0.0
1484    0.0
1485    1.0
1486    1.0
Name: EventType, Length: 1487, dtype: float64

In [21]:
df_y_test = df_X_test['EventType']
df_y_test.reset_index(drop=True, inplace=True)
df_y_test

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
645    1.0
646    1.0
647    1.0
648    1.0
649    1.0
Name: EventType, Length: 650, dtype: float64

In [22]:
df_X_train.drop(['EventType'], axis=1, inplace=True)
df_X_train

Unnamed: 0,MatchID,PeriodID,ID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0_0,0.142700,0.273355,0.024052,-0.100410,-0.056623,0.047082,0.107778,...,-0.095398,-0.058165,0.056072,-0.000830,-0.073885,-0.005746,0.174125,0.093547,-0.009831,0.145556
1,0,1,0_1,0.155795,0.274697,0.022707,-0.114188,-0.078044,0.046390,0.074244,...,-0.094809,-0.053387,0.067755,-0.000557,-0.106459,0.007881,0.165998,0.100027,-0.008115,0.127290
2,0,2,0_2,0.149349,0.291958,0.035742,-0.123525,-0.054195,0.042744,0.082248,...,-0.112641,-0.056553,0.073958,0.001823,-0.102801,0.003312,0.174950,0.081542,-0.008284,0.130799
3,0,3,0_3,0.160484,0.263250,0.003070,-0.158384,-0.055241,0.047692,0.073075,...,-0.122675,-0.047729,0.064131,0.015966,-0.102371,0.001360,0.156838,0.095372,0.014287,0.130726
4,0,4,0_4,0.159678,0.268265,-0.034739,-0.143102,-0.086689,0.054459,0.009802,...,-0.117430,-0.039636,0.030168,0.018995,-0.101448,0.020565,0.149877,0.092729,-0.011608,0.149763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,12,92,12_92,0.145892,0.222630,0.120081,-0.116945,0.024491,0.009698,0.137669,...,-0.224321,-0.143763,0.020284,0.018604,-0.082488,0.076500,0.234800,0.126446,-0.053445,0.130328
1483,12,93,12_93,0.153782,0.227837,0.096260,-0.097191,0.040159,-0.006013,0.112964,...,-0.231501,-0.134865,0.019563,0.001825,-0.100477,0.059753,0.218688,0.120353,-0.046471,0.139495
1484,12,94,12_94,0.132877,0.225982,0.112246,-0.112541,0.027943,0.009418,0.120520,...,-0.215114,-0.125932,0.020598,-0.012168,-0.092944,0.057864,0.243887,0.117850,-0.046737,0.132410
1485,12,95,12_95,0.121008,0.233462,0.137019,-0.118832,0.069999,0.016348,0.101057,...,-0.202068,-0.163163,0.043706,0.024385,-0.094716,0.110365,0.294361,0.131040,-0.043662,0.119831


In [23]:
df_X_test.drop(['EventType'], axis=1, inplace=True)
df_X_test.reset_index(drop=True, inplace=True)
df_X_test

Unnamed: 0,MatchID,PeriodID,ID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,13,0,13_0,0.122759,0.217809,0.033307,-0.081587,-0.090911,0.004376,0.115224,...,-0.097398,-0.126960,0.061238,-0.010634,-0.073343,0.062516,0.167249,0.021082,0.089660,0.088117
1,13,1,13_1,0.137794,0.241211,0.032959,-0.080467,-0.093196,0.008602,0.108653,...,-0.097439,-0.122606,0.051559,-0.013713,-0.078115,0.064444,0.165768,0.029060,0.085954,0.088108
2,13,2,13_2,0.146005,0.245907,0.001561,-0.068743,-0.051985,0.006599,0.125866,...,-0.124040,-0.102845,0.055133,0.015766,-0.066953,0.052463,0.126224,0.038286,0.051691,0.116719
3,13,3,13_3,0.140378,0.245704,-0.006743,-0.065213,-0.058203,0.009926,0.117768,...,-0.127208,-0.100219,0.061303,0.022392,-0.068072,0.050457,0.120086,0.027437,0.044498,0.110755
4,13,4,13_4,0.140965,0.249176,0.005743,-0.071034,-0.063098,0.012924,0.122345,...,-0.125824,-0.104658,0.069917,0.022122,-0.067908,0.056947,0.121216,0.018840,0.038354,0.106161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,19,125,19_125,0.041150,0.267610,0.052960,-0.197763,0.104450,0.025712,-0.099700,...,-0.058006,-0.074136,-0.033716,0.011847,-0.041860,0.024191,0.167323,0.122392,-0.054058,0.158443
646,19,126,19_126,0.044753,0.266801,0.058579,-0.202625,0.093912,0.024470,-0.069458,...,-0.050856,-0.076830,-0.019953,0.005234,-0.050930,0.036127,0.161945,0.129341,-0.063787,0.172965
647,19,127,19_127,0.048542,0.271988,0.051762,-0.197929,0.092518,0.020690,-0.068634,...,-0.054085,-0.077008,-0.020153,0.008492,-0.052610,0.036094,0.162711,0.131641,-0.069869,0.174006
648,19,128,19_128,0.051801,0.269485,0.051750,-0.198183,0.088989,0.019420,-0.066234,...,-0.049065,-0.073360,-0.026281,0.006967,-0.056533,0.039534,0.158736,0.124636,-0.077828,0.173008


Now `df_X_train` and `df_X_test` should have columns `MatchID`, `PeriodID`, `tweet_vector`. `tweet_vector` is just 200 columns.

`df_y_train` and `df_y_test` should have 1 column, `EventType`

The match IDs are grouped together so all the rows of the same, and the period IDs are ordered chronologically.

In [24]:
# Now we have df_X_train, df_X_test, df_y_train, df_y_test
# We no longer need df so we should free up the memory
del df  # Remove reference to the original DataFrame
gc.collect()  # Force garbage collection to free up memory

0

In [25]:
max_periods = df_X_train.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods
# As we can see not every match has the same number of periods!

Unnamed: 0,MatchID,PeriodID
0,0,129
1,1,129
2,2,129
3,3,129
4,4,169
5,5,129
6,7,129
7,8,129
8,10,179
9,11,129


In [26]:
max_periods = df_X_test.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods

Unnamed: 0,MatchID,PeriodID
0,13,129
1,14,129
2,17,129
3,18,129
4,19,129


## Format data for PyTorch LSTM

In [27]:
# input tensor for a PyTorch LSTM should have the shape of (when setting batch_first=True)
# (batch_size, seq_len, num_features) when using the batch_first=True parameter
# batch_size is number of sequences processed at once

# TRY WITHOUT SLIDING WINDOW APPROACH
#    which would mean batch size = number of matches
#    much easier to format for LSTM as 3D tensor
#    dimension of 3D tensor with batch_first=True:(batch_size = num_matches, seq_len = num_periods, num _features = 200)
#    (match_id, period_id, num_features=200)
#     not every match has the same number of periods!, so seq_len can vary between different matches
#     fix: pad with zeroes
# we want X_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector


In [28]:
def convert_df_to_3D_tensor(df_X, df_y):
    # df_X should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
    # Rows with same matchID should be grouped together (adjacent rows)
    # df_y should have one column (the EventType)
    # Returns tensor_X numpy array already padded! shape: (num_matches, max_num_periods, num _features = 200)
    # and tensor_y of shape: (num_matches, max_num_periods) 
    
    num_matches = len(df_X['MatchID'].unique())
    max_periods = df_X.groupby('MatchID')['PeriodID'].max().reset_index()
    total_max_period = max_periods['PeriodID'].max()
    # total_max_period is max seq len

    tensor_X = np.zeros((num_matches, total_max_period+1, 200))

    tensor_y = np.zeros((num_matches, total_max_period+1))
    print(tensor_X.shape)
    print(tensor_y.shape)
    
    i=0
    previous_match_id = df_X['MatchID'][0]
    for row_index, row in df_X.iterrows():
        match_id = int(row['MatchID'])

        if match_id != previous_match_id:
            i+=1
            previous_match_id = match_id
        
        period_id = int(row['PeriodID'])
        
        features = row[3:].values  # Skip ID, MatchID and PeriodID
        tensor_X[i, period_id, :] = features
        tensor_y[i,period_id] = df_y[row_index]
        
    return tensor_X, tensor_y


X_train_tensor, y_train_tensor = convert_df_to_3D_tensor(df_X_train, df_y_train)
# X_train_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector
# y_train_tensor[match_id][period_id] to return corresponding EventType (1 or 0)
# match_id index starts at 0 even if first match in df doesnt have match id 0

(11, 180, 200)
(11, 180)


In [29]:
# SCALE DATA? minmaxscaler for example!
# SCALING MIGHT BE UNNECESSARY SINCE OUTPUT OF GLOVE TWEET 200 IS ALREADY SCALED BETWEEN -1 AND 1
#scaler = MinMaxScaler()
#tensor = scaler.fit_transform(tensor)

# CONVERT TO PYTORCH TENSOR
X_train_tensor = torch.tensor(X_train_tensor, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_tensor, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)
# X_train_tensor, y_train_tensor are now pytorch tensors

torch.Size([11, 180, 200])
torch.Size([11, 180])


## LSTM Model

In [30]:
# TODO VERIFY ITS CORRECT + MAKE MORE SOPHISTICATED
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        out = self.sigmoid(out) # Applying sigmoid to convert to probabilities
        return out.squeeze(-1)

#TODOOOOOOOOOO torch.nn.utils.rnn.pack_padded_sequence. This allows the model to ignore the padded values during computation.

## Train model

In [31]:
hidden_size = 500 # can tune
num_layers = 4 # can tune
dropout_rate = 0.2 # can tune
num_epochs = 500 # can tune
lr = 0.001 # can tune

model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # Great for binary classification
# print(f"Shape of X_train_tensor: {X_train_tensor.shape}")
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    # print(f"shape of outputs: {outputs.shape}")
    
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on training data)")

Epoch [0/500], Loss: 0.6949
Epoch [10/500], Loss: 0.5697
Epoch [20/500], Loss: 0.5450
Epoch [30/500], Loss: 0.5380
Epoch [40/500], Loss: 0.5297
Epoch [50/500], Loss: 0.5245
Epoch [60/500], Loss: 0.5397
Epoch [70/500], Loss: 0.5398
Epoch [80/500], Loss: 0.5324
Epoch [90/500], Loss: 0.5275
Epoch [100/500], Loss: 0.5246
Epoch [110/500], Loss: 0.5226
Epoch [120/500], Loss: 0.5140
Epoch [130/500], Loss: 0.5162
Epoch [140/500], Loss: 0.5178
Epoch [150/500], Loss: 0.5045
Epoch [160/500], Loss: 0.4781
Epoch [170/500], Loss: 0.5066
Epoch [180/500], Loss: 0.4766
Epoch [190/500], Loss: 0.4443
Epoch [200/500], Loss: 0.4171
Epoch [210/500], Loss: 0.3904
Epoch [220/500], Loss: 0.3710
Epoch [230/500], Loss: 0.3422
Epoch [240/500], Loss: 0.3489
Epoch [250/500], Loss: 0.3264
Epoch [260/500], Loss: 0.2865
Epoch [270/500], Loss: 0.3147
Epoch [280/500], Loss: 0.2748
Epoch [290/500], Loss: 0.2453
Epoch [300/500], Loss: 0.2347
Epoch [310/500], Loss: 0.2133
Epoch [320/500], Loss: 0.1907
Epoch [330/500], Loss

## Evaluate on test data

In [32]:
# Convert df_X_test and df_y_test to correct format/dimensions
X_test_tensor, y_test_tensor = convert_df_to_3D_tensor(df_X_test, df_y_test)
# CONVERT TO PYTORCH TENSOR
X_test_tensor = torch.tensor(X_test_tensor, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_tensor, dtype=torch.float32)

(5, 130, 200)
(5, 130)


In [33]:
model.eval()

with torch.no_grad():
    predictions = model(X_test_tensor)

# Predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output
#print(predictions)

# This converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0
predicted_classes = (predictions > 0.5).float() # 0.5 is threshold
 

In [34]:
# Performance metrics

loss = criterion(predictions, y_test_tensor) # use predictions for loss calculation

print(f"Binary Cross-Entropy Loss: {loss.item():.4f}")

def accuracy(y_true, y_pred):
    if y_true.dtype != y_pred.dtype or y_true.shape != y_pred.shape:
        raise ValueError(f"Inputs do not have same type or shape!")
    correct_predictions = (y_true == y_pred).sum().item()
    total_predictions = y_true.numel()
    accuracy = correct_predictions / total_predictions * 100
    return accuracy
accuracy = accuracy(y_test_tensor, predicted_classes)


print(f"Accuracy: {accuracy:.4f}")

#print(y_test_tensor.shape)
#print(predicted_classes.shape)


# Visualization of Actual vs Predicted Classes
# import matplotlib.pyplot as plt
# TODO COULD USE PLT TO VISUALIZE?

Binary Cross-Entropy Loss: 1.8329
Accuracy: 65.5385


# Application of the model (Kaggle submission)

## Retrain the model on the entire data

In [35]:
# RETRAIN MODEL ON ENTIRE TRAINING DATA AND EVALUATE EVAL TWEETS

df_X = pd.concat([df_X_train, df_X_test], ignore_index=True)
df_y = pd.concat([df_y_train, df_y_test], ignore_index=True)
print(df_X['MatchID'].unique())
print(df_X.shape)
print(df_y.shape)


[ 0  1  2  3  4  5  7  8 10 11 12 13 14 17 18 19]
(2137, 203)
(2137,)


In [36]:
df_X

Unnamed: 0,MatchID,PeriodID,ID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0_0,0.142700,0.273355,0.024052,-0.100410,-0.056623,0.047082,0.107778,...,-0.095398,-0.058165,0.056072,-0.000830,-0.073885,-0.005746,0.174125,0.093547,-0.009831,0.145556
1,0,1,0_1,0.155795,0.274697,0.022707,-0.114188,-0.078044,0.046390,0.074244,...,-0.094809,-0.053387,0.067755,-0.000557,-0.106459,0.007881,0.165998,0.100027,-0.008115,0.127290
2,0,2,0_2,0.149349,0.291958,0.035742,-0.123525,-0.054195,0.042744,0.082248,...,-0.112641,-0.056553,0.073958,0.001823,-0.102801,0.003312,0.174950,0.081542,-0.008284,0.130799
3,0,3,0_3,0.160484,0.263250,0.003070,-0.158384,-0.055241,0.047692,0.073075,...,-0.122675,-0.047729,0.064131,0.015966,-0.102371,0.001360,0.156838,0.095372,0.014287,0.130726
4,0,4,0_4,0.159678,0.268265,-0.034739,-0.143102,-0.086689,0.054459,0.009802,...,-0.117430,-0.039636,0.030168,0.018995,-0.101448,0.020565,0.149877,0.092729,-0.011608,0.149763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,19,125,19_125,0.041150,0.267610,0.052960,-0.197763,0.104450,0.025712,-0.099700,...,-0.058006,-0.074136,-0.033716,0.011847,-0.041860,0.024191,0.167323,0.122392,-0.054058,0.158443
2133,19,126,19_126,0.044753,0.266801,0.058579,-0.202625,0.093912,0.024470,-0.069458,...,-0.050856,-0.076830,-0.019953,0.005234,-0.050930,0.036127,0.161945,0.129341,-0.063787,0.172965
2134,19,127,19_127,0.048542,0.271988,0.051762,-0.197929,0.092518,0.020690,-0.068634,...,-0.054085,-0.077008,-0.020153,0.008492,-0.052610,0.036094,0.162711,0.131641,-0.069869,0.174006
2135,19,128,19_128,0.051801,0.269485,0.051750,-0.198183,0.088989,0.019420,-0.066234,...,-0.049065,-0.073360,-0.026281,0.006967,-0.056533,0.039534,0.158736,0.124636,-0.077828,0.173008


In [37]:
df_y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
2132    1.0
2133    1.0
2134    1.0
2135    1.0
2136    1.0
Name: EventType, Length: 2137, dtype: float64

In [38]:
# Convert df_X_test and df_y_test to correct format/dimensions
X_tensor, y_tensor = convert_df_to_3D_tensor(df_X, df_y)
# CONVERT TO PYTORCH TENSOR
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor, dtype=torch.float32)

(16, 180, 200)
(16, 180)


In [39]:
# Retrain model on all 16 matches (with same hyper parameters)
model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # Great for binary classification

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_tensor)
    
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on all 16 matches)")

Epoch [0/500], Loss: 0.6929
Epoch [10/500], Loss: 0.5734
Epoch [20/500], Loss: 0.5475
Epoch [30/500], Loss: 0.5179
Epoch [40/500], Loss: 0.5503
Epoch [50/500], Loss: 0.5464
Epoch [60/500], Loss: 0.5328
Epoch [70/500], Loss: 0.5246
Epoch [80/500], Loss: 0.5193
Epoch [90/500], Loss: 0.5127
Epoch [100/500], Loss: 0.5229
Epoch [110/500], Loss: 0.5240
Epoch [120/500], Loss: 0.5204
Epoch [130/500], Loss: 0.5166
Epoch [140/500], Loss: 0.5071
Epoch [150/500], Loss: 0.5706
Epoch [160/500], Loss: 0.5188
Epoch [170/500], Loss: 0.5192
Epoch [180/500], Loss: 0.5156
Epoch [190/500], Loss: 0.5120
Epoch [200/500], Loss: 0.5046
Epoch [210/500], Loss: 0.4882
Epoch [220/500], Loss: 0.4764
Epoch [230/500], Loss: 0.4665
Epoch [240/500], Loss: 0.4506
Epoch [250/500], Loss: 0.4663
Epoch [260/500], Loss: 0.4321
Epoch [270/500], Loss: 0.4215
Epoch [280/500], Loss: 0.4162
Epoch [290/500], Loss: 0.4029
Epoch [300/500], Loss: 0.3931
Epoch [310/500], Loss: 0.3721
Epoch [320/500], Loss: 0.3728
Epoch [330/500], Loss

## Model prediction on the evaluation set

In [40]:
df_eval = pd.read_feather("processed_data/eval_tweets_processed.feather")
df_eval

Unnamed: 0,MatchID,PeriodID,ID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,6,0,6_0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.131890,...,-0.184701,-0.101496,0.081690,-0.004283,-0.057534,0.006105,0.171110,0.025386,-0.039204,0.189157
1,6,1,6_1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,...,-0.193451,-0.098110,0.085782,-0.014780,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,6,2,6_2,0.145923,0.240633,0.057680,-0.104799,0.108712,0.009395,0.081510,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,6,3,6_3,0.160460,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,6,4,6_4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,...,-0.178722,-0.094382,0.093599,-0.025704,-0.076860,0.015346,0.189978,0.027995,-0.040230,0.188706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,16,125,16_125,0.090554,0.227452,0.107798,-0.111091,0.034236,0.063783,0.053129,...,-0.198507,-0.122853,0.024147,0.000304,-0.161041,0.092954,0.143728,0.113752,-0.049263,0.056543
512,16,126,16_126,0.086859,0.236306,0.107814,-0.125368,0.026619,0.063352,0.062930,...,-0.189677,-0.125140,0.027448,0.003508,-0.144993,0.096095,0.155616,0.101075,-0.042474,0.058476
513,16,127,16_127,0.096795,0.220505,0.109566,-0.119462,0.023341,0.063660,0.084856,...,-0.180416,-0.117967,0.033671,0.007132,-0.137807,0.088667,0.160295,0.102587,-0.045250,0.052347
514,16,128,16_128,0.090397,0.227700,0.104989,-0.123692,0.035497,0.059867,0.049117,...,-0.188227,-0.115212,0.019899,0.006688,-0.167376,0.096072,0.149536,0.122288,-0.047816,0.063510


In [41]:
# NO EVENTTYPE, CAN ONLY MAKE PREDICTIONS WITHOUT KNOWING ACCURACY

# There is no df_y when we are trying to evaluate the matches in eval_tweets
# for kaggle submission!!
# Let df_y have all zeros with the same number of rows as df_X
# This is just to make code run more easily, df_y
#     and tensor_y (value returned by convert_df_to_3D_tensor) will not be used

# df_y has no real meaning, only for ease of coding!
df_y = pd.Series(0, index=df_eval.index)
df_y


0      0
1      0
2      0
3      0
4      0
      ..
511    0
512    0
513    0
514    0
515    0
Length: 516, dtype: int64

In [42]:
X_eval_tensor, _ = convert_df_to_3D_tensor(df_eval, df_y)
# CONVERT TO PYTORCH TENSOR
X_eval_tensor = torch.tensor(X_eval_tensor, dtype=torch.float32)
X_eval_tensor

(4, 130, 200)
(4, 130)


tensor([[[ 0.1589,  0.2648,  0.0580,  ...,  0.0254, -0.0392,  0.1892],
         [ 0.1563,  0.2714,  0.0593,  ...,  0.0287, -0.0375,  0.1897],
         [ 0.1459,  0.2406,  0.0577,  ...,  0.1053, -0.0230,  0.2024],
         ...,
         [ 0.1209,  0.2895,  0.1666,  ...,  0.1049, -0.0291,  0.0956],
         [ 0.1459,  0.2925,  0.1178,  ...,  0.0700, -0.0339,  0.1269],
         [ 0.1424,  0.2870,  0.1126,  ...,  0.0668, -0.0390,  0.1218]],

        [[ 0.0159,  0.1494,  0.1740,  ...,  0.0662,  0.0571,  0.0319],
         [-0.0012,  0.1338,  0.1826,  ...,  0.0573,  0.0487,  0.0290],
         [ 0.0076,  0.1246,  0.2063,  ...,  0.0507,  0.0662,  0.0145],
         ...,
         [ 0.0140,  0.0887,  0.1781,  ...,  0.0982,  0.0193,  0.0239],
         [ 0.0052,  0.0957,  0.1780,  ...,  0.0993,  0.0218,  0.0250],
         [ 0.0159,  0.0961,  0.1673,  ...,  0.1030,  0.0149,  0.0278]],

        [[ 0.1054,  0.2370,  0.0124,  ...,  0.0596, -0.0988,  0.1500],
         [ 0.0932,  0.2318,  0.0044,  ...,  0

In [43]:
model.eval()

with torch.no_grad():
    predictions = model(X_eval_tensor)

# Predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output

# This converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0
predicted_classes = (predictions > 0.5).float() # 0.5 is threshold
print(predicted_classes.shape)
predicted_classes


torch.Size([4, 130])


tensor([[0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1.,
         1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
         0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
  

In [44]:
df_eval.head()

Unnamed: 0,MatchID,PeriodID,ID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,6,0,6_0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.13189,...,-0.184701,-0.101496,0.08169,-0.004283,-0.057534,0.006105,0.17111,0.025386,-0.039204,0.189157
1,6,1,6_1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,...,-0.193451,-0.09811,0.085782,-0.01478,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,6,2,6_2,0.145923,0.240633,0.05768,-0.104799,0.108712,0.009395,0.08151,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,6,3,6_3,0.16046,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,6,4,6_4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,...,-0.178722,-0.094382,0.093599,-0.025704,-0.07686,0.015346,0.189978,0.027995,-0.04023,0.188706


In [None]:
#TODO MAKE CSV OF OUTPUT WITH CORRECT MATCH IDS?
# For the duplicate period ids, just use same prediction for that period id!

"""order = [6, 16, 9, 15]
sorted_predictions = []
id_predictions = []
matchid_predictions = []
for row_index, row in df_eval.iterrows():
    sorted_predictions.append(float(predicted_classes[int(row["MatchID"]), int(row["PeriodID"])]))
    matchid_predictions.append(order[int(row["MatchID"])])
    id_predictions.append(str(matchid_predictions[-1]) + "_" + str(int(row["PeriodID"])))

prediction_tab = pd.DataFrame(df_eval["PeriodID"])
prediction_tab['ID'] = id_predictions
prediction_tab["EventType"] = sorted_predictions
prediction_tab["MatchID"] = matchid_predictions
prediction_tab.sort_values(by=['MatchID', 'PeriodID'], inplace=True)
prediction_tab.drop(columns=["MatchID", 'PeriodID'], inplace=True)"""


match_ids = {6 : 0, 9 : 1, 15 : 2, 16 : 3}
sorted_predictions = []
for row_index, row in df_eval.iterrows():
    sorted_predictions.append(float(predicted_classes[match_ids[int(row["MatchID"])], int(row["PeriodID"])]))

prediction_tab = pd.DataFrame(df_eval[["ID"]])
prediction_tab["EventType"] = sorted_predictions

submission_filename = "submissions/Submission " + datetime.datetime.now().strftime("%d%m-%H%M%S") + ".csv"

prediction_tab.to_csv(submission_filename, index=False)


In [None]:
# NOTES
# HOW TO MAKE SURE THAT we:
# 1. DO NOT ignore the order of the tweets -> (LSTM)
# 2. Treat each time period as RELATED to the football match they belong to -> treat each match as a sequence, train LSTM on every sequence
#                      since pytorch tensor expects multiple sequences (batches)



# for LSTM: Each input sequence should consist of tweets from a specific match, ordered by Period ID.
#   Tweets of different matches are unrelated, but tweets of a same match are related sequentially (chronologically)
#   Structure training data such that tweets are grouped by match id, and ordered by period id