# Import libraries and model

In [1]:
from os import listdir, path
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gc
import gensim.downloader as api

import torch
import torch.nn as nn

from math import ceil
import torch.optim as optim
import csv

nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victormicha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/victormicha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing

In [2]:
# Read all training files and concatenate them into one dataframe

#import os
#print(os.getcwd())

li = []
i = 0
for filename in listdir("train_tweets"):
    if filename != '.ipynb_checkpoints':
        print(filename)
        df = pd.read_csv("train_tweets/" + filename)
        df.drop(columns=['Timestamp'], inplace=True)
        # drop unused column(s)
        df['MatchID'] = str(i)
        df['ID'] = str(i)+ '_' + df['PeriodID'].astype(str)
        # makes sure that the match IDs are ordered from 0,1,2... with no missing values
        # this is for convenience and so it is easier to debug and follow along
        i+=1
        li.append(df)
df = pd.concat(li, ignore_index=True)
#print(len(df))
df

AustraliaSpain34.csv
PortugalGhana58.csv
CameroonBrazil36.csv
GermanyBrazil74.csv
BelgiumSouthKorea59.csv
NetherlandsChile35.csv
GermanyAlgeria67.csv
FranceGermany70.csv
MexicoCroatia37.csv
FranceNigeria66.csv
AustraliaNetherlands29.csv
HondurasSwitzerland54.csv
ArgentinaGermanyFinal77.csv
ArgentinaBelgium72.csv
USASlovenia2010.csv
GermanyUSA57.csv


Unnamed: 0,ID,MatchID,PeriodID,EventType,Tweet
0,0_0,0,0,0,RT @soccerdotcom: If #ESP beats #AUS we'll giv...
1,0_0,0,0,0,Visit the #SITEP official web site here http:/...
2,0_0,0,0,0,RT @soccerdotcom: If #ESP beats #AUS we'll giv...
3,0_0,0,0,0,RT @worldsoccershop: If there is a winner in t...
4,0_0,0,0,0,RT @soccerdotcom: If #AUS beats #ESP we'll giv...
...,...,...,...,...,...
5056045,15_129,15,129,1,RT @BBCSport: Portugal fourth team in top 10 o...
5056046,15_129,15,129,1,RT @NBCSports: USA MOVES ON! Germany beats #US...
5056047,15_129,15,129,1,Ronaldo could have easily scored 4-5 goals ton...
5056048,15_129,15,129,1,RT @TheSelenatorBoy: Ppl getting mad bc Pepe i...


In [3]:
# Preprocessing of tweet
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [4]:
# Apply preprocessing to each tweet
df['Tweet'] = df['Tweet'].apply(preprocess_text)
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,Tweet
0,0_0,0,0,0,rt soccerdotcom esp beat au well give away spa...
1,0_0,0,0,0,visit sitep official web site httptcoehzkslan ...
2,0_0,0,0,0,rt soccerdotcom esp beat au well give away spa...
3,0_0,0,0,0,rt worldsoccershop winner au v esp match well ...
4,0_0,0,0,0,rt soccerdotcom au beat esp well give away aus...
...,...,...,...,...,...
5056045,15_129,15,129,1,rt bbcsport portugal fourth team top fifa worl...
5056046,15_129,15,129,1,rt nbcsports usa move germany beat usmnt portu...
5056047,15_129,15,129,1,ronaldo could easily scored goal tonight finis...
5056048,15_129,15,129,1,rt theselenatorboy ppl getting mad bc pepe bra...


# Tweet Embeddings

In [5]:
# Get vector tweet embeddings
# TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
#   embeddings to retain more information
#   -> can try more complex functions here
#   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [6]:
# Crashes after using all available RAM :( on google colab
# 

# obtain vector tweet embeddings
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,0.244918,0.081042,0.236453,0.027198,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
1,-0.183972,0.119888,-0.25376,0.012623,0.012891,-0.120238,-0.026952,-0.339493,0.033273,0.106456,...,0.249775,-0.15252,0.006334,-0.085193,0.005175,0.456785,-0.064834,-0.083434,0.05472,0.030099
2,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,0.244918,0.081042,0.236453,0.027198,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
3,0.209126,0.390986,-0.130056,-0.068354,-0.096441,0.010439,0.074133,0.04572,0.215201,0.200725,...,-0.235941,-0.005941,0.070192,0.024676,0.003736,0.074399,0.169565,0.024788,0.028519,0.177178
4,0.16164,0.308513,-0.093269,0.001645,-0.071475,0.003183,0.22516,0.069612,0.229182,0.051714,...,-0.164617,-0.078824,0.064404,-0.035373,-0.01658,0.003644,0.010155,0.036428,-0.095518,0.084394


In [7]:
# Attach the vectors into the original dataframe
df = pd.concat([df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
# no need for Tweet column since we have its corresponding vector embedding
df.drop(columns=['Tweet'], inplace=True)



In [8]:
# by now should have df with columns: ID, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,0_0,0,0,0,0.148069,0.342504,-0.097915,0.002166,-0.059840,0.025755,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.004930,-0.005967,-0.108431,0.070640
1,0_0,0,0,0,-0.183972,0.119888,-0.253760,0.012623,0.012891,-0.120238,...,0.249775,-0.152520,0.006334,-0.085193,0.005175,0.456785,-0.064834,-0.083434,0.054720,0.030099
2,0_0,0,0,0,0.148069,0.342504,-0.097915,0.002166,-0.059840,0.025755,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.004930,-0.005967,-0.108431,0.070640
3,0_0,0,0,0,0.209126,0.390986,-0.130056,-0.068354,-0.096441,0.010439,...,-0.235941,-0.005941,0.070192,0.024676,0.003736,0.074399,0.169565,0.024788,0.028519,0.177178
4,0_0,0,0,0,0.161640,0.308513,-0.093269,0.001645,-0.071475,0.003183,...,-0.164617,-0.078824,0.064404,-0.035373,-0.016580,0.003644,0.010155,0.036428,-0.095518,0.084394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5056045,15_129,15,129,1,0.145174,0.190100,0.214790,-0.310834,0.050761,0.039853,...,-0.122601,-0.259632,0.023675,-0.151280,-0.023655,0.116062,0.097146,0.071710,0.007577,0.182598
5056046,15_129,15,129,1,0.328279,0.334743,0.125396,-0.164282,-0.078111,0.175972,...,-0.183001,-0.166399,0.161126,-0.056147,-0.037496,0.046396,0.347816,-0.070108,0.051800,0.209709
5056047,15_129,15,129,1,0.279302,0.184175,0.197833,-0.072442,0.001534,0.218018,...,-0.427638,-0.113268,0.022538,-0.108198,0.106550,-0.147467,0.300702,-0.088761,0.043255,0.272322
5056048,15_129,15,129,1,0.054918,0.149426,0.001621,0.107246,-0.106812,0.091331,...,-0.114883,-0.071184,-0.071171,-0.123343,0.146086,-0.073930,0.174728,0.167955,-0.172603,0.042918


In [9]:
# group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match
# decreases size of data + makes it easier to fit into LSTM model
df = df.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
df.drop(columns=['ID'], inplace=True) 
df['MatchID'] = df['MatchID'].astype(int)
df['PeriodID'] = df['PeriodID'].astype(int)
# need to convert to int before sorting
df.sort_values(by=['MatchID', 'PeriodID'], inplace=True)
df.reset_index(drop=True, inplace=True)


In [10]:
df

Unnamed: 0,MatchID,PeriodID,EventType,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.0,0.153605,0.295456,-0.071657,-0.067463,-0.036952,0.021420,0.068080,...,-0.148767,-0.054045,0.062174,0.002222,-0.051431,0.047291,0.116521,0.044131,-0.057307,0.098845
1,0,1,0.0,0.154335,0.307603,-0.075244,-0.065478,-0.043863,0.017503,0.080918,...,-0.159720,-0.055965,0.072527,0.000227,-0.048192,0.054950,0.110632,0.037886,-0.052907,0.090703
2,0,2,1.0,0.143487,0.288700,-0.079885,-0.075364,-0.048674,0.031021,0.051307,...,-0.140722,-0.049518,0.066939,0.005431,-0.058602,0.064900,0.113738,0.036233,-0.055866,0.088050
3,0,3,0.0,0.144639,0.291448,-0.073258,-0.071962,-0.046910,0.031197,0.071218,...,-0.144395,-0.041473,0.061161,-0.007658,-0.053164,0.047168,0.134569,0.051628,-0.065686,0.068157
4,0,4,0.0,0.165457,0.270594,-0.064547,-0.099924,-0.053324,0.047209,0.071359,...,-0.139803,-0.017011,0.061073,-0.017830,-0.067607,0.056875,0.139652,0.040223,-0.053406,0.065884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,125,1.0,0.206152,0.219004,0.111657,-0.066945,-0.078561,0.122580,0.056943,...,-0.227824,-0.137099,0.033568,-0.069731,0.019147,0.016467,0.178041,-0.017193,-0.052301,0.133755
2133,15,126,1.0,0.207868,0.218072,0.104710,-0.049019,-0.085458,0.122823,0.084739,...,-0.238725,-0.131323,0.029688,-0.069378,0.018137,0.003592,0.175383,-0.013808,-0.057169,0.129336
2134,15,127,1.0,0.208063,0.218851,0.104856,-0.050168,-0.083898,0.126063,0.087578,...,-0.237974,-0.130646,0.027115,-0.068678,0.015409,-0.000151,0.173740,-0.014482,-0.056237,0.131228
2135,15,128,1.0,0.184386,0.230244,0.096840,-0.045003,-0.075028,0.105333,0.075484,...,-0.241519,-0.130642,0.009970,-0.068182,0.041595,-0.004035,0.180474,-0.023068,-0.055477,0.134162


# Separate Train and Test data

In [11]:
# train on of the first 13 of 16 matches (16*0.8=12.8~=13)
# and the test data would be the last 3 matches. 
# Before submitting on Kaggle we should train on full dataset, so al 16 matches
train_percentage = 0.8
unique_match_ids = df['MatchID'].unique()
print(unique_match_ids)
num_matches_training = int(ceil(len(unique_match_ids)*train_percentage))
print(num_matches_training)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
13


In [12]:
target_match_id = num_matches_training
# target_match_id is first match id that will appear in test set
# all matches from target_match_id and after will be in test test
print(target_match_id)

13


In [13]:

#df2 = df['MatchID'] == 15
#df2

In [14]:
# row_index is first row with match id target_match_id
# row_index is then the first row of the matches that will go to the test


row_index = (df['MatchID'] == target_match_id).idxmax()
#row_index = df[df['MatchID'] == target_match_id].first_valid_index()
df_X_train = df[:row_index].copy()
df_X_test = df[row_index:].copy()


In [15]:
df_y_train = df_X_train['EventType']
df_y_test = df_X_test['EventType']

In [16]:
df_y_train

0       0.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
1742    1.0
1743    1.0
1744    1.0
1745    1.0
1746    1.0
Name: EventType, Length: 1747, dtype: float64

In [17]:
df_y_test.reset_index(drop=True, inplace=True)
df_y_test

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
385    1.0
386    1.0
387    1.0
388    1.0
389    1.0
Name: EventType, Length: 390, dtype: float64

In [18]:
df_X_train.drop(['EventType'], axis=1, inplace=True)
df_X_test.drop(['EventType'], axis=1, inplace=True)

In [19]:
df_X_train

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.153605,0.295456,-0.071657,-0.067463,-0.036952,0.021420,0.068080,0.053640,...,-0.148767,-0.054045,0.062174,0.002222,-0.051431,0.047291,0.116521,0.044131,-0.057307,0.098845
1,0,1,0.154335,0.307603,-0.075244,-0.065478,-0.043863,0.017503,0.080918,0.055528,...,-0.159720,-0.055965,0.072527,0.000227,-0.048192,0.054950,0.110632,0.037886,-0.052907,0.090703
2,0,2,0.143487,0.288700,-0.079885,-0.075364,-0.048674,0.031021,0.051307,0.040299,...,-0.140722,-0.049518,0.066939,0.005431,-0.058602,0.064900,0.113738,0.036233,-0.055866,0.088050
3,0,3,0.144639,0.291448,-0.073258,-0.071962,-0.046910,0.031197,0.071218,0.054919,...,-0.144395,-0.041473,0.061161,-0.007658,-0.053164,0.047168,0.134569,0.051628,-0.065686,0.068157
4,0,4,0.165457,0.270594,-0.064547,-0.099924,-0.053324,0.047209,0.071359,0.024932,...,-0.139803,-0.017011,0.061073,-0.017830,-0.067607,0.056875,0.139652,0.040223,-0.053406,0.065884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1742,12,175,0.148736,0.254114,0.071930,-0.081893,-0.053124,0.076305,0.187860,0.095647,...,-0.170393,-0.143958,0.085017,-0.068650,-0.114114,-0.004780,0.128914,0.015764,-0.063717,0.125779
1743,12,176,0.151184,0.249580,0.068957,-0.073939,-0.058152,0.084052,0.191258,0.095749,...,-0.179819,-0.142343,0.083550,-0.068391,-0.116784,-0.005575,0.125730,0.013425,-0.069919,0.134599
1744,12,177,0.150791,0.249956,0.067933,-0.073984,-0.049316,0.077864,0.184421,0.098590,...,-0.177149,-0.138094,0.079852,-0.066268,-0.112408,-0.003402,0.122552,0.013779,-0.067702,0.131713
1745,12,178,0.148567,0.244226,0.063626,-0.068488,-0.053766,0.075960,0.187324,0.098316,...,-0.177631,-0.133941,0.077539,-0.064008,-0.109018,-0.005017,0.126014,0.011561,-0.066714,0.124967


In [20]:
df_X_test.reset_index(drop=True, inplace=True)
df_X_test

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,13,0,0.122099,0.259289,0.021811,-0.091114,-0.020353,0.027769,0.110603,0.024921,...,-0.191140,-0.069440,0.019676,-0.021974,-0.065337,-0.005958,0.155431,0.028152,-0.102278,0.169321
1,13,1,0.118798,0.257246,0.022628,-0.091629,-0.030559,0.027826,0.097386,0.022053,...,-0.196974,-0.067863,0.016041,-0.022768,-0.066973,-0.010289,0.156989,0.027647,-0.104473,0.168100
2,13,2,0.120084,0.244924,0.021755,-0.087242,-0.043526,0.036788,0.112539,0.011313,...,-0.185856,-0.067514,0.009049,-0.031775,-0.063545,-0.013830,0.166954,0.020025,-0.102942,0.172742
3,13,3,0.113977,0.246675,0.032208,-0.093585,-0.039601,0.042192,0.097110,0.006285,...,-0.178769,-0.069509,-0.006021,-0.024460,-0.073347,-0.017333,0.173202,0.023651,-0.108952,0.177780
4,13,4,0.118590,0.251655,0.035730,-0.097995,-0.043148,0.033088,0.094264,0.009966,...,-0.182203,-0.071062,-0.001199,-0.028591,-0.081255,-0.013600,0.180348,0.019224,-0.112693,0.181622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,15,125,0.206152,0.219004,0.111657,-0.066945,-0.078561,0.122580,0.056943,0.064981,...,-0.227824,-0.137099,0.033568,-0.069731,0.019147,0.016467,0.178041,-0.017193,-0.052301,0.133755
386,15,126,0.207868,0.218072,0.104710,-0.049019,-0.085458,0.122823,0.084739,0.065784,...,-0.238725,-0.131323,0.029688,-0.069378,0.018137,0.003592,0.175383,-0.013808,-0.057169,0.129336
387,15,127,0.208063,0.218851,0.104856,-0.050168,-0.083898,0.126063,0.087578,0.064920,...,-0.237974,-0.130646,0.027115,-0.068678,0.015409,-0.000151,0.173740,-0.014482,-0.056237,0.131228
388,15,128,0.184386,0.230244,0.096840,-0.045003,-0.075028,0.105333,0.075484,0.067555,...,-0.241519,-0.130642,0.009970,-0.068182,0.041595,-0.004035,0.180474,-0.023068,-0.055477,0.134162


In [21]:
# now df_X_train and df_X_test should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
# df_y_train and df_y_test should have 1 column, EventType
# the matchids are grouped together so all the rows of the same
# match ids are grouped next to each other, and the periodID are ordered chronologically.

In [22]:
# now we have df_X_train, df_X_test, df_y_train, df_y_test
# we no longer need df so we should free up the memory
del df  # remove reference to the original DataFrame
gc.collect()  # force garbage collection to free up memory

0

In [23]:
max_periods = df_X_train.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods
# as we can see not every match has the same number of periods!

Unnamed: 0,MatchID,PeriodID
0,0,129
1,1,129
2,2,129
3,3,129
4,4,129
5,5,129
6,6,169
7,7,129
8,8,129
9,9,129


In [24]:
max_periods = df_X_test.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods

Unnamed: 0,MatchID,PeriodID
0,13,129
1,14,129
2,15,129


# Format data for PyTorch LSTM

In [25]:
# input tensor for a PyTorch LSTM should have the shape of (when setting batch_first=True)
# (batch_size, seq_len, num_features) when using the batch_first=True parameter
# batch_size is number of sequences processed at once

# TRY WITHOUT SLIDING WINDOW APPROACH
#    which would mean batch size = number of matches
#    much easier to format for LSTM as 3D tensor
#    dimension of 3D tensor with batch_first=True:(batch_size = num_matches, seq_len = num_periods, num _features = 200)
#    (match_id, period_id, num_features=200)
#     not every match has the same number of periods!, so seq_len can vary between different matches
#     fix: pad with zeroes
# we want X_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector


In [26]:
def convert_df_to_3D_tensor(df_X, df_y):
    # df_X should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
    # rows with same matchID should be grouped together (adjacent rows)
    # df_y should have one column (the EventType)
    # returns tensor_X numpy array already padded! shape: (num_matches, max_num_periods, num _features = 200)
    # and tensor_y of shape: (num_matches, max_num_periods) 
    
    num_matches = len(df_X['MatchID'].unique())
    max_periods = df_X.groupby('MatchID')['PeriodID'].max().reset_index()
    total_max_period = max_periods['PeriodID'].max()
    #total_max_period is max seq len

    tensor_X = np.zeros((num_matches, total_max_period+1, 200))

    tensor_y = np.zeros((num_matches, total_max_period+1))
    print(tensor_X.shape)
    print(tensor_y.shape)
    
    i=0
    previous_match_id = df_X['MatchID'][0]
    for row_index, row in df_X.iterrows():
        match_id = int(row['MatchID'])

        if match_id != previous_match_id:
            i+=1
            previous_match_id = match_id
        
        period_id = int(row['PeriodID'])
        
        features = row[2:].values  # Skip MatchID and PeriodID
        tensor_X[i, period_id, :] = features
        tensor_y[i,period_id] = df_y[row_index]
        
    return tensor_X, tensor_y


X_train_tensor, y_train_tensor = convert_df_to_3D_tensor(df_X_train, df_y_train)
# X_train_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector
# y_train_tensor[match_id][period_id] to return corresponding EventType (1 or 0)
# match_id index starts at 0 even if first match in df doesnt have match id 0
#X_train_tensor[12][175]
#X_train_tensor[12][179]
#X_train_tensor[2][129]


(13, 180, 200)
(13, 180)


In [27]:
#print(X_train_tensor[0][3])
#print(y_train_tensor[0][3])

In [28]:
# SCALE DATA? minmaxscaler for example!
# SCALING MIGHT BE UNNECESSARY SINCE OUTPUT OF GLOVE TWEET 200 IS ALREADY SCALED BETWEEN -1 AND 1
#scaler = MinMaxScaler()
#tensor = scaler.fit_transform(tensor)

# CONVERT TO PYTORCH TENSOR
X_train_tensor = torch.tensor(X_train_tensor, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_tensor, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)
# X_train_tensor, y_train_tensor are now pytorch tensors

torch.Size([13, 180, 200])
torch.Size([13, 180])


# LSTM Model

In [29]:
# TODO VERIFY ITS CORRECT + MAKE MORE SOPHISTICATED
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        out = self.sigmoid(out) # applying sigmoid to convert to probabilities
        return out.squeeze(-1)

#TODOOOOOOOOOO torch.nn.utils.rnn.pack_padded_sequence. This allows the model to ignore the padded values during computation.

# Train model

In [30]:
hidden_size = 500 # can tune
num_layers = 4 # can tune
dropout_rate = 0.2 # can tune
num_epochs = 500 # can tune
lr = 0.001 # can tune

model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # great for binary classification
#print(f"Shape of X_train_tensor: {X_train_tensor.shape}")
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    #print(f"shape of outputs: {outputs.shape}")
    
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on training data)")

Epoch [0/500], Loss: 0.6914
Epoch [10/500], Loss: 0.5671
Epoch [20/500], Loss: 0.5464
Epoch [30/500], Loss: 0.6369
Epoch [40/500], Loss: 0.5444
Epoch [50/500], Loss: 0.5399
Epoch [60/500], Loss: 0.5315
Epoch [70/500], Loss: 0.5259
Epoch [80/500], Loss: 0.5198
Epoch [90/500], Loss: 0.5096
Epoch [100/500], Loss: 0.5085
Epoch [110/500], Loss: 0.4903
Epoch [120/500], Loss: 0.4759
Epoch [130/500], Loss: 0.4667
Epoch [140/500], Loss: 0.4597
Epoch [150/500], Loss: 0.4450
Epoch [160/500], Loss: 0.4380
Epoch [170/500], Loss: 0.4161
Epoch [180/500], Loss: 0.5250
Epoch [190/500], Loss: 0.5192
Epoch [200/500], Loss: 0.7014
Epoch [210/500], Loss: 0.6746
Epoch [220/500], Loss: 0.6721
Epoch [230/500], Loss: 0.6378
Epoch [240/500], Loss: 0.5321
Epoch [250/500], Loss: 0.5405
Epoch [260/500], Loss: 0.5177
Epoch [270/500], Loss: 0.5151
Epoch [280/500], Loss: 0.5084
Epoch [290/500], Loss: 0.5012
Epoch [300/500], Loss: 0.5516
Epoch [310/500], Loss: 0.5368
Epoch [320/500], Loss: 0.5192
Epoch [330/500], Loss

# Evaluate on test data

In [31]:
# convert df_X_test and df_y_test to correct format/dimensions
X_test_tensor, y_test_tensor = convert_df_to_3D_tensor(df_X_test, df_y_test)
# CONVERT TO PYTORCH TENSOR
X_test_tensor = torch.tensor(X_test_tensor, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_tensor, dtype=torch.float32)

(3, 130, 200)
(3, 130)


In [32]:
#print(X_test_tensor[2][129])
#print(y_test_tensor[2][129])

In [33]:




model.eval()

with torch.no_grad():
    predictions = model(X_test_tensor)

# predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output
#print(predictions)

predicted_classes = (predictions > 0.5).float() # 0.5 is threshold
#this converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0

#print(predicted_classes)
 

In [34]:
#print(predictions.shape)

In [35]:
# performance metrics

loss = criterion(predictions, y_test_tensor) # use predictions for loss calculation

print(f"Binary Cross-Entropy Loss: {loss.item():.4f}")

def accuracy(y_true, y_pred):
    if y_true.dtype != y_pred.dtype or y_true.shape != y_pred.shape:
        raise ValueError(f"Inputs do not have same type or shape!")
    correct_predictions = (y_true == y_pred).sum().item()
    total_predictions = y_true.numel()
    accuracy = correct_predictions / total_predictions * 100
    return accuracy
accuracy = accuracy(y_test_tensor, predicted_classes)


print(f"Accuracy: {accuracy:.4f}")

#print(y_test_tensor.shape)
#print(predicted_classes.shape)


# Visualization of Actual vs Predicted Classes
# import matplotlib.pyplot as plt
# TODO COULD USE PLT TO VISUALIZE?

Binary Cross-Entropy Loss: 0.7266
Accuracy: 62.3077


# For Kaggle Submission

In [36]:
# RETRAIN MODEL ON ENTIRE TRAINING DATA AND EVALUATE EVAL TWEETS




df_X = pd.concat([df_X_train, df_X_test], ignore_index=True)
df_y = pd.concat([df_y_train, df_y_test], ignore_index=True)
print(df_X['MatchID'].unique())
print(df_X.shape)
print(df_y.shape)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
(2137, 202)
(2137,)


In [37]:
df_X

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.153605,0.295456,-0.071657,-0.067463,-0.036952,0.021420,0.068080,0.053640,...,-0.148767,-0.054045,0.062174,0.002222,-0.051431,0.047291,0.116521,0.044131,-0.057307,0.098845
1,0,1,0.154335,0.307603,-0.075244,-0.065478,-0.043863,0.017503,0.080918,0.055528,...,-0.159720,-0.055965,0.072527,0.000227,-0.048192,0.054950,0.110632,0.037886,-0.052907,0.090703
2,0,2,0.143487,0.288700,-0.079885,-0.075364,-0.048674,0.031021,0.051307,0.040299,...,-0.140722,-0.049518,0.066939,0.005431,-0.058602,0.064900,0.113738,0.036233,-0.055866,0.088050
3,0,3,0.144639,0.291448,-0.073258,-0.071962,-0.046910,0.031197,0.071218,0.054919,...,-0.144395,-0.041473,0.061161,-0.007658,-0.053164,0.047168,0.134569,0.051628,-0.065686,0.068157
4,0,4,0.165457,0.270594,-0.064547,-0.099924,-0.053324,0.047209,0.071359,0.024932,...,-0.139803,-0.017011,0.061073,-0.017830,-0.067607,0.056875,0.139652,0.040223,-0.053406,0.065884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,125,0.206152,0.219004,0.111657,-0.066945,-0.078561,0.122580,0.056943,0.064981,...,-0.227824,-0.137099,0.033568,-0.069731,0.019147,0.016467,0.178041,-0.017193,-0.052301,0.133755
2133,15,126,0.207868,0.218072,0.104710,-0.049019,-0.085458,0.122823,0.084739,0.065784,...,-0.238725,-0.131323,0.029688,-0.069378,0.018137,0.003592,0.175383,-0.013808,-0.057169,0.129336
2134,15,127,0.208063,0.218851,0.104856,-0.050168,-0.083898,0.126063,0.087578,0.064920,...,-0.237974,-0.130646,0.027115,-0.068678,0.015409,-0.000151,0.173740,-0.014482,-0.056237,0.131228
2135,15,128,0.184386,0.230244,0.096840,-0.045003,-0.075028,0.105333,0.075484,0.067555,...,-0.241519,-0.130642,0.009970,-0.068182,0.041595,-0.004035,0.180474,-0.023068,-0.055477,0.134162


In [38]:
df_y

0       0.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
2132    1.0
2133    1.0
2134    1.0
2135    1.0
2136    1.0
Name: EventType, Length: 2137, dtype: float64

In [39]:
# convert df_X_test and df_y_test to correct format/dimensions
X_tensor, y_tensor = convert_df_to_3D_tensor(df_X, df_y)
# CONVERT TO PYTORCH TENSOR
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor, dtype=torch.float32)

(16, 180, 200)
(16, 180)


In [40]:
print(X_tensor.shape)
print(y_tensor.shape)

torch.Size([16, 180, 200])
torch.Size([16, 180])


In [41]:
# retrain model on all 16 matches (with same hyper parameters)
model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # great for binary classification

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_tensor)
    
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on all 16 matches)")

Epoch [0/500], Loss: 0.6944
Epoch [10/500], Loss: 0.5607
Epoch [20/500], Loss: 0.5294
Epoch [30/500], Loss: 0.5164
Epoch [40/500], Loss: 0.5381
Epoch [50/500], Loss: 0.5354
Epoch [60/500], Loss: 0.5286
Epoch [70/500], Loss: 0.5217
Epoch [80/500], Loss: 0.5160
Epoch [90/500], Loss: 0.5105
Epoch [100/500], Loss: 0.4951
Epoch [110/500], Loss: 0.4759
Epoch [120/500], Loss: 0.4691
Epoch [130/500], Loss: 0.4583
Epoch [140/500], Loss: 0.4874
Epoch [150/500], Loss: 0.4543
Epoch [160/500], Loss: 0.4433
Epoch [170/500], Loss: 0.4153
Epoch [180/500], Loss: 0.3991
Epoch [190/500], Loss: 0.3839
Epoch [200/500], Loss: 0.4178
Epoch [210/500], Loss: 0.3671
Epoch [220/500], Loss: 0.3501
Epoch [230/500], Loss: 0.3336
Epoch [240/500], Loss: 0.3099
Epoch [250/500], Loss: 0.3108
Epoch [260/500], Loss: 0.3059
Epoch [270/500], Loss: 0.3005
Epoch [280/500], Loss: 0.2688
Epoch [290/500], Loss: 0.2488
Epoch [300/500], Loss: 0.2599
Epoch [310/500], Loss: 0.2209
Epoch [320/500], Loss: 0.2118
Epoch [330/500], Loss

In [42]:
# READ EVAL_TWEETS AND PREPROCESS DATA

# Read all eval files and concatenate them into one dataframe

li = []
i = 0
match_id_order = {}
for filename in listdir("eval_tweets"):
    if filename != '.ipynb_checkpoints':
        print(filename)
        df_eval = pd.read_csv("eval_tweets/" + filename)
        df_eval.drop(columns=['Timestamp'], inplace=True)
        # drop unused column(s)
        print(df_eval['MatchID'].unique())
        match_id = str(df_eval['MatchID'].unique()[0])
        match_id_order[match_id] = i
        # match_id_order[match_id]  = i means that the predictions of match_id are in the ith sequence
        df_eval['MatchID'] = str(i)
        df_eval['ID'] = str(i)+ '_' + df_eval['PeriodID'].astype(str)
        # makes sure that the match IDs are ordered from 0,1,2... with no missing values
        # this is for convenience and so it is easier to debug and follow along
        i+=1
        li.append(df_eval)
df_eval = pd.concat(li, ignore_index=True)


GermanyGhana32.csv
[6]
GermanySerbia2010.csv
[16]
NetherlandsMexico64.csv
[15]
GreeceIvoryCoast44.csv
[9]


In [43]:
df_eval

Unnamed: 0,ID,MatchID,PeriodID,Tweet
0,0_0,0,0,I Finally get to see Germany play\n#GER 🇩🇪⚽🏆
1,0_0,0,0,RT @Wor1dCup2014: If Any of the Boateng Brothe...
2,0_0,0,0,Fascinated for this #GERvsGHA match. This will...
3,0_0,0,0,: #GER and #GHA in a few.
4,0_0,0,0,BOATENG GRUDGE MATCH: 21/2 for Jermaine to sco...
...,...,...,...,...
1072923,3_129,3,129,RT @xja_mila: Ivory coast didnt fucking deserv...
1072924,3_129,3,129,If you lose because of poor conditioning and l...
1072925,3_129,3,129,"Oh dear @bobbykemp81, Spain, Italy,Bosnia & Iv..."
1072926,3_129,3,129,"Exactly ""@Naijablogger: Ivory Coast Was So clo..."


In [44]:
df_eval['Tweet'] = df_eval['Tweet'].apply(preprocess_text)

In [45]:
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df_eval['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.183428,0.351652,-0.059248,-0.182003,0.081787,0.131392,0.513264,0.108436,0.235608,0.015548,...,-0.126687,-0.086776,0.224327,-0.2981,-0.039604,-0.170743,0.212588,-0.165797,-0.058123,0.24174
1,0.252394,0.291525,-0.054652,0.019181,0.082992,-0.067042,0.106594,0.06285,0.114198,0.168639,...,-0.273192,-0.043879,0.165205,0.01279,-0.035947,-0.023603,0.090462,0.079817,-0.096872,0.275643
2,0.133759,0.340722,-0.202986,-0.155269,-0.05257,0.217507,0.175424,0.069934,0.147071,0.122774,...,-0.041979,0.041901,0.099874,-0.158076,0.03502,0.1587,0.197988,0.048215,0.172673,0.004555
3,0.132508,0.662775,0.1056,-0.12087,0.362255,-0.073177,-0.708465,0.064674,0.255179,0.3855,...,0.14236,-0.204735,0.4138,-0.16136,-0.254585,0.11611,0.463215,0.00453,-0.096614,0.37292
4,0.349205,0.474341,-0.059732,-0.133805,0.189881,0.259303,0.060578,0.032346,-0.050215,0.255715,...,-0.366065,0.063324,0.365045,-0.195428,-0.176695,-0.036868,0.357725,-0.104471,0.130819,0.210772


In [46]:
# Attach the vectors into the original dataframe
df_eval = pd.concat([df_eval, tweet_df], axis=1)

# Drop the columns that are not useful anymore
# no need for Tweet column since we have its corresponding vector embedding
df_eval.drop(columns=['Tweet'], inplace=True)


In [47]:
# by now should have df with columns: ID, match id, period id, tweet_vector. Tweet_vector is just 200 columns
df_eval

Unnamed: 0,ID,MatchID,PeriodID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0_0,0,0,0.183428,0.351652,-0.059248,-0.182003,0.081787,0.131392,0.513264,...,-0.126687,-0.086776,0.224327,-0.298100,-0.039604,-0.170743,0.212588,-0.165797,-0.058123,0.241740
1,0_0,0,0,0.252394,0.291525,-0.054652,0.019181,0.082992,-0.067042,0.106594,...,-0.273192,-0.043879,0.165205,0.012790,-0.035947,-0.023603,0.090462,0.079817,-0.096872,0.275643
2,0_0,0,0,0.133759,0.340722,-0.202986,-0.155269,-0.052570,0.217507,0.175424,...,-0.041979,0.041901,0.099874,-0.158076,0.035020,0.158700,0.197988,0.048215,0.172673,0.004555
3,0_0,0,0,0.132508,0.662775,0.105600,-0.120870,0.362255,-0.073177,-0.708465,...,0.142360,-0.204735,0.413800,-0.161360,-0.254585,0.116110,0.463215,0.004530,-0.096614,0.372920
4,0_0,0,0,0.349205,0.474341,-0.059732,-0.133805,0.189881,0.259303,0.060578,...,-0.366065,0.063324,0.365045,-0.195428,-0.176695,-0.036868,0.357725,-0.104471,0.130819,0.210772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072923,3_129,3,129,-0.105002,0.224684,0.176999,0.101739,-0.291487,-0.120343,0.380783,...,-0.104685,-0.303610,-0.033417,-0.164878,0.166354,-0.166971,0.051521,0.208779,-0.145871,0.007826
1072924,3_129,3,129,-0.108304,0.122763,0.076810,-0.143285,-0.040679,0.204895,0.330245,...,-0.083390,-0.239286,0.095482,-0.067263,0.173352,0.063049,0.196669,0.119492,0.043604,0.002617
1072925,3_129,3,129,-0.175343,0.185886,0.081908,-0.024416,-0.239233,-0.042910,0.085362,...,-0.408840,-0.105759,0.292960,-0.019169,0.108689,-0.132698,0.074461,-0.149884,0.093741,0.167117
1072926,3_129,3,129,0.027873,0.062907,0.210258,0.215266,-0.350590,-0.230121,0.302010,...,0.218694,-0.292444,-0.007068,0.008480,0.338337,0.066870,0.111968,0.039413,0.127208,-0.262923


In [48]:
# group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match
# decreases size of data + makes it easier to fit into LSTM model
df_eval = df_eval.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
df_eval.drop(columns=['ID'], inplace=True) 
df_eval['MatchID'] = df_eval['MatchID'].astype(int)
df_eval['PeriodID'] = df_eval['PeriodID'].astype(int)
# need to convert to int before sorting
df_eval.sort_values(by=['MatchID', 'PeriodID'], inplace=True)
df_eval.reset_index(drop=True, inplace=True)


In [49]:
df_eval

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.158896,0.264814,0.057981,-0.102842,0.061002,0.023203,0.131890,0.060724,...,-0.184701,-0.101496,0.081690,-0.004283,-0.057534,0.006105,0.171110,0.025386,-0.039204,0.189157
1,0,1,0.156288,0.271375,0.059343,-0.108422,0.052298,0.019057,0.119804,0.069230,...,-0.193451,-0.098110,0.085782,-0.014780,-0.065975,0.008123,0.176577,0.028697,-0.037489,0.189704
2,0,2,0.145923,0.240633,0.057680,-0.104799,0.108712,0.009395,0.081510,0.103349,...,-0.200057,-0.118543,0.082504,-0.064201,-0.029714,0.073884,0.186111,0.105346,-0.022998,0.202353
3,0,3,0.160460,0.285798,0.063682,-0.104289,0.061716,0.016656,0.126485,0.074434,...,-0.188633,-0.087426,0.102778,-0.018449,-0.064665,0.006555,0.178324,0.028487,-0.038289,0.191645
4,0,4,0.159856,0.281828,0.073446,-0.112441,0.063491,0.021445,0.110178,0.074091,...,-0.178722,-0.094382,0.093599,-0.025704,-0.076860,0.015346,0.189978,0.027995,-0.040230,0.188706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,3,125,-0.001477,0.083830,0.183564,0.011918,-0.117692,0.023970,0.170858,0.037517,...,0.002239,-0.190334,0.053429,-0.025280,0.185503,0.019539,0.151855,0.100506,0.024328,0.031229
512,3,126,0.005208,0.094197,0.180032,0.007408,-0.109134,0.021600,0.169906,0.036721,...,-0.005178,-0.184373,0.054620,-0.018582,0.174117,0.017425,0.152247,0.104245,0.018185,0.032406
513,3,127,0.014003,0.088663,0.178135,-0.009662,-0.101905,0.011872,0.172667,0.038090,...,-0.004500,-0.184957,0.047334,-0.022453,0.164805,0.017564,0.148168,0.098168,0.019318,0.023921
514,3,128,0.005228,0.095664,0.178037,-0.004261,-0.102904,0.015367,0.173290,0.034406,...,-0.006183,-0.185274,0.050824,-0.022196,0.170925,0.020117,0.147296,0.099269,0.021762,0.024964


In [50]:
# NO EVENTTYPE, CAN ONLY MAKE PREDICTIONS WITHOUT KNOWING ACCURACY

# there is no df_y when we are trying to evaluate the matches in eval_tweets
# for kaggle submission!!
# let df_y have all zeros with the same number of rows as df_X
# this is just to make code run more easily, df_y
#     and tensor_y (value returned by convert_df_to_3D_tensor) will not be used

# df_y has no real meaning, only for ease of coding!
df_y = pd.Series(0, index=df_eval.index)


In [51]:
df_y

0      0
1      0
2      0
3      0
4      0
      ..
511    0
512    0
513    0
514    0
515    0
Length: 516, dtype: int64

In [52]:
X_eval_tensor, _ = convert_df_to_3D_tensor(df_eval, df_y)
# CONVERT TO PYTORCH TENSOR
X_eval_tensor = torch.tensor(X_eval_tensor, dtype=torch.float32)


(4, 130, 200)
(4, 130)


In [53]:
X_eval_tensor

tensor([[[ 0.1589,  0.2648,  0.0580,  ...,  0.0254, -0.0392,  0.1892],
         [ 0.1563,  0.2714,  0.0593,  ...,  0.0287, -0.0375,  0.1897],
         [ 0.1459,  0.2406,  0.0577,  ...,  0.1053, -0.0230,  0.2024],
         ...,
         [ 0.1209,  0.2895,  0.1666,  ...,  0.1049, -0.0291,  0.0956],
         [ 0.1459,  0.2925,  0.1178,  ...,  0.0700, -0.0339,  0.1269],
         [ 0.1424,  0.2870,  0.1126,  ...,  0.0668, -0.0390,  0.1218]],

        [[ 0.0911,  0.2609,  0.0674,  ...,  0.1012, -0.0218,  0.0741],
         [ 0.0741,  0.2192,  0.1009,  ...,  0.1315, -0.0232,  0.0677],
         [ 0.0641,  0.2537,  0.0954,  ...,  0.0894, -0.0264,  0.0502],
         ...,
         [ 0.0968,  0.2205,  0.1096,  ...,  0.1026, -0.0453,  0.0523],
         [ 0.0904,  0.2277,  0.1050,  ...,  0.1223, -0.0478,  0.0635],
         [ 0.0932,  0.2114,  0.1104,  ...,  0.1117, -0.0403,  0.0537]],

        [[ 0.1054,  0.2370,  0.0124,  ...,  0.0596, -0.0988,  0.1500],
         [ 0.0932,  0.2318,  0.0044,  ...,  0

In [54]:
model.eval()

with torch.no_grad():
    predictions = model(X_eval_tensor)

# predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output
#print(predictions)

predicted_classes = (predictions > 0.5).float() # 0.5 is threshold
#this converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0


In [55]:
predicted_classes

tensor([[0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
  

In [56]:
predicted_classes.shape

torch.Size([4, 130])

In [57]:
# MAKE CSV OF OUTPUT WITH CORRECT MATCH IDS

In [58]:
# confirm file "our_predictions.csv" exists
# the first column is already hardcoded since it always has the same values

file_name = "our_predictions.csv"

if path.exists(file_name):
    print("File exists!")
else:
    raise ValueError(f"File '{file_name}' does not exist in the current directory.")
    

File exists!


In [59]:
# now loop through file and add predictions

In [60]:
# match_id_order[match_id]  = i means that the predictions of match_id are in the ith sequence
match_id_order

{'6': 0, '16': 1, '15': 2, '9': 3}

In [61]:

# Read the CSV file
with open(file_name, 'r') as file:
    reader = csv.reader(file)
    rows = list(reader)

# add the prediction to each row

for i in range(1,len(rows)): # skip first row: ID,EventType
    row = rows[i]
    # row[0] is first column: ID: matchID_periodID
    # row[1] is second column: EventType, which we want to write with the prediction
    
    match_id, period_id = row[0].split("_")
    
    prediction = predicted_classes[match_id_order[match_id]][int(period_id)]
    print(f"{match_id} and {period_id} has prediction: {prediction}")
    row[1] = float(prediction)

# write the modified data back to the CSV file
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)

6 and 0 has prediction: 0.0
6 and 1 has prediction: 0.0
6 and 2 has prediction: 0.0
6 and 3 has prediction: 0.0
6 and 4 has prediction: 0.0
6 and 5 has prediction: 1.0
6 and 6 has prediction: 1.0
6 and 7 has prediction: 1.0
6 and 8 has prediction: 1.0
6 and 9 has prediction: 1.0
6 and 10 has prediction: 1.0
6 and 11 has prediction: 1.0
6 and 12 has prediction: 0.0
6 and 13 has prediction: 0.0
6 and 14 has prediction: 0.0
6 and 15 has prediction: 0.0
6 and 16 has prediction: 0.0
6 and 17 has prediction: 0.0
6 and 18 has prediction: 0.0
6 and 19 has prediction: 0.0
6 and 20 has prediction: 0.0
6 and 21 has prediction: 0.0
6 and 22 has prediction: 0.0
6 and 23 has prediction: 0.0
6 and 24 has prediction: 0.0
6 and 25 has prediction: 0.0
6 and 26 has prediction: 0.0
6 and 27 has prediction: 1.0
6 and 28 has prediction: 1.0
6 and 29 has prediction: 1.0
6 and 30 has prediction: 0.0
6 and 31 has prediction: 0.0
6 and 32 has prediction: 0.0
6 and 33 has prediction: 0.0
6 and 34 has prediction:

In [62]:
# our_predictions.csv contains the predictions!
# our_predictions.csv NEEDS TO BE IN CURRENT DIRECTORY WITH FIRST ROW AND FIRST COLUMN WITH EXPECTED VALUES
#     THIS IS WHY our_predictions.csv is added to github repo, it is needed to run the code
# DONE!!!

In [63]:
# NOTES
# HOW TO MAKE SURE THAT we:
# 1. DO NOT ignore the order of the tweets -> (LSTM)
# 2. treat each time period as RELATED to the football match they belong to -> treat each match as a sequence, train LSTM on every sequence
#                      since pytorch tensor expects multiple sequences (batches)



# for LSTM: Each input sequence consists of tweet embeddings (200 dimensional) from a specific match, ordered by Period ID.
#   tweets of different matches are unrelated, but tweets of a same match are related sequentially (chronologically)