# Import libraries and model

In [1]:
from os import listdir, path
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gc
import gensim.downloader as api

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from math import ceil
import torch.optim as optim
import csv

nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/infres/kbrowder-24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/infres/kbrowder-24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing

In [2]:
# Read all training files and concatenate them into one dataframe

#import os
#print(os.getcwd())

li = []
i = 0
for filename in listdir("train_tweets"):
    if filename != '.ipynb_checkpoints':
        print(filename)
        df = pd.read_csv("train_tweets/" + filename)
        df.drop(columns=['Timestamp'], inplace=True)
        # drop unused column(s)
        df['MatchID'] = str(i)
        df['ID'] = str(i)+ '_' + df['PeriodID'].astype(str)
        # makes sure that the match IDs are ordered from 0,1,2... with no missing values
        # this is for convenience and so it is easier to debug and follow along
        i+=1
        li.append(df)
df = pd.concat(li, ignore_index=True)
#print(len(df))
df

USASlovenia2010.csv
ArgentinaBelgium72.csv
AustraliaSpain34.csv
ArgentinaGermanyFinal77.csv
AustraliaNetherlands29.csv
BelgiumSouthKorea59.csv
HondurasSwitzerland54.csv
FranceGermany70.csv
GermanyBrazil74.csv
GermanyUSA57.csv
MexicoCroatia37.csv
FranceNigeria66.csv
CameroonBrazil36.csv
NetherlandsChile35.csv
PortugalGhana58.csv
GermanyAlgeria67.csv


Unnamed: 0,ID,MatchID,PeriodID,EventType,Tweet
0,0_0,0,0,0,#USA All My Stateside Followers Stand Up And R...
1,0_0,0,0,0,@Lynz_89 I think the ref might have been Basil...
2,0_0,0,0,0,Hoping a #USA win can help ease the pain of la...
3,0_0,0,0,0,When does this actually start? #worldcup
4,0_0,0,0,0,Hanson and Roy are a proper pundit line up. #w...
...,...,...,...,...,...
5056045,15_169,15,169,0,RT @FOXSoccer: 3/4 of the #WorldCup quarterfin...
5056046,15_169,15,169,0,RT @Rodolph_hilal: Plz guys RETWEET .. \n\nLet...
5056047,15_169,15,169,0,RT @Joey7Barton: Algeria can take a lot of pos...
5056048,15_169,15,169,0,"RT @caughtoffside: #ALG gave it their all, was..."


In [3]:
# Preprocessing of tweet
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [4]:
# Apply preprocessing to each tweet
df['Tweet'] = df['Tweet'].apply(preprocess_text)
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,Tweet
0,0_0,0,0,0,usa stateside follower stand represent beautif...
1,0_0,0,0,0,lynz_ think ref might basil fawlty actually wo...
2,0_0,0,0,0,hoping usa win help ease pain last night loss ...
3,0_0,0,0,0,actually start worldcup
4,0_0,0,0,0,hanson roy proper pundit line worldcup
...,...,...,...,...,...
5056045,15_169,15,169,0,rt foxsoccer worldcup quarterfinal set ger v f...
5056046,15_169,15,169,0,rt rodolph_hilal plz guy retweet let trend alg...
5056047,15_169,15,169,0,rt joeybarton algeria take lot positive big bi...
5056048,15_169,15,169,0,rt caughtoffside alg gave wasnt enough heart t...


# Tweet Embeddings

In [5]:
# Get vector tweet embeddings
# TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
#   embeddings to retain more information
#   -> can try more complex functions here
#   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [6]:
# Crashes after using all available RAM :( on google colab
# 

# obtain vector tweet embeddings
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.066179,0.256214,0.080828,-0.339078,0.017108,0.146547,0.08749,0.158443,0.24754,-0.031605,...,-0.215871,-0.080837,0.073679,0.022517,0.070002,0.291239,0.07973,-0.014516,-0.047773,-0.077725
1,-0.166161,0.393736,0.10649,0.004497,-0.155041,0.039733,0.366459,-0.155454,0.209879,0.128544,...,-0.153934,-0.047368,0.195058,0.023768,0.040157,-0.003317,0.337965,0.331624,-0.117682,0.218918
2,0.223382,0.272528,0.071598,-0.293135,-0.021328,0.048036,0.38369,-0.006551,0.172468,0.059215,...,-0.318161,-0.020952,0.18358,-0.087949,0.183349,-0.040151,0.276681,0.072967,0.056575,-0.014119
3,-0.108951,0.288303,0.409793,-0.22219,-0.159377,0.224041,0.82633,0.065553,0.261637,-0.148706,...,-0.154402,-0.185433,-0.003813,0.109507,-0.344327,-0.001267,0.045143,0.43395,0.021387,0.35958
4,-0.112306,0.114112,0.246104,-0.454574,-0.059892,-0.105618,0.328258,-0.055273,0.075983,-0.211882,...,-0.133122,-0.091702,-0.027074,0.077817,0.030298,0.059711,0.367403,0.361568,-0.169055,0.064508


In [7]:
# Attach the vectors into the original dataframe
df = pd.concat([df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
# no need for Tweet column since we have its corresponding vector embedding
df.drop(columns=['Tweet'], inplace=True)



In [8]:
# by now should have df with columns: ID, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,0_0,0,0,0,0.066179,0.256214,0.080828,-0.339078,0.017108,0.146547,...,-0.215871,-0.080837,0.073679,0.022517,0.070002,0.291239,0.079730,-0.014516,-0.047773,-0.077725
1,0_0,0,0,0,-0.166161,0.393736,0.106490,0.004497,-0.155041,0.039733,...,-0.153934,-0.047368,0.195058,0.023768,0.040157,-0.003317,0.337965,0.331624,-0.117682,0.218918
2,0_0,0,0,0,0.223382,0.272528,0.071598,-0.293135,-0.021328,0.048036,...,-0.318161,-0.020952,0.183580,-0.087949,0.183349,-0.040151,0.276681,0.072967,0.056575,-0.014119
3,0_0,0,0,0,-0.108951,0.288303,0.409793,-0.222190,-0.159377,0.224041,...,-0.154402,-0.185433,-0.003813,0.109507,-0.344327,-0.001267,0.045143,0.433950,0.021387,0.359580
4,0_0,0,0,0,-0.112306,0.114112,0.246104,-0.454574,-0.059892,-0.105618,...,-0.133122,-0.091702,-0.027074,0.077817,0.030298,0.059711,0.367403,0.361568,-0.169055,0.064508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5056045,15_169,15,169,0,0.130473,0.285121,0.018671,-0.052116,0.028197,-0.112828,...,-0.167767,-0.057690,0.198401,0.020693,-0.162055,0.049583,0.192499,0.099254,-0.090062,0.111031
5056046,15_169,15,169,0,0.307439,0.368945,-0.172900,-0.276857,-0.122469,-0.301321,...,-0.024198,0.115410,0.005415,0.049138,-0.168726,-0.071524,0.103462,-0.025341,0.071714,-0.055731
5056047,15_169,15,169,0,0.147703,0.201878,-0.126858,-0.095563,-0.061741,-0.022171,...,-0.218651,-0.086297,0.081748,-0.153307,-0.026059,-0.072721,0.151214,-0.086326,-0.116747,-0.127796
5056048,15_169,15,169,0,0.016947,0.284084,0.080437,0.101969,0.009306,0.078792,...,-0.174617,0.055047,-0.047173,-0.010757,0.057742,-0.077205,0.082589,0.080722,0.094374,-0.039958


In [9]:
# group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match
# decreases size of data + makes it easier to fit into LSTM model
df = df.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
df.drop(columns=['ID'], inplace=True) 
df['MatchID'] = df['MatchID'].astype(int)
df['PeriodID'] = df['PeriodID'].astype(int)
# need to convert to int before sorting
df.sort_values(by=['MatchID', 'PeriodID'], inplace=True)
df.reset_index(drop=True, inplace=True)


In [10]:
df

Unnamed: 0,MatchID,PeriodID,EventType,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.0,0.090203,0.226362,0.095120,-0.143475,-0.008777,0.073592,0.129656,...,-0.155071,-0.113961,0.039491,0.007716,-0.090425,0.064496,0.190450,0.081906,-0.048242,0.042346
1,0,1,0.0,0.118030,0.224145,0.107249,-0.148456,0.035647,0.056143,0.091818,...,-0.170008,-0.123982,0.030933,0.027041,-0.104665,0.091441,0.200424,0.094985,-0.046679,0.046522
2,0,2,0.0,0.104254,0.217891,0.119842,-0.180029,0.018067,0.066002,0.102639,...,-0.157436,-0.131377,0.031544,0.015286,-0.106139,0.086386,0.186296,0.089349,-0.038044,0.037974
3,0,3,1.0,0.114958,0.220945,0.118956,-0.157640,0.012345,0.065088,0.083313,...,-0.163629,-0.106229,0.030797,0.031875,-0.086206,0.074843,0.224553,0.086549,-0.057588,0.016611
4,0,4,0.0,0.110193,0.228858,0.112959,-0.162340,0.002300,0.080856,0.127424,...,-0.167278,-0.100561,0.039007,0.009540,-0.085179,0.064918,0.214720,0.055244,-0.040498,0.037002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,165,1.0,0.128285,0.239256,0.045888,-0.057041,-0.073387,0.057296,0.131580,...,-0.199651,-0.202325,0.204370,-0.035747,-0.076555,0.004716,0.133694,0.019453,-0.054738,0.077643
2133,15,166,1.0,0.131026,0.239009,0.049180,-0.061535,-0.069464,0.056230,0.132437,...,-0.198245,-0.194118,0.195637,-0.037976,-0.078846,0.005053,0.127778,0.019744,-0.044675,0.080291
2134,15,167,1.0,0.126997,0.241214,0.047327,-0.051941,-0.067960,0.049733,0.121557,...,-0.194344,-0.187726,0.185099,-0.032484,-0.082734,0.008448,0.126175,0.020067,-0.047074,0.075651
2135,15,168,1.0,0.128382,0.245466,0.043619,-0.055429,-0.068091,0.038901,0.120740,...,-0.192710,-0.192152,0.190167,-0.034960,-0.087215,0.007034,0.127286,0.021578,-0.048067,0.075383


# Separate Train and Test data

In [11]:
# train on of the first 13 of 16 matches (16*0.8=12.8~=13)
# and the test data would be the last 3 matches. 
# Before submitting on Kaggle we should train on full dataset, so al 16 matches
train_percentage = 0.8
unique_match_ids = df['MatchID'].unique()
print(unique_match_ids)
num_matches_training = int(ceil(len(unique_match_ids)*train_percentage))
print(num_matches_training)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
13


In [12]:
target_match_id = num_matches_training
# target_match_id is first match id that will appear in test set
# all matches from target_match_id and after will be in test test
print(target_match_id)

13


In [13]:

#df2 = df['MatchID'] == 15
#df2

In [14]:
# row_index is first row with match id target_match_id
# row_index is then the first row of the matches that will go to the test


row_index = (df['MatchID'] == target_match_id).idxmax()
#row_index = df[df['MatchID'] == target_match_id].first_valid_index()
df_X_train = df[:row_index].copy()
df_X_test = df[row_index:].copy()


In [15]:
df_y_train = df_X_train['EventType']
df_y_test = df_X_test['EventType']

In [16]:
df_y_train

0       0.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
1702    1.0
1703    1.0
1704    1.0
1705    1.0
1706    1.0
Name: EventType, Length: 1707, dtype: float64

In [17]:
df_y_test.reset_index(drop=True, inplace=True)
df_y_test

0      0.0
1      0.0
2      1.0
3      1.0
4      1.0
      ... 
425    1.0
426    1.0
427    1.0
428    1.0
429    0.0
Name: EventType, Length: 430, dtype: float64

In [18]:
df_X_train.drop(['EventType'], axis=1, inplace=True)
df_X_test.drop(['EventType'], axis=1, inplace=True)

In [19]:
df_X_train

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.090203,0.226362,0.095120,-0.143475,-0.008777,0.073592,0.129656,0.083900,...,-0.155071,-0.113961,0.039491,0.007716,-0.090425,0.064496,0.190450,0.081906,-0.048242,0.042346
1,0,1,0.118030,0.224145,0.107249,-0.148456,0.035647,0.056143,0.091818,0.094932,...,-0.170008,-0.123982,0.030933,0.027041,-0.104665,0.091441,0.200424,0.094985,-0.046679,0.046522
2,0,2,0.104254,0.217891,0.119842,-0.180029,0.018067,0.066002,0.102639,0.092920,...,-0.157436,-0.131377,0.031544,0.015286,-0.106139,0.086386,0.186296,0.089349,-0.038044,0.037974
3,0,3,0.114958,0.220945,0.118956,-0.157640,0.012345,0.065088,0.083313,0.099258,...,-0.163629,-0.106229,0.030797,0.031875,-0.086206,0.074843,0.224553,0.086549,-0.057588,0.016611
4,0,4,0.110193,0.228858,0.112959,-0.162340,0.002300,0.080856,0.127424,0.115216,...,-0.167278,-0.100561,0.039007,0.009540,-0.085179,0.064918,0.214720,0.055244,-0.040498,0.037002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1702,12,125,0.016553,0.207181,0.122646,-0.185391,0.059920,-0.026130,-0.179838,-0.027683,...,-0.045443,-0.095434,-0.061143,-0.028591,-0.127751,0.064417,0.204314,0.123278,-0.089490,0.131382
1703,12,126,0.006991,0.221904,0.114750,-0.203162,0.061973,-0.014181,-0.184535,-0.022929,...,-0.040919,-0.086301,-0.054943,-0.032487,-0.135607,0.093646,0.199398,0.134367,-0.096521,0.150498
1704,12,127,0.009623,0.228086,0.106309,-0.203891,0.064704,-0.010036,-0.195334,-0.023981,...,-0.041586,-0.086463,-0.052670,-0.027168,-0.138804,0.098265,0.199657,0.137645,-0.097100,0.154269
1705,12,128,0.012392,0.215962,0.115303,-0.207024,0.056991,-0.013960,-0.184343,-0.023527,...,-0.039392,-0.075463,-0.052215,-0.028925,-0.147637,0.102975,0.191022,0.128611,-0.108194,0.152573


In [20]:
df_X_test.reset_index(drop=True, inplace=True)
df_X_test

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,13,0,0.158481,0.257868,-0.022338,-0.093147,-0.002405,-0.030854,-0.053274,0.096762,...,-0.180573,-0.037701,0.020883,0.033740,-0.065151,-0.008039,0.170601,0.045094,-0.108414,0.123681
1,13,1,0.155648,0.253481,-0.016269,-0.097016,-0.005047,-0.027279,-0.054086,0.096198,...,-0.181899,-0.035467,0.007620,0.041171,-0.062731,-0.005319,0.166944,0.046265,-0.110438,0.122801
2,13,2,0.151312,0.250384,-0.014702,-0.093926,-0.006961,-0.018035,-0.050121,0.088727,...,-0.182366,-0.030688,0.019412,0.037483,-0.067257,-0.002014,0.174397,0.043693,-0.105461,0.117063
3,13,3,0.155171,0.225126,-0.012214,-0.098732,-0.018331,0.003354,-0.030645,0.074597,...,-0.179215,-0.015111,0.020229,0.038932,-0.073913,-0.019716,0.184073,0.042528,-0.106684,0.117708
4,13,4,0.142500,0.219101,-0.000908,-0.106087,-0.024891,0.008740,-0.063446,0.073259,...,-0.174396,-0.017677,0.015991,0.036385,-0.078154,-0.008176,0.196789,0.049799,-0.107582,0.124269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,15,165,0.128285,0.239256,0.045888,-0.057041,-0.073387,0.057296,0.131580,0.057945,...,-0.199651,-0.202325,0.204370,-0.035747,-0.076555,0.004716,0.133694,0.019453,-0.054738,0.077643
426,15,166,0.131026,0.239009,0.049180,-0.061535,-0.069464,0.056230,0.132437,0.062405,...,-0.198245,-0.194118,0.195637,-0.037976,-0.078846,0.005053,0.127778,0.019744,-0.044675,0.080291
427,15,167,0.126997,0.241214,0.047327,-0.051941,-0.067960,0.049733,0.121557,0.055224,...,-0.194344,-0.187726,0.185099,-0.032484,-0.082734,0.008448,0.126175,0.020067,-0.047074,0.075651
428,15,168,0.128382,0.245466,0.043619,-0.055429,-0.068091,0.038901,0.120740,0.060496,...,-0.192710,-0.192152,0.190167,-0.034960,-0.087215,0.007034,0.127286,0.021578,-0.048067,0.075383


In [21]:
# now df_X_train and df_X_test should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
# df_y_train and df_y_test should have 1 column, EventType
# the matchids are grouped together so all the rows of the same
# match ids are grouped next to each other, and the periodID are ordered chronologically.

In [22]:
# now we have df_X_train, df_X_test, df_y_train, df_y_test
# we no longer need df so we should free up the memory
del df  # remove reference to the original DataFrame
gc.collect()  # force garbage collection to free up memory

0

In [23]:
max_periods = df_X_train.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods
# as we can see not every match has the same number of periods!

Unnamed: 0,MatchID,PeriodID
0,0,129
1,1,129
2,2,129
3,3,179
4,4,96
5,5,129
6,6,129
7,7,129
8,8,129
9,9,129


In [24]:
max_periods = df_X_test.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods

Unnamed: 0,MatchID,PeriodID
0,13,129
1,14,129
2,15,169


# Format Train and Test data for PyTorch LSTM

In [25]:
# input tensor for a PyTorch LSTM should have the shape of (when setting batch_first=True)
# (batch_size, seq_len, num_features) when using the batch_first=True parameter
# batch_size is number of sequences processed at once

# TRY WITHOUT SLIDING WINDOW APPROACH
#    which would mean batch size = number of matches
#    much easier to format for LSTM as 3D tensor
#    dimension of 3D tensor with batch_first=True:(batch_size = num_matches, seq_len = num_periods, num _features = 200)
#    (match_id, period_id, num_features=200)
#     not every match has the same number of periods!, so seq_len can vary between different matches
#     fix: pad with zeroes
# we want X_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector


In [26]:
def convert_df_to_3D_tensor(df_X, df_y):
    # df_X should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
    # rows with same matchID should be grouped together (adjacent rows)
    # df_y should have one column (the EventType)
    # returns tensor_X numpy array already padded! shape: (num_matches, max_num_periods, num _features = 200)
    # and tensor_y of shape: (num_matches, max_num_periods) 
    
    num_matches = len(df_X['MatchID'].unique())
    max_periods = df_X.groupby('MatchID')['PeriodID'].max().reset_index()
    total_max_period = max_periods['PeriodID'].max()
    #total_max_period is max seq len

    tensor_X = np.zeros((num_matches, total_max_period+1, 200))

    tensor_y = np.zeros((num_matches, total_max_period+1))
    print(tensor_X.shape)
    print(tensor_y.shape)
    
    i=0
    previous_match_id = df_X['MatchID'][0]
    for row_index, row in df_X.iterrows():
        match_id = int(row['MatchID'])

        if match_id != previous_match_id:
            i+=1
            previous_match_id = match_id
        
        period_id = int(row['PeriodID'])
        
        features = row[2:].values  # Skip MatchID and PeriodID
        tensor_X[i, period_id, :] = features
        tensor_y[i,period_id] = df_y[row_index]
        
    return tensor_X, tensor_y

In [27]:
# SCALING MIGHT BE UNNECESSARY SINCE OUTPUT OF GLOVE TWEET 200 IS ALREADY SCALED BETWEEN -1 AND 1
#scaler = MinMaxScaler()
#tensor = scaler.fit_transform(tensor)

X_train_tensor, y_train_tensor = convert_df_to_3D_tensor(df_X_train, df_y_train)
# X_train_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector
# y_train_tensor[match_id][period_id] to return corresponding EventType (1 or 0)
# match_id index starts at 0 even if first match in df doesnt have match id 0
#X_train_tensor[12][175]
#X_train_tensor[12][179]
#X_train_tensor[2][129]

# CONVERT TO PYTORCH TENSOR
X_train_tensor = torch.tensor(X_train_tensor, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_tensor, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)
# X_train_tensor, y_train_tensor are now pytorch tensors

(13, 180, 200)
(13, 180)
torch.Size([13, 180, 200])
torch.Size([13, 180])


In [28]:
# to use pack_padded_sequence for variable length sequences
# torch.nn.utils.rnn.pack_padded_sequence. This allows the model to ignore the padded values during computation.
max_periods = df_X_train.groupby('MatchID')['PeriodID'].max().reset_index()
X_train_seq_lengths = (max_periods['PeriodID']+1).tolist() # add +1 since max period ID + 1 is the seq len
X_train_seq_lengths = torch.tensor(X_train_seq_lengths)
X_train_seq_lengths

tensor([130, 130, 130, 180,  97, 130, 130, 130, 130, 130, 130, 130, 130])

In [29]:
# convert df_X_test and df_y_test to correct format/dimensions
X_test_tensor, y_test_tensor = convert_df_to_3D_tensor(df_X_test, df_y_test)
# CONVERT TO PYTORCH TENSOR
X_test_tensor = torch.tensor(X_test_tensor, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_tensor, dtype=torch.float32)

(3, 170, 200)
(3, 170)


In [30]:
max_periods = df_X_test.groupby('MatchID')['PeriodID'].max().reset_index()
X_test_seq_lengths = (max_periods['PeriodID']+1).tolist() # add +1 since max period ID + 1 is the seq len
X_test_seq_lengths = torch.tensor(X_test_seq_lengths)
X_test_seq_lengths

tensor([130, 130, 170])

# LSTM Model

In [31]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        # Use PackedSequence, since # of periods in matches (seq len) varies in between matches
        # so LSTM only processes the actual sequence content, ignoring padded values
        # prevents model from learning patterns from padding (noise)        
        
        # Pack the input
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        
        # Process with LSTM
        packed_output, _ = self.lstm(packed_input)
        
        # Unpack the output
        lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True)

        out = self.fc(lstm_out)
        out = self.sigmoid(out) # applying sigmoid to convert to probabilities

        return out.squeeze(-1)


def get_loss_criterion():
    ###### reduction = 'none' needed to only calculate loss on non-padded values using mask
    return nn.BCELoss(reduction='none') # BCE great for binary classification

def get_optimizer(lr, model):
    #optim.NAdam(model.parameters(), lr=lr)
    #return optim.Adam(model.parameters(), lr=lr)
    return optim.AdamW(model.parameters(), lr=lr)

# Useful functions

In [32]:
def train_model(model, optimizer, criterion, X, y, seq_lengths, num_epochs, verbose=True, device='cpu'):
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X, seq_lengths)
        #print(f"shape of outputs: {outputs.shape}")
        #break
    
        loss = criterion(outputs, y) # apply loss to padded values also

        # Use the mask to compute loss only on non-padded elements
        # so only non-padded elements contribute to the loss
        mask = (torch.arange(outputs.size(1)).unsqueeze(0) < seq_lengths.unsqueeze(1)).float().to(device)
        #print(f"Mask shape: {mask.shape}")
        #print(mask[10][97:])
        
        # mask has shape [num_sequences, max_seq_length], with 1s for valid positions and 0s for padded positions in each sequence.

    
        #print(f"Loss shape: {loss.shape}")
        # set loss of padded values to zero using mask!
        # also normalize loss by number of valid (non-padded elements)
        loss = (loss * mask).sum() / mask.sum() 
        #print(loss.shape)
        loss.backward()
        optimizer.step()
         
        
        if verbose and (epoch % 100 == 0 or epoch == num_epochs-1):
            print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")
        
        #if loss.item()<0.1: # 0.1 is threshold, stop training when loss is this small
        #    break

    #print("Model is trained!")
    return

In [33]:
def get_loss(predictions, y_test_tensor, X_test_seq_lengths, criterion):
    # NOT USED BUT COULD BE USEFUL
    loss = criterion(predictions, y_test_tensor) # use predictions for loss calculation

    mask = (torch.arange(predictions.size(1)).unsqueeze(0) < X_test_seq_lengths.unsqueeze(1)).float()
    print(mask)
    loss = (loss * mask).sum() / mask.sum() 

    print(f"Binary Cross-Entropy Loss: {loss.item():.4f}")
    return loss

def get_accuracy(y_true, y_pred):
    if y_true.dtype != y_pred.dtype or y_true.shape != y_pred.shape:
        raise ValueError(f"Inputs do not have same type or shape!")
    correct_predictions = (y_true == y_pred).sum().item()
    total_predictions = y_true.numel()
    accuracy = correct_predictions / total_predictions * 100
    return accuracy


def predict(model, X, seq_lengths, threshold=0.5):
    model.eval()
    with torch.no_grad():
        predictions = model(X, seq_lengths)

    # predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output
    #print(predictions)

    
    predicted_classes = (predictions > threshold).float() 
    #this converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0
    
    #print(predicted_classes)
    return predicted_classes

# Find best hyperparameters

In [35]:
gpu = torch.device('cuda:1')

In [36]:
dropout_rate = 0.2 # can tune
num_epochs = 400 # can tune
lr = 0.005 # can tune
def find_best_hyperparameters(params):
    # input: list of tuples (hidden_size, num_layers)
    # returns tuple (hidden_size, num_layers) with highest accuracy on test data
    # since prediction accuracy over multiple runs of the same hidden_size, num_layer
    #     varies, we take the avg over multiple runs
    accuracies = []
    num_runs = 3 # can tune
    
    for hidden_size, num_layers in params:
        temp_accuracies = [] 
        # temp_accuracies:
        #     holds all (num_runs) accuracies of current param tuple, to take avg of later
        
        criterion = get_loss_criterion()
        criterion = criterion.to(gpu)
        print(f"hidden_size: {hidden_size}, num_layers: {num_layers}")
        for r in range(num_runs):
            # Dropout introduces randomness in both training and evaluation phases
            # so we should retrain and retest for every run

            model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
            model = model.to(gpu)
            optimizer = get_optimizer(lr, model)
            
            # train on training data
            train_model(model, optimizer, criterion, X_train_tensor.to(gpu), y_train_tensor.to(gpu), X_train_seq_lengths, num_epochs, verbose=True, device=gpu)
            # predict on test data
            predicted_classes = predict(model, X_test_tensor.to(gpu), X_test_seq_lengths)
            # get accuracy on test data
            accuracy = get_accuracy(y_test_tensor.to(gpu), predicted_classes)
            temp_accuracies.append(accuracy)
            print(f"\t-> accuracy of run #{r}: {accuracy}")
        accuracy = sum(temp_accuracies)/len(temp_accuracies)
        # overall accuracy of a param tuple is the avg of the accuracy over multiple runs
        print(f"\tavg accuracy: {accuracy}")
        
        accuracies.append(accuracy)
    
    # FIND INDEX OF HIGHEST ACCURACY AND RETURN CORRESPONDONG HIDDENS SIZE NUM LAYER TUPLE
    max_accuracy_index = accuracies.index(max(accuracies))
    return params[max_accuracy_index], max(accuracies)
    
possible_params = [(32, 2), (64, 2), (128, 2), (256, 2), 
                  (32, 3), (64, 3), (128, 3), (256, 3), 
                  (32, 4), (64, 4), (128, 4), (256, 4)]
(best_hidden_size, best_num_layers), max_accuracy = find_best_hyperparameters(possible_params)
print(f"\nBest hidden size: {best_hidden_size}, Best num layers: {best_num_layers} with accuracy of {max_accuracy}")

hidden_size: 32, num_layers: 2
Epoch [0/400], Loss: 0.7010
Epoch [100/400], Loss: 0.4067
Epoch [200/400], Loss: 0.2976
Epoch [300/400], Loss: 0.2023
Epoch [399/400], Loss: 0.1418
	-> accuracy of run #0: 73.92156862745098
Epoch [0/400], Loss: 0.7041
Epoch [100/400], Loss: 0.4073
Epoch [200/400], Loss: 0.2992
Epoch [300/400], Loss: 0.2054
Epoch [399/400], Loss: 0.1223
	-> accuracy of run #1: 74.31372549019608
Epoch [0/400], Loss: 0.6923
Epoch [100/400], Loss: 0.3958
Epoch [200/400], Loss: 0.3097
Epoch [300/400], Loss: 0.2054
Epoch [399/400], Loss: 0.1592
	-> accuracy of run #2: 73.13725490196077
	avg accuracy: 73.79084967320262
hidden_size: 64, num_layers: 2
Epoch [0/400], Loss: 0.7011
Epoch [100/400], Loss: 0.3640
Epoch [200/400], Loss: 0.2275
Epoch [300/400], Loss: 0.3644
Epoch [399/400], Loss: 0.0754
	-> accuracy of run #0: 71.76470588235294
Epoch [0/400], Loss: 0.6859
Epoch [100/400], Loss: 0.3637
Epoch [200/400], Loss: 0.2016
Epoch [300/400], Loss: 0.0740
Epoch [399/400], Loss: 0.02

# For Kaggle Submission

In [37]:
# RETRAIN MODEL ON ENTIRE TRAINING DATA AND EVALUATE EVAL TWEETS


df_X = pd.concat([df_X_train, df_X_test], ignore_index=True)
df_y = pd.concat([df_y_train, df_y_test], ignore_index=True)
print(df_X['MatchID'].unique())
print(df_X.shape)
print(df_y.shape)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
(2137, 202)
(2137,)


In [38]:
df_X

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.090203,0.226362,0.095120,-0.143475,-0.008777,0.073592,0.129656,0.083900,...,-0.155071,-0.113961,0.039491,0.007716,-0.090425,0.064496,0.190450,0.081906,-0.048242,0.042346
1,0,1,0.118030,0.224145,0.107249,-0.148456,0.035647,0.056143,0.091818,0.094932,...,-0.170008,-0.123982,0.030933,0.027041,-0.104665,0.091441,0.200424,0.094985,-0.046679,0.046522
2,0,2,0.104254,0.217891,0.119842,-0.180029,0.018067,0.066002,0.102639,0.092920,...,-0.157436,-0.131377,0.031544,0.015286,-0.106139,0.086386,0.186296,0.089349,-0.038044,0.037974
3,0,3,0.114958,0.220945,0.118956,-0.157640,0.012345,0.065088,0.083313,0.099258,...,-0.163629,-0.106229,0.030797,0.031875,-0.086206,0.074843,0.224553,0.086549,-0.057588,0.016611
4,0,4,0.110193,0.228858,0.112959,-0.162340,0.002300,0.080856,0.127424,0.115216,...,-0.167278,-0.100561,0.039007,0.009540,-0.085179,0.064918,0.214720,0.055244,-0.040498,0.037002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,15,165,0.128285,0.239256,0.045888,-0.057041,-0.073387,0.057296,0.131580,0.057945,...,-0.199651,-0.202325,0.204370,-0.035747,-0.076555,0.004716,0.133694,0.019453,-0.054738,0.077643
2133,15,166,0.131026,0.239009,0.049180,-0.061535,-0.069464,0.056230,0.132437,0.062405,...,-0.198245,-0.194118,0.195637,-0.037976,-0.078846,0.005053,0.127778,0.019744,-0.044675,0.080291
2134,15,167,0.126997,0.241214,0.047327,-0.051941,-0.067960,0.049733,0.121557,0.055224,...,-0.194344,-0.187726,0.185099,-0.032484,-0.082734,0.008448,0.126175,0.020067,-0.047074,0.075651
2135,15,168,0.128382,0.245466,0.043619,-0.055429,-0.068091,0.038901,0.120740,0.060496,...,-0.192710,-0.192152,0.190167,-0.034960,-0.087215,0.007034,0.127286,0.021578,-0.048067,0.075383


In [39]:
df_y

0       0.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
2132    1.0
2133    1.0
2134    1.0
2135    1.0
2136    0.0
Name: EventType, Length: 2137, dtype: float64

In [40]:
# convert df_X_test and df_y_test to correct format/dimensions
X_tensor, y_tensor = convert_df_to_3D_tensor(df_X, df_y)
# CONVERT TO PYTORCH TENSOR
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor, dtype=torch.float32)

(16, 180, 200)
(16, 180)


In [41]:
print(X_tensor.shape)
print(y_tensor.shape)

torch.Size([16, 180, 200])
torch.Size([16, 180])


In [42]:
max_periods = df_X.groupby('MatchID')['PeriodID'].max().reset_index()
X_seq_lengths = (max_periods['PeriodID']+1).tolist() # add +1 since max period ID + 1 is the seq len
X_seq_lengths = torch.tensor(X_seq_lengths)
X_seq_lengths

tensor([130, 130, 130, 180,  97, 130, 130, 130, 130, 130, 130, 130, 130, 130,
        130, 170])

In [43]:
# Best hidden size: 256, Best num layers: 2 with accuracy of 68.07692307692308
#best_hidden_size = 32
#best_num_layers = 3
# these seem to work very well!

In [44]:
# retrain model on all 16 matches (with best hyper parameters found above)
print(f"{best_hidden_size}, {best_num_layers}")
model = LSTMModel(input_size=200, hidden_size=best_hidden_size, num_layers=best_num_layers, dropout_rate=dropout_rate)
optimizer = get_optimizer(lr, model)
criterion = get_loss_criterion()


train_model(model, optimizer, criterion, X_tensor, y_tensor, X_seq_lengths, num_epochs=600)

32, 2
Epoch [0/600], Loss: 0.6903
Epoch [100/600], Loss: 0.3750
Epoch [200/600], Loss: 0.2747
Epoch [300/600], Loss: 0.1968
Epoch [400/600], Loss: 0.1489
Epoch [500/600], Loss: 0.0960
Epoch [599/600], Loss: 0.0749


In [45]:
# READ EVAL_TWEETS AND PREPROCESS DATA

# Read all eval files and concatenate them into one dataframe

li = []
i = 0
match_id_order = {}
for filename in listdir("eval_tweets"):
    if filename != '.ipynb_checkpoints':
        print(filename)
        df_eval = pd.read_csv("eval_tweets/" + filename)
        df_eval.drop(columns=['Timestamp'], inplace=True)
        # drop unused column(s)
        print(df_eval['MatchID'].unique())
        match_id = str(df_eval['MatchID'].unique()[0])
        match_id_order[match_id] = i
        # match_id_order[match_id]  = i means that the predictions of match_id are in the ith sequence
        df_eval['MatchID'] = str(i)
        df_eval['ID'] = str(i)+ '_' + df_eval['PeriodID'].astype(str)
        # makes sure that the match IDs are ordered from 0,1,2... with no missing values
        # this is for convenience and so it is easier to debug and follow along
        i+=1
        li.append(df_eval)
df_eval = pd.concat(li, ignore_index=True)


GreeceIvoryCoast44.csv
[9]
NetherlandsMexico64.csv
[15]
GermanyGhana32.csv
[6]
GermanySerbia2010.csv
[16]


In [46]:
df_eval

Unnamed: 0,ID,MatchID,PeriodID,Tweet
0,0_0,0,0,Wana place a bet on an ivory coast win but ain...
1,0_0,0,0,#WatchLive @FIFAWorldCup: Greece vs. Ivory Coa...
2,0_0,0,0,Gonna watch the Colombia and Japan game becaus...
3,0_0,0,0,#CIV vs #COL & #GRE vs #JPN! It would be inter...
4,0_0,0,0,RT @DuncanCastles: Quite a statistic this: Gre...
...,...,...,...,...
1072923,3_129,3,129,LETS GO #USA #worldcup
1072924,3_129,3,129,another upset in #WC2010 #Srb beat #Ger by 1-0 !!
1072925,3_129,3,129,RT @FIFAcom: #GER 0:1 #SRB: TheÂ finalÂ whistl...
1072926,3_129,3,129,dukung yg menang -_- RT @AlikaZahira: #bra #fr...


In [47]:
df_eval['Tweet'] = df_eval['Tweet'].apply(preprocess_text)

In [48]:
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df_eval['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.134681,0.229862,-0.042323,0.013444,-0.203157,0.07497,0.330626,0.040816,0.315068,0.155409,...,0.071476,-0.158876,-0.003906,-0.285829,0.167653,-0.065119,0.301132,0.072499,0.408611,0.011564
1,-0.039456,0.129093,0.275312,-0.218841,-0.140618,0.05032,-0.131703,-0.141104,0.38786,-0.262571,...,0.197294,-0.105539,-0.111869,0.043191,0.016699,0.038589,0.253951,-0.07624,0.007587,-0.038478
2,-0.014135,0.097365,0.24203,-0.263193,-0.33675,0.127846,0.323738,-0.024958,0.320704,-0.346318,...,0.190761,-0.274125,0.035664,-0.119048,0.020838,-0.030073,0.063768,0.0703,-0.005234,0.117632
3,0.152916,0.359603,0.275224,-0.033671,-0.038763,0.01355,-0.083626,-0.0311,0.300301,0.081735,...,-0.026851,-0.156188,0.14242,0.092976,-0.050861,0.107874,0.201227,0.074644,-0.096115,-0.003723
4,0.132566,0.179217,0.242708,0.092353,-0.000681,0.071389,0.046733,0.104104,0.240512,0.004074,...,-0.030989,-0.123497,0.078241,0.070216,0.026782,0.100645,0.133896,0.100492,-0.094329,-0.016839


In [49]:
# Attach the vectors into the original dataframe
df_eval = pd.concat([df_eval, tweet_df], axis=1)

# Drop the columns that are not useful anymore
# no need for Tweet column since we have its corresponding vector embedding
df_eval.drop(columns=['Tweet'], inplace=True)


In [50]:
# by now should have df with columns: ID, match id, period id, tweet_vector. Tweet_vector is just 200 columns
df_eval

Unnamed: 0,ID,MatchID,PeriodID,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,0_0,0,0,-0.134681,0.229862,-0.042323,0.013444,-0.203157,0.074970,0.330626,...,0.071476,-0.158876,-0.003906,-0.285829,0.167653,-0.065119,0.301132,0.072499,0.408611,0.011564
1,0_0,0,0,-0.039456,0.129093,0.275312,-0.218841,-0.140618,0.050320,-0.131703,...,0.197294,-0.105539,-0.111869,0.043191,0.016699,0.038589,0.253951,-0.076240,0.007587,-0.038478
2,0_0,0,0,-0.014135,0.097365,0.242030,-0.263193,-0.336750,0.127846,0.323738,...,0.190761,-0.274125,0.035664,-0.119048,0.020838,-0.030073,0.063768,0.070300,-0.005234,0.117632
3,0_0,0,0,0.152916,0.359603,0.275224,-0.033671,-0.038763,0.013550,-0.083626,...,-0.026851,-0.156188,0.142420,0.092976,-0.050861,0.107874,0.201227,0.074644,-0.096115,-0.003723
4,0_0,0,0,0.132566,0.179217,0.242708,0.092353,-0.000681,0.071389,0.046733,...,-0.030989,-0.123497,0.078241,0.070216,0.026782,0.100645,0.133896,0.100492,-0.094329,-0.016839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072923,3_129,3,129,0.242499,0.274799,0.253372,-0.344633,0.068218,0.214728,0.314110,...,-0.092508,-0.006807,0.001040,0.088608,-0.015627,0.096820,0.532033,-0.064175,-0.045848,-0.013750
1072924,3_129,3,129,0.303763,0.572543,0.057643,-0.208308,0.002397,0.081378,0.025993,...,-0.206641,-0.101010,-0.044188,-0.060697,-0.091385,0.103763,0.232590,-0.094657,-0.023285,-0.056341
1072925,3_129,3,129,0.043595,0.414611,0.130372,-0.059497,0.081376,-0.034127,-0.195386,...,-0.147604,-0.150352,-0.072764,-0.004861,-0.175550,0.377446,0.180492,0.206887,-0.409686,0.206524
1072926,3_129,3,129,0.137295,0.125495,0.025162,-0.160762,0.075350,-0.050837,-0.310498,...,-0.182690,-0.233215,-0.060355,0.166417,-0.706687,-0.028383,-0.080826,0.050139,-0.361144,0.299408


In [51]:
# group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match
# decreases size of data + makes it easier to fit into LSTM model
df_eval = df_eval.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
df_eval.drop(columns=['ID'], inplace=True) 
df_eval['MatchID'] = df_eval['MatchID'].astype(int)
df_eval['PeriodID'] = df_eval['PeriodID'].astype(int)
# need to convert to int before sorting
df_eval.sort_values(by=['MatchID', 'PeriodID'], inplace=True)
df_eval.reset_index(drop=True, inplace=True)


In [52]:
df_eval

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.015923,0.149446,0.173955,-0.034232,-0.112096,-0.020943,0.098379,0.032518,...,0.031604,-0.160349,0.069600,0.003995,0.094348,0.067146,0.155998,0.066230,0.057143,0.031930
1,0,1,-0.001169,0.133767,0.182607,-0.027490,-0.134708,-0.008566,0.098646,0.020663,...,0.030606,-0.149372,0.059397,0.002850,0.117250,0.063284,0.179458,0.057333,0.048655,0.028986
2,0,2,0.007580,0.124594,0.206335,-0.035794,-0.129705,-0.000459,0.088574,0.039749,...,0.029702,-0.160906,0.071999,0.009865,0.114960,0.069602,0.175239,0.050672,0.066157,0.014529
3,0,3,-0.010865,0.103209,0.217068,-0.027762,-0.135133,-0.004802,0.100052,0.011394,...,0.057479,-0.161670,0.057051,-0.012966,0.151669,0.060390,0.207365,0.051373,0.089186,0.013806
4,0,4,-0.011664,0.120250,0.216792,-0.039718,-0.133382,0.006180,0.087144,0.011984,...,0.048003,-0.160830,0.055104,-0.011787,0.144863,0.058888,0.208625,0.055450,0.089034,0.017248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,3,125,0.090554,0.227452,0.107798,-0.111091,0.034236,0.063783,0.053129,0.087328,...,-0.198507,-0.122853,0.024147,0.000304,-0.161041,0.092954,0.143728,0.113752,-0.049263,0.056543
512,3,126,0.086859,0.236306,0.107814,-0.125368,0.026619,0.063352,0.062930,0.091589,...,-0.189677,-0.125140,0.027448,0.003508,-0.144993,0.096095,0.155616,0.101075,-0.042474,0.058476
513,3,127,0.096795,0.220505,0.109566,-0.119462,0.023341,0.063660,0.084856,0.081134,...,-0.180416,-0.117967,0.033671,0.007132,-0.137807,0.088667,0.160295,0.102587,-0.045250,0.052347
514,3,128,0.090397,0.227700,0.104989,-0.123692,0.035497,0.059867,0.049117,0.097395,...,-0.188227,-0.115212,0.019899,0.006688,-0.167376,0.096072,0.149536,0.122288,-0.047816,0.063510


In [53]:
# NO EVENTTYPE, CAN ONLY MAKE PREDICTIONS WITHOUT KNOWING ACCURACY

# there is no df_y when we are trying to evaluate the matches in eval_tweets
# for kaggle submission!!
# let df_y have all zeros with the same number of rows as df_X
# this is just to make code run more easily, df_y
#     and tensor_y (value returned by convert_df_to_3D_tensor) will not be used

# df_y has no real meaning, only for ease of coding!
df_y = pd.Series(0, index=df_eval.index)


In [54]:
df_y

0      0
1      0
2      0
3      0
4      0
      ..
511    0
512    0
513    0
514    0
515    0
Length: 516, dtype: int64

In [55]:
X_eval_tensor, _ = convert_df_to_3D_tensor(df_eval, df_y)
# CONVERT TO PYTORCH TENSOR
X_eval_tensor = torch.tensor(X_eval_tensor, dtype=torch.float32)


(4, 130, 200)
(4, 130)


In [56]:
X_eval_tensor

tensor([[[ 0.0159,  0.1494,  0.1740,  ...,  0.0662,  0.0571,  0.0319],
         [-0.0012,  0.1338,  0.1826,  ...,  0.0573,  0.0487,  0.0290],
         [ 0.0076,  0.1246,  0.2063,  ...,  0.0507,  0.0662,  0.0145],
         ...,
         [ 0.0140,  0.0887,  0.1781,  ...,  0.0982,  0.0193,  0.0239],
         [ 0.0052,  0.0957,  0.1780,  ...,  0.0993,  0.0218,  0.0250],
         [ 0.0159,  0.0961,  0.1673,  ...,  0.1030,  0.0149,  0.0278]],

        [[ 0.1054,  0.2370,  0.0124,  ...,  0.0596, -0.0988,  0.1500],
         [ 0.0932,  0.2318,  0.0044,  ...,  0.0578, -0.0992,  0.1461],
         [ 0.0896,  0.2348, -0.0018,  ...,  0.0530, -0.0899,  0.1400],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.1589,  0.2648,  0.0580,  ...,  0.0254, -0.0392,  0.1892],
         [ 0.1563,  0.2714,  0.0593,  ...,  0

In [57]:
max_periods = df_eval.groupby('MatchID')['PeriodID'].max().reset_index()
eval_seq_lengths = (max_periods['PeriodID']+1).tolist() # add +1 since max period ID + 1 is the seq len
eval_seq_lengths = torch.tensor(eval_seq_lengths)
eval_seq_lengths

tensor([130, 126, 130, 130])

In [58]:
# make prediction
predicted_classes = predict(model, X_eval_tensor, eval_seq_lengths, threshold = 0.5)

In [59]:
predicted_classes

tensor([[0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
  

In [60]:
predicted_classes.shape

torch.Size([4, 130])

In [61]:
# MAKE CSV OF OUTPUT WITH CORRECT MATCH IDS

In [62]:
# confirm file "our_predictions.csv" exists
# the first column is already hardcoded since it always has the same values

file_name = "our_predictions.csv"

if path.exists(file_name):
    print("File exists!")
else:
    raise ValueError(f"File '{file_name}' does not exist in the current directory.")
    

File exists!


In [63]:
# now loop through file and add predictions

In [64]:
# match_id_order[match_id]  = i means that the predictions of match_id are in the ith sequence
match_id_order

{'9': 0, '15': 1, '6': 2, '16': 3}

In [65]:

# Read the CSV file
with open(file_name, 'r') as file:
    reader = csv.reader(file)
    rows = list(reader)

# add the prediction to each row

for i in range(1,len(rows)): # skip first row: ID,EventType
    row = rows[i]
    # row[0] is first column: ID: matchID_periodID
    # row[1] is second column: EventType, which we want to write with the prediction
    
    match_id, period_id = row[0].split("_")
    
    prediction = predicted_classes[match_id_order[match_id]][int(period_id)]
    print(f"{match_id} and {period_id} has prediction: {prediction}")
    row[1] = float(prediction)

# write the modified data back to the CSV file
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)

6 and 0 has prediction: 0.0
6 and 1 has prediction: 0.0
6 and 2 has prediction: 0.0
6 and 3 has prediction: 0.0
6 and 4 has prediction: 0.0
6 and 5 has prediction: 1.0
6 and 6 has prediction: 1.0
6 and 7 has prediction: 1.0
6 and 8 has prediction: 1.0
6 and 9 has prediction: 1.0
6 and 10 has prediction: 1.0
6 and 11 has prediction: 1.0
6 and 12 has prediction: 1.0
6 and 13 has prediction: 1.0
6 and 14 has prediction: 0.0
6 and 15 has prediction: 0.0
6 and 16 has prediction: 0.0
6 and 17 has prediction: 0.0
6 and 18 has prediction: 0.0
6 and 19 has prediction: 0.0
6 and 20 has prediction: 0.0
6 and 21 has prediction: 0.0
6 and 22 has prediction: 0.0
6 and 23 has prediction: 0.0
6 and 24 has prediction: 0.0
6 and 25 has prediction: 1.0
6 and 26 has prediction: 0.0
6 and 27 has prediction: 1.0
6 and 28 has prediction: 1.0
6 and 29 has prediction: 1.0
6 and 30 has prediction: 0.0
6 and 31 has prediction: 0.0
6 and 32 has prediction: 0.0
6 and 33 has prediction: 1.0
6 and 34 has prediction:

In [66]:
# our_predictions.csv contains the predictions!
# our_predictions.csv NEEDS TO BE IN CURRENT DIRECTORY WITH FIRST ROW AND FIRST COLUMN WITH EXPECTED VALUES
#     THIS IS WHY our_predictions.csv is added to github repo, it is needed to run the code
# DONE!!!

In [67]:
# NOTES
# HOW TO MAKE SURE THAT we:
# 1. DO NOT ignore the order of the tweets -> (LSTM)
# 2. treat each time period as RELATED to the football match they belong to -> treat each match as a sequence, train LSTM on every sequence
#                      since pytorch tensor expects multiple sequences (batches)



# for LSTM: Each input sequence consists of tweet embeddings (200 dimensional) from a specific match, ordered by Period ID.
#   tweets of different matches are unrelated, but tweets of a same match are related sequentially (chronologically)