# Import libraries and model

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gc
import gensim.downloader as api

import torch
import torch.nn as nn

from math import ceil
import torch.optim as optim

nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/infres/kbrowder-24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/infres/kbrowder-24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# %pip install "modin[all]"
# import modin.pandas as pd

In [3]:
import pandas as pd

In [4]:
# %pip install swifter
# import swifter

In [5]:
import swifter

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# import os
# os.environ["MODIN_ENGINE"] = "dask"  # Options: "ray" or "dask"
# import modin.pandas as pd

In [7]:
# swifter

# Data preprocessing

In [8]:
# Read all training files and concatenate them into one dataframe

#import os
#print(os.getcwd())
def make_dataset_df(dir_name):
    li = []
    i = 0
    for filename in listdir(dir_name):
        if filename != '.ipynb_checkpoints':
            print(filename)
            df = pd.read_csv(dir_name + "/" + filename)
            # df.drop(columns=['Timestamp'], inplace=True)
            # drop unused column(s)
            df['MatchID'] = str(i)
            df['ID'] = str(i)+ '_' + df['PeriodID'].astype(str)
            # makes sure that the match IDs are ordered from 0,1,2... with no missing values
            i+=1
            li.append(df)
    df = pd.concat(li, ignore_index=True)
    del li
    gc.collect()
    #print(len(df))
    return df

In [9]:
df = make_dataset_df("train_tweets")
df

USASlovenia2010.csv
ArgentinaBelgium72.csv
AustraliaSpain34.csv
ArgentinaGermanyFinal77.csv
AustraliaNetherlands29.csv
BelgiumSouthKorea59.csv
HondurasSwitzerland54.csv
FranceGermany70.csv
GermanyBrazil74.csv
GermanyUSA57.csv
MexicoCroatia37.csv
FranceNigeria66.csv
CameroonBrazil36.csv
NetherlandsChile35.csv
PortugalGhana58.csv
GermanyAlgeria67.csv


Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,0_0,0,0,0,1276869000000,#USA All My Stateside Followers Stand Up And R...
1,0_0,0,0,0,1276869000000,@Lynz_89 I think the ref might have been Basil...
2,0_0,0,0,0,1276869000000,Hoping a #USA win can help ease the pain of la...
3,0_0,0,0,0,1276869000000,When does this actually start? #worldcup
4,0_0,0,0,0,1276869000000,Hanson and Roy are a proper pundit line up. #w...
...,...,...,...,...,...,...
5056045,15_169,15,169,0,1404168000000,RT @FOXSoccer: 3/4 of the #WorldCup quarterfin...
5056046,15_169,15,169,0,1404168000000,RT @Rodolph_hilal: Plz guys RETWEET .. \n\nLet...
5056047,15_169,15,169,0,1404168000000,RT @Joey7Barton: Algeria can take a lot of pos...
5056048,15_169,15,169,0,1404168000000,"RT @caughtoffside: #ALG gave it their all, was..."


In [10]:
def preprocess_series(ser:pd.Series):
    # Preprocessing of tweet
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    def preprocess_text(text):
        # Lowercasing
        text = text.lower()
        # Remove non letter and whitespace
        text = re.sub(r'[^a-z\s]', '', text)
        # Remove numbers
        # text = re.sub(r'\d+', '', text)
        # Tokenization
        words = text.split()
        # Remove stopwords and lemmatize
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        # Lemmatization
        # words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(words)
    return ser.swifter.apply(preprocess_text)


In [11]:
# Apply preprocessing to each tweet
df['Tweet'] = preprocess_series(df['Tweet'])

Pandas Apply: 100%|██████████| 5056050/5056050 [02:55<00:00, 28769.58it/s]


# Tweet Embeddings

In [17]:
def gen_avg_embeddings(ser: pd.Series, vector_size = 200):
    # Get vector tweet embeddings
    # TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
    #   embeddings to retain more information
    #   -> can try more complex functions here
    #   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

    # Function to compute the average word vector for a tweet
    def get_avg_embedding(tweet, model):
        words = tweet.split()  # Tokenize by whitespace
        word_vectors = [model[word] for word in words if word in model]
        if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
            return np.zeros(vector_size)
        return np.mean(word_vectors, axis=0)
    
    from functools import partial

    f = partial(get_avg_embedding, model=embeddings_model)
    f.__name__ = "paritla"
    f.__module__ = get_avg_embedding.__module__

    return ser.swifter.apply(f)
    

In [18]:
df['tweet_vector'] = gen_avg_embeddings(df['Tweet'], vector_size=200)

Pandas Apply: 100%|██████████| 5056050/5056050 [02:26<00:00, 34625.78it/s]


In [19]:
# by now should have df with columns: ID, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df

Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet,tweet_vector
0,0_0,0,0,0,1276869000000,usa stateside follower stand represent beautif...,"[0.066179484, 0.25621355, 0.08082778, -0.33907..."
1,0_0,0,0,0,1276869000000,lynz think ref might basil fawlty actually wor...,"[-0.1474687, 0.3667189, 0.10468656, 0.049433, ..."
2,0_0,0,0,0,1276869000000,hoping usa win help ease pain last night loss ...,"[0.22338197, 0.27252772, 0.07159816, -0.293135..."
3,0_0,0,0,0,1276869000000,actually start worldcup,"[-0.10895123, 0.28830335, 0.40979335, -0.22218..."
4,0_0,0,0,0,1276869000000,hanson roy proper pundit line worldcup,"[-0.112306446, 0.11411217, 0.24610366, -0.4545..."
...,...,...,...,...,...,...,...
5056045,15_169,15,169,0,1404168000000,rt foxsoccer worldcup quarterfinal set ger v f...,"[0.13047262, 0.2851209, 0.018671205, -0.052115..."
5056046,15_169,15,169,0,1404168000000,rt rodolphhilal plz guy retweet let trend alg ...,"[0.30743918, 0.36894467, -0.1729, -0.27685714,..."
5056047,15_169,15,169,0,1404168000000,rt joeybarton algeria take lot positive big bi...,"[0.14770256, 0.20187803, -0.1268579, -0.095563..."
5056048,15_169,15,169,0,1404168000000,rt caughtoffside alg gave wasnt enough heart t...,"[0.016947214, 0.28408444, 0.08043686, 0.101969..."


In [20]:
period_chunks = 10

In [21]:
# Sort by MatchID, PeriodID, and Timestamp to maintain order
df = df.sort_values(by=['MatchID', 'PeriodID', 'Timestamp']).reset_index(drop=True)

# Helper function to assign chunks
def assign_chunks(group, n_chunks=period_chunks):
    chunk_size = len(group) / n_chunks
    return (np.floor(np.arange(len(group)) / chunk_size)).astype(int)

# Apply chunk assignment within each MatchID and PeriodID
df['chunk'] = df.groupby(['MatchID', 'PeriodID']).apply(
    lambda group: assign_chunks(group)
).explode().reset_index(drop=True)

  df['chunk'] = df.groupby(['MatchID', 'PeriodID']).apply(


In [22]:
# group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match
# decreases size of data + makes it easier to fit into LSTM model
df.drop(columns=['Tweet'], inplace=True)
df = df.groupby(['MatchID', 'PeriodID', 'ID','chunk']).mean().reset_index()
df.drop(columns=['ID'], inplace=True) 
df['MatchID'] = df['MatchID'].astype(int)
df['PeriodID'] = df['PeriodID'].astype(int)
# need to convert to int before sorting
df.sort_values(by=['MatchID', 'PeriodID', 'chunk'], inplace=True)
df.reset_index(drop=True, inplace=True)


In [23]:
# df.loc[df["EventType"] != 0, "EventType"] = 1
df

Unnamed: 0,MatchID,PeriodID,chunk,EventType,Timestamp,tweet_vector
0,0,0,0,0.0,1.276869e+12,"[0.057772532, 0.23096578, 0.07216391, -0.19907..."
1,0,0,1,0.0,1.276869e+12,"[0.041801233, 0.15167381, 0.0639174, -0.114342..."
2,0,0,2,0.0,1.276869e+12,"[0.109296665, 0.26712674, 0.095571965, -0.1406..."
3,0,0,3,0.0,1.276869e+12,"[0.07338316502192846, 0.23660319900283447, 0.1..."
4,0,0,4,0.0,1.276869e+12,"[0.105503604, 0.24286334, 0.10696903, -0.17454..."
...,...,...,...,...,...,...
21365,15,169,5,0.0,1.404168e+12,"[0.12281445685245027, 0.24359074410450818, 0.0..."
21366,15,169,6,0.0,1.404168e+12,"[0.122550234, 0.25029683, 0.04034065, -0.05934..."
21367,15,169,7,0.0,1.404168e+12,"[0.12122838, 0.23808606, 0.04404255, -0.045798..."
21368,15,169,8,0.0,1.404168e+12,"[0.13335473583790153, 0.24798255305110292, 0.0..."


In [24]:
# df['tweet_vector'] = df['tweet_vector'].swifter.apply(lambda v: v / np.linalg.norm(v))

In [25]:
df_agg = (
    df.groupby(['MatchID', 'PeriodID'])
    .agg(
        period_matrix=('tweet_vector', lambda x: np.stack(x.to_numpy())),  # Stack the mean vectors
        mean_event_type=('EventType', 'mean')  # Compute the mean event type
    )
    .reset_index()
)

In [26]:
df = df_agg

In [27]:
df.rename(columns={'period_matrix': 'tweet_vector', 'mean_event_type':'EventType'}, inplace=True)

In [28]:
df

Unnamed: 0,MatchID,PeriodID,tweet_vector,EventType
0,0,0,"[[0.05777253210544586, 0.2309657782316208, 0.0...",0.0
1,0,1,"[[0.09481757879257202, 0.21990607678890228, 0....",0.0
2,0,2,"[[0.14055303, 0.20796777, 0.15461813, -0.15855...",0.0
3,0,3,"[[0.10580117255449295, 0.19765836000442505, 0....",1.0
4,0,4,"[[0.13657256960868835, 0.2028564214706421, 0.1...",0.0
...,...,...,...,...
2132,15,165,"[[0.12338580191135406, 0.23787692189216614, 0....",1.0
2133,15,166,"[[0.129442, 0.23737937, 0.051139485, -0.065910...",1.0
2134,15,167,"[[0.12258135131051058, 0.23725832031364097, 0....",1.0
2135,15,168,"[[0.14088391, 0.23857994, 0.048132025, -0.0540...",1.0


# Separate Train and Test data

In [29]:
# train on of the first 13 of 16 matches (16*0.8=12.8~=13)
# and the test data would be the last 3 matches. 
# Before submitting on Kaggle we should train on full dataset, so al 16 matches
train_percentage = 0.8
unique_match_ids = df['MatchID'].unique()
print(unique_match_ids)
num_matches_training = int(ceil(len(unique_match_ids)*train_percentage))
print(num_matches_training)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
13


In [30]:
target_match_id = num_matches_training
# target_match_id is first match id that will appear in test set
# all matches from target_match_id and after will be in test test
print(target_match_id)

13


In [31]:

#df2 = df['MatchID'] == 15
#df2

In [32]:
# row_index is first row with match id target_match_id
# row_index is then the first row of the matches that will go to the test


row_index = (df['MatchID'] == target_match_id).idxmax()
#row_index = df[df['MatchID'] == target_match_id].first_valid_index()
df_X_train = df[:row_index].copy()
df_X_test = df[row_index:].copy()


In [33]:
df_y_train = df_X_train['EventType']
df_y_test = df_X_test['EventType']

In [34]:
df_y_train

0       0.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
1702    1.0
1703    1.0
1704    1.0
1705    1.0
1706    1.0
Name: EventType, Length: 1707, dtype: float64

In [35]:
df_y_test.reset_index(drop=True, inplace=True)
df_y_test

0      0.0
1      0.0
2      1.0
3      1.0
4      1.0
      ... 
425    1.0
426    1.0
427    1.0
428    1.0
429    0.0
Name: EventType, Length: 430, dtype: float64

In [36]:
df_X_train.drop(['EventType'], axis=1, inplace=True)
df_X_test.drop(['EventType'], axis=1, inplace=True)

In [37]:
df_X_train

Unnamed: 0,MatchID,PeriodID,tweet_vector
0,0,0,"[[0.05777253210544586, 0.2309657782316208, 0.0..."
1,0,1,"[[0.09481757879257202, 0.21990607678890228, 0...."
2,0,2,"[[0.14055303, 0.20796777, 0.15461813, -0.15855..."
3,0,3,"[[0.10580117255449295, 0.19765836000442505, 0...."
4,0,4,"[[0.13657256960868835, 0.2028564214706421, 0.1..."
...,...,...,...
1702,12,125,"[[0.013565172, 0.21806225, 0.12027183, -0.1911..."
1703,12,126,"[[0.019887732, 0.18984045, 0.12175157, -0.1859..."
1704,12,127,"[[0.013504726, 0.2305444, 0.10003953, -0.21081..."
1705,12,128,"[[0.006752035, 0.23206353, 0.1094508, -0.21238..."


In [38]:
df_X_test.reset_index(drop=True, inplace=True)
df_X_test

Unnamed: 0,MatchID,PeriodID,tweet_vector
0,13,0,"[[0.1479858, 0.24614787, -0.03657544, -0.09510..."
1,13,1,"[[0.1741992, 0.24961157, -0.017150475, -0.0957..."
2,13,2,"[[0.14361763, 0.26430285, -0.023285097, -0.092..."
3,13,3,"[[0.157705, 0.23600611, -0.013279277, -0.09872..."
4,13,4,"[[0.13839455, 0.1971522, -0.012201323, -0.1086..."
...,...,...,...
425,15,165,"[[0.12338580191135406, 0.23787692189216614, 0...."
426,15,166,"[[0.129442, 0.23737937, 0.051139485, -0.065910..."
427,15,167,"[[0.12258135131051058, 0.23725832031364097, 0...."
428,15,168,"[[0.14088391, 0.23857994, 0.048132025, -0.0540..."


In [39]:
# now df_X_train and df_X_test should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
# df_y_train and df_y_test should have 1 column, EventType
# the matchids are grouped together so all the rows of the same
# match ids are grouped next to each other, and the periodID are ordered chronologically.

In [40]:
# not working on my machine so I keep DF

# now we have df_X_train, df_X_test, df_y_train, df_y_test
# we no longer need df so we should free up the memory
# del df  # remove reference to the original DataFrame
# gc.collect()  # force garbage collection to free up memory

In [41]:
max_periods = df_X_train.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods
# as we can see not every match has the same number of periods!

Unnamed: 0,MatchID,PeriodID
0,0,129
1,1,129
2,2,129
3,3,179
4,4,96
5,5,129
6,6,129
7,7,129
8,8,129
9,9,129


In [42]:
max_periods = df_X_test.groupby('MatchID')['PeriodID'].max().reset_index()
max_periods

Unnamed: 0,MatchID,PeriodID
0,13,129
1,14,129
2,15,169


# Format data for PyTorch LSTM

In [43]:
# input tensor for a PyTorch LSTM should have the shape of (when setting batch_first=True)
# (batch_size, seq_len, num_features) when using the batch_first=True parameter
# batch_size is number of sequences processed at once

# TRY WITHOUT SLIDING WINDOW APPROACH
#    which would mean batch size = number of matches
#    much easier to format for LSTM as 3D tensor
#    dimension of 3D tensor with batch_first=True:(batch_size = num_matches, seq_len = num_periods, num _features = 200)
#    (match_id, period_id, num_features=200)
#     not every match has the same number of periods!, so seq_len can vary between different matches
#     fix: will have to pad with zeroes
# we want tensor[match_id][period_id] to return list len 200 of corresponding tweet vector


In [44]:
df_X_train

Unnamed: 0,MatchID,PeriodID,tweet_vector
0,0,0,"[[0.05777253210544586, 0.2309657782316208, 0.0..."
1,0,1,"[[0.09481757879257202, 0.21990607678890228, 0...."
2,0,2,"[[0.14055303, 0.20796777, 0.15461813, -0.15855..."
3,0,3,"[[0.10580117255449295, 0.19765836000442505, 0...."
4,0,4,"[[0.13657256960868835, 0.2028564214706421, 0.1..."
...,...,...,...
1702,12,125,"[[0.013565172, 0.21806225, 0.12027183, -0.1911..."
1703,12,126,"[[0.019887732, 0.18984045, 0.12175157, -0.1859..."
1704,12,127,"[[0.013504726, 0.2305444, 0.10003953, -0.21081..."
1705,12,128,"[[0.006752035, 0.23206353, 0.1094508, -0.21238..."


In [45]:
# modified for array tweet_vector column
def convert_df_to_3D_tensor(df_X, df_y, max_num_period=None):
    # df_X should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200x1 array
    # rows with same matchID should be grouped together (adjacent rows)
    # df_y should have one column (the EventType)
    # returns tensor_X numpy array already padded! shape: (num_matches, max_num_periods, num _features = 200)
    # and tensor_y of shape: (num_matches, max_num_periods)
    num_matches = len(df_X['MatchID'].unique())
    max_periods = df_X.groupby('MatchID')['PeriodID'].max().reset_index()
    total_max_period = max_periods['PeriodID'].max()
    if max_num_period is None:
        max_num_period = total_max_period + 1
    #total_max_period is max seq len

    tensor_X = np.zeros((num_matches, max_num_period*period_chunks, 200))

    tensor_y = np.zeros((num_matches, max_num_period))
    print(tensor_X.shape)
    print(tensor_y.shape)
    
    i=0
    previous_match_id = df_X['MatchID'][0]
    for row_index, row in df_X.iterrows():
        match_id = int(row['MatchID'])

        if match_id != previous_match_id:
            i+=1
            previous_match_id = match_id
        
        period_id = int(row['PeriodID'])
        p_i = period_id * period_chunks
        p_i_end = p_i + period_chunks
        
        features = row['tweet_vector']  # Skip MatchID and PeriodID
        tensor_X[i, p_i:p_i_end, :] = features
        tensor_y[i,period_id] = df_y[row_index]
        
    return tensor_X, tensor_y


X_train_tensor, y_train_tensor = convert_df_to_3D_tensor(df_X_train, df_y_train)
# X_train_tensor[match_id][period_id] to return list len 200 of corresponding tweet vector
# y_train_tensor[match_id][period_id] to return corresponding EventType (1 or 0)
# match_id index starts at 0 even if first match in df doesnt have match id 0
#X_train_tensor[12][175]
#X_train_tensor[12][179]
#X_train_tensor[2][129]


(13, 1800, 200)
(13, 180)


In [46]:
#print(X_train_tensor[0][3])
#print(y_train_tensor[0][3])

In [47]:
# SCALE DATA? minmaxscaler for example!
# SCALING MIGHT BE UNNECESSARY SINCE OUTPUT OF GLOVE TWEET 200 IS ALREADY SCALED BETWEEN -1 AND 1
#scaler = MinMaxScaler()
#tensor = scaler.fit_transform(tensor)

# CONVERT TO PYTORCH TENSOR
X_train_tensor = torch.tensor(X_train_tensor, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_tensor, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)
# X_train_tensor, y_train_tensor are now pytorch tensors

torch.Size([13, 1800, 200])
torch.Size([13, 180])


# LSTM Model

In [48]:
# TODO VERIFY ITS CORRECT + MAKE MORE SOPHISTICATED
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        out = out[:,0::period_chunks,:]
        out = self.sigmoid(out) # applying sigmoid to convert to probabilities
        return out.squeeze(-1)

#TODOOOOOOOOOO torch.nn.utils.rnn.pack_padded_sequence. This allows the model to ignore the padded values during computation.

In [49]:
# Hyperparams
batch_size = 120
hidden_size = 300 # can tune
num_layers = 2 # can tune
dropout_rate = 0.75 # can tune
num_epochs = 300 # can tune
lr = 0.0003 # can tune

In [50]:
batch_idxs = list(range(0,X_train_tensor.shape[0],batch_size))
batched_data = []
for idx in batch_idxs:
    batched_data.append((X_train_tensor[idx:idx+batch_size], y_train_tensor[idx:idx+batch_size]))

In [51]:
gpu = torch.device('cuda:1')

In [52]:
# convert df_X_test and df_y_test to correct format/dimensions
X_test_tensor, y_test_tensor = convert_df_to_3D_tensor(df_X_test, df_y_test, max_num_period=180)
# CONVERT TO PYTORCH TENSOR
X_test_tensor = torch.tensor(X_test_tensor, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_tensor, dtype=torch.float32)

(3, 1800, 200)
(3, 180)


# Train model

In [53]:
model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
model = model.to(gpu)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # great for binary classification
criterion = criterion.to(gpu)
#print(f"Shape of X_train_tensor: {X_train_tensor.shape}")
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    for train, label in batched_data:
        train = train.to(gpu)
        label = label.to(gpu)
        outputs = model(train)
        #print(f"shape of outputs: {outputs.shape}")
        
        loss = criterion(outputs, label)
        loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        with torch.no_grad():
            predict = model(X_test_tensor.to(gpu))
            e_loss = criterion(predict, y_test_tensor.to(gpu))
            print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}", f"Eval Loss: {e_loss.item():.4f}")

print("Model is trained! (on training data)")

Epoch [0/300], Loss: 0.6976 Eval Loss: 0.6970
Epoch [10/300], Loss: 0.6567 Eval Loss: 0.6758
Epoch [20/300], Loss: 0.4884 Eval Loss: 0.5638
Epoch [30/300], Loss: 0.4883 Eval Loss: 0.5552
Epoch [40/300], Loss: 0.4777 Eval Loss: 0.5484
Epoch [50/300], Loss: 0.4674 Eval Loss: 0.5372
Epoch [60/300], Loss: 0.4545 Eval Loss: 0.5347
Epoch [70/300], Loss: 0.4457 Eval Loss: 0.5242
Epoch [80/300], Loss: 0.4374 Eval Loss: 0.5093
Epoch [90/300], Loss: 0.4325 Eval Loss: 0.5097
Epoch [100/300], Loss: 0.4272 Eval Loss: 0.5046
Epoch [110/300], Loss: 0.4192 Eval Loss: 0.4999
Epoch [120/300], Loss: 0.4136 Eval Loss: 0.5020
Epoch [130/300], Loss: 0.4091 Eval Loss: 0.5124
Epoch [140/300], Loss: 0.4066 Eval Loss: 0.5204
Epoch [150/300], Loss: 0.4055 Eval Loss: 0.4998
Epoch [160/300], Loss: 0.4013 Eval Loss: 0.4854
Epoch [170/300], Loss: 0.3946 Eval Loss: 0.4920
Epoch [180/300], Loss: 0.3963 Eval Loss: 0.5159
Epoch [190/300], Loss: 0.3881 Eval Loss: 0.4975
Epoch [200/300], Loss: 0.4102 Eval Loss: 0.4847
Epo

# Evaluate on test data

In [54]:
#print(X_test_tensor[2][129])
#print(y_test_tensor[2][129])

In [55]:




model.eval()

with torch.no_grad():
    predictions = model(X_test_tensor.to(gpu)).to('cpu')

# predictions have values between 0 and 1 because forward pass of LSTM contains sigmoid at output
#print(predictions)

predicted_classes = (predictions > 0.5).float() # 0.5 is threshold
#this converts to same dimensional array of True or false, and .float() converts True to 1 and False to 0

#print(predicted_classes)
 

In [56]:
predicted_classes

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.

In [57]:
#print(predictions.shape)

In [58]:
# performance metrics

loss = criterion(predictions, y_test_tensor) # use predictions for loss calculation

print(f"Binary Cross-Entropy Loss: {loss.item():.4f}")

def accuracy(y_true, y_pred):
    if y_true.dtype != y_pred.dtype or y_true.shape != y_pred.shape:
        raise ValueError(f"Inputs do not have same type or shape!")
    y_rand = torch.randint(0,2,y_true.shape)
    correct_predictions = (y_true == y_pred).sum().item()
    total_predictions = y_true.numel()
    accuracy = correct_predictions / total_predictions * 100
    return accuracy
accuracy = accuracy(y_test_tensor, predicted_classes)


print(f"Accuracy: {accuracy:.4f}")

#print(y_test_tensor.shape)
#print(predicted_classes.shape)


# Visualization of Actual vs Predicted Classes
# import matplotlib.pyplot as plt
# TODO COULD USE PLT TO VISUALIZE?

Binary Cross-Entropy Loss: 0.5183
Accuracy: 73.8889


# For Kaggle Submission

In [59]:
# RETRAIN MODEL ON ENTIRE TRAINING DATA AND EVALUATE EVAL TWEETS




df_X = pd.concat([df_X_train, df_X_test], ignore_index=True)
df_y = pd.concat([df_y_train, df_y_test], ignore_index=True)




In [60]:
df_X

Unnamed: 0,MatchID,PeriodID,tweet_vector
0,0,0,"[[0.05777253210544586, 0.2309657782316208, 0.0..."
1,0,1,"[[0.09481757879257202, 0.21990607678890228, 0...."
2,0,2,"[[0.14055303, 0.20796777, 0.15461813, -0.15855..."
3,0,3,"[[0.10580117255449295, 0.19765836000442505, 0...."
4,0,4,"[[0.13657256960868835, 0.2028564214706421, 0.1..."
...,...,...,...
2132,15,165,"[[0.12338580191135406, 0.23787692189216614, 0...."
2133,15,166,"[[0.129442, 0.23737937, 0.051139485, -0.065910..."
2134,15,167,"[[0.12258135131051058, 0.23725832031364097, 0...."
2135,15,168,"[[0.14088391, 0.23857994, 0.048132025, -0.0540..."


In [61]:
df_y

0       0.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
2132    1.0
2133    1.0
2134    1.0
2135    1.0
2136    0.0
Name: EventType, Length: 2137, dtype: float64

In [62]:
# convert df_X_test and df_y_test to correct format/dimensions
X_tensor, y_tensor = convert_df_to_3D_tensor(df_X, df_y)
# CONVERT TO PYTORCH TENSOR
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor, dtype=torch.float32)

(16, 1800, 200)
(16, 180)


In [63]:
# NOTES
# HOW TO MAKE SURE THAT we:
# 1. DO NOT ignore the order of the tweets -> (LSTM)
# 2. treat each time period as RELATED to the football match they belong to -> treat each match as a sequence, train LSTM on every sequence
#                      since pytorch tensor expects multiple sequences (batches)



# for LSTM: Each input sequence should consist of tweets from a specific match, ordered by Period ID.
#   tweets of different matches are unrelated, but tweets of a same match are related sequentially (chronologically)
#   structure training data such that tweets are grouped by match id, and ordered by period id

In [64]:
batch_size = 100
batch_idxs = list(range(0,X_tensor.shape[0],batch_size))
batched_data = []
for idx in batch_idxs:
    batched_data.append((X_tensor[idx:idx+batch_size], y_tensor[idx:idx+batch_size]))

In [65]:
model = LSTMModel(input_size=200, hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate)
model = model.to(gpu)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss() # great for binary classification
criterion = criterion.to(gpu)
#print(f"Shape of X_train_tensor: {X_train_tensor.shape}")
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    for train, label in batched_data:
        train = train.to(gpu)
        label = label.to(gpu)
        outputs = model(train)
        #print(f"shape of outputs: {outputs.shape}")
        
        loss = criterion(outputs, label)
        loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

print("Model is trained! (on training data)")

Epoch [0/300], Loss: 0.6928
Epoch [10/300], Loss: 0.6486
Epoch [20/300], Loss: 0.4979
Epoch [30/300], Loss: 0.4773
Epoch [40/300], Loss: 0.4682
Epoch [50/300], Loss: 0.4583
Epoch [60/300], Loss: 0.4494
Epoch [70/300], Loss: 0.4425
Epoch [80/300], Loss: 0.4337
Epoch [90/300], Loss: 0.4267
Epoch [100/300], Loss: 0.4194
Epoch [110/300], Loss: 0.4155
Epoch [120/300], Loss: 0.4160
Epoch [130/300], Loss: 0.4053
Epoch [140/300], Loss: 0.4065
Epoch [150/300], Loss: 0.4009
Epoch [160/300], Loss: 0.3984
Epoch [170/300], Loss: 0.3925
Epoch [180/300], Loss: 0.3893
Epoch [190/300], Loss: 0.6507
Epoch [200/300], Loss: 0.5389
Epoch [210/300], Loss: 0.4928
Epoch [220/300], Loss: 0.4786
Epoch [230/300], Loss: 0.4825
Epoch [240/300], Loss: 0.4740
Epoch [250/300], Loss: 0.4595
Epoch [260/300], Loss: 0.4395
Epoch [270/300], Loss: 0.4322
Epoch [280/300], Loss: 0.4280
Epoch [290/300], Loss: 0.4183
Model is trained! (on training data)


In [66]:
# Read all training files and concatenate them into one dataframe

#import os
#print(os.getcwd())

li = []
i = 0
for filename in listdir("eval_tweets"):
    if filename != '.ipynb_checkpoints':
        print(filename)
        df = pd.read_csv("eval_tweets/" + filename)
        # df.drop(columns=['Timestamp'], inplace=True)
        # drop unused column(s)
        # makes sure that the match IDs are ordered from 0,1,2... with no missing values
        i+=1
        li.append(df)
eval_df = pd.concat(li, ignore_index=True)
#print(len(df))
eval_df

GreeceIvoryCoast44.csv
NetherlandsMexico64.csv
GermanyGhana32.csv
GermanySerbia2010.csv


Unnamed: 0,ID,MatchID,PeriodID,Timestamp,Tweet
0,9_0,9,0,1403639400000,Wana place a bet on an ivory coast win but ain...
1,9_0,9,0,1403639400000,#WatchLive @FIFAWorldCup: Greece vs. Ivory Coa...
2,9_0,9,0,1403639400000,Gonna watch the Colombia and Japan game becaus...
3,9_0,9,0,1403639400000,#CIV vs #COL & #GRE vs #JPN! It would be inter...
4,9_0,9,0,1403639400000,RT @DuncanCastles: Quite a statistic this: Gre...
...,...,...,...,...,...
1072923,16_129,16,129,1276867800000,LETS GO #USA #worldcup
1072924,16_129,16,129,1276867800000,another upset in #WC2010 #Srb beat #Ger by 1-0 !!
1072925,16_129,16,129,1276867800000,RT @FIFAcom: #GER 0:1 #SRB: TheÂ finalÂ whistl...
1072926,16_129,16,129,1276867800000,dukung yg menang -_- RT @AlikaZahira: #bra #fr...


In [67]:
# Apply preprocessing to each tweet
eval_df['Tweet'] = preprocess_series(eval_df['Tweet'])

Pandas Apply: 100%|██████████| 1072928/1072928 [00:37<00:00, 28765.45it/s]


In [69]:
eval_df["tweet_vector"] = gen_avg_embeddings(eval_df['Tweet'])

Pandas Apply: 100%|██████████| 1072928/1072928 [00:30<00:00, 35577.16it/s]


In [70]:
# Sort by MatchID, PeriodID, and Timestamp to maintain order
eval_df = eval_df.sort_values(by=['MatchID', 'PeriodID', 'Timestamp']).reset_index(drop=True)

# Helper function to assign chunks
def assign_chunks(group, n_chunks=10):
    chunk_size = len(group) / n_chunks
    return (np.floor(np.arange(len(group)) / chunk_size)).astype(int)

# Apply chunk assignment within each MatchID and PeriodID
eval_df['chunk'] = eval_df.groupby(['MatchID', 'PeriodID']).apply(
    lambda group: assign_chunks(group)
).explode().reset_index(drop=True)

  eval_df['chunk'] = eval_df.groupby(['MatchID', 'PeriodID']).apply(


In [71]:
eval_df.drop(columns=['Tweet'], inplace=True)

In [72]:
# group the tweets into their corresponding periods to generate an average embedding vector for each period
# so there are no duplicate period id rows per match
# decreases size of data + makes it easier to fit into LSTM model
eval_df = eval_df.groupby(['MatchID', 'PeriodID', 'ID','chunk']).mean().reset_index()
eval_df.drop(columns=['ID'], inplace=True) 
eval_df['MatchID'] = eval_df['MatchID'].astype(int)
eval_df['PeriodID'] = eval_df['PeriodID'].astype(int)
# need to convert to int before sorting
eval_df.sort_values(by=['MatchID', 'PeriodID', 'chunk'], inplace=True)
eval_df.reset_index(drop=True, inplace=True)

In [73]:
eval_df_agg = (
    eval_df.groupby(['MatchID', 'PeriodID'])
    .agg(
        period_matrix=('tweet_vector', lambda x: np.stack(x.to_numpy())),  # Stack the mean vectors
    )
    .reset_index()
)

In [74]:
eval_df = eval_df_agg

In [75]:
eval_df.rename(columns={'period_matrix': 'tweet_vector'}, inplace=True)

In [76]:
eval_df

Unnamed: 0,MatchID,PeriodID,tweet_vector
0,6,0,"[[0.16653539, 0.26003703, 0.059985828, -0.0924..."
1,6,1,"[[0.15020987, 0.2654425, 0.05469284, -0.114660..."
2,6,2,"[[0.16435114, 0.27378768, 0.05723621, -0.10151..."
3,6,3,"[[0.17175293896567598, 0.27191852900369834, 0...."
4,6,4,"[[0.15491624, 0.28381658, 0.058633655, -0.0947..."
...,...,...,...
511,16,125,"[[0.09658312052488327, 0.2646336853504181, 0.1..."
512,16,126,"[[0.07132586327974091, 0.2369798426262357, 0.0..."
513,16,127,"[[0.08168929815292358, 0.2357352077960968, 0.1..."
514,16,128,"[[0.13000393, 0.25957957, 0.109353416, -0.1556..."


In [77]:
num_matches = len(eval_df['MatchID'].unique())
eval_X = np.zeros((num_matches, 180*period_chunks, 200))
i=0
previous_match_id = eval_df['MatchID'][0]
for row_index, row in eval_df.iterrows():
    match_id = int(row['MatchID'])

    if match_id != previous_match_id:
        i+=1
        previous_match_id = match_id
    
    period_id = int(row['PeriodID'])
    p_i = period_id * period_chunks
    p_i_end = p_i + period_chunks
    
    features = row['tweet_vector']  # Skip MatchID and PeriodID
    eval_X[i, p_i:p_i_end, :] = features

In [78]:
eval_X = torch.tensor(eval_X, dtype=torch.float32)

In [79]:
model.eval()
eval_predictions = model(eval_X.to(gpu)).to('cpu')

In [80]:
eval_predicted_classes = (eval_predictions > 0.4).float()

In [None]:
import datetime

eval_df['ID'] = eval_df.apply(lambda row: str(row.MatchID) + '_' + str(row.PeriodID), axis=1)
match_ids = {6 : 0, 9 : 1, 15 : 2, 16 : 3}
sorted_predictions = []
for row_index, row in eval_df.iterrows():
    sorted_predictions.append(float(eval_predicted_classes[match_ids[int(row["MatchID"])], int(row["PeriodID"])]))

prediction_tab = pd.DataFrame(eval_df[["ID"]])
prediction_tab["EventType"] = sorted_predictions

submission_filename = "test_submissions/Submission " + datetime.datetime.now().strftime("%d%m-%H%M%S") + ".csv"

prediction_tab.to_csv(submission_filename, index=False)