# Import libraries and model

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gc
import gensim.downloader as api

nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victormicha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/victormicha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing

In [2]:
# Read all training files and concatenate them into one dataframe

#import os
#print(os.getcwd())

li = []
for filename in listdir("train_tweets"):
    if filename != '.ipynb_checkpoints':
        print(filename)
        df = pd.read_csv("train_tweets/" + filename)
        li.append(df)
df = pd.concat(li, ignore_index=True)
#print(len(df))
df.head()

AustraliaSpain34.csv
PortugalGhana58.csv
CameroonBrazil36.csv
GermanyBrazil74.csv
BelgiumSouthKorea59.csv
NetherlandsChile35.csv
GermanyAlgeria67.csv
FranceGermany70.csv
MexicoCroatia37.csv
FranceNigeria66.csv
AustraliaNetherlands29.csv
HondurasSwitzerland54.csv
ArgentinaGermanyFinal77.csv
ArgentinaBelgium72.csv
USASlovenia2010.csv
GermanyUSA57.csv


Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,2_0,2,0,0,1403538600000,RT @soccerdotcom: If #ESP beats #AUS we'll giv...
1,2_0,2,0,0,1403538600000,Visit the #SITEP official web site here http:/...
2,2_0,2,0,0,1403538600000,RT @soccerdotcom: If #ESP beats #AUS we'll giv...
3,2_0,2,0,0,1403538600000,RT @worldsoccershop: If there is a winner in t...
4,2_0,2,0,0,1403538600000,RT @soccerdotcom: If #AUS beats #ESP we'll giv...


In [3]:
# Preprocessing of tweet
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [4]:
# Apply preprocessing to each tweet
df.head()
df['Tweet'] = df['Tweet'].apply(preprocess_text)
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,2_0,2,0,0,1403538600000,rt soccerdotcom esp beat au well give away spa...
1,2_0,2,0,0,1403538600000,visit sitep official web site httptcoehzkslan ...
2,2_0,2,0,0,1403538600000,rt soccerdotcom esp beat au well give away spa...
3,2_0,2,0,0,1403538600000,rt worldsoccershop winner au v esp match well ...
4,2_0,2,0,0,1403538600000,rt soccerdotcom au beat esp well give away aus...


# Tweet Embeddings

In [5]:
# Get vector tweet embeddings
# TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
#   embeddings to retain more information
#   -> can try more complex functions here
#   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [6]:
# Crashes after using all available RAM :( on google colab
# 

# obtain vector tweet embeddings
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,0.244918,0.081042,0.236453,0.027198,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
1,-0.183972,0.119888,-0.25376,0.012623,0.012891,-0.120238,-0.026952,-0.339493,0.033273,0.106456,...,0.249775,-0.15252,0.006334,-0.085193,0.005175,0.456785,-0.064834,-0.083434,0.05472,0.030099
2,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,0.244918,0.081042,0.236453,0.027198,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
3,0.209126,0.390986,-0.130056,-0.068354,-0.096441,0.010439,0.074133,0.04572,0.215201,0.200725,...,-0.235941,-0.005941,0.070192,0.024676,0.003736,0.074399,0.169565,0.024788,0.028519,0.177178
4,0.16164,0.308513,-0.093269,0.001645,-0.071475,0.003183,0.22516,0.069612,0.229182,0.051714,...,-0.164617,-0.078824,0.064404,-0.035373,-0.01658,0.003644,0.010155,0.036428,-0.095518,0.084394


In [7]:
# Attach the vectors into the original dataframe
df = pd.concat([df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
# no need for Tweet column since we have its corresponding vector embedding
df = df.drop(columns=['Timestamp', 'Tweet'])



In [8]:
# by now should have df with columns: id, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,2_0,2,0,0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
1,2_0,2,0,0,-0.183972,0.119888,-0.25376,0.012623,0.012891,-0.120238,...,0.249775,-0.15252,0.006334,-0.085193,0.005175,0.456785,-0.064834,-0.083434,0.05472,0.030099
2,2_0,2,0,0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
3,2_0,2,0,0,0.209126,0.390986,-0.130056,-0.068354,-0.096441,0.010439,...,-0.235941,-0.005941,0.070192,0.024676,0.003736,0.074399,0.169565,0.024788,0.028519,0.177178
4,2_0,2,0,0,0.16164,0.308513,-0.093269,0.001645,-0.071475,0.003183,...,-0.164617,-0.078824,0.064404,-0.035373,-0.01658,0.003644,0.010155,0.036428,-0.095518,0.084394


In [9]:
df.tail()

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
5056045,17_129,17,129,1,0.145174,0.1901,0.21479,-0.310834,0.050761,0.039853,...,-0.122601,-0.259632,0.023675,-0.15128,-0.023655,0.116062,0.097146,0.07171,0.007577,0.182598
5056046,17_129,17,129,1,0.328279,0.334743,0.125396,-0.164282,-0.078111,0.175972,...,-0.183001,-0.166399,0.161126,-0.056147,-0.037496,0.046396,0.347816,-0.070108,0.0518,0.209709
5056047,17_129,17,129,1,0.279302,0.184175,0.197833,-0.072442,0.001534,0.218018,...,-0.427638,-0.113268,0.022538,-0.108198,0.10655,-0.147467,0.300702,-0.088761,0.043255,0.272322
5056048,17_129,17,129,1,0.054918,0.149426,0.001621,0.107246,-0.106812,0.091331,...,-0.114883,-0.071184,-0.071171,-0.123343,0.146086,-0.07393,0.174728,0.167955,-0.172603,0.042918
5056049,17_129,17,129,1,0.288279,0.113975,0.14777,-0.275185,-0.14969,0.422036,...,-0.341743,-0.080154,0.089465,-0.139695,0.047144,0.008956,0.12575,-0.030681,-0.087165,-0.043543


# Separate Train and Test data

In [10]:
# I think we should not group the tweets into their corresponding periods to generate an average embedding vector for each period
# because there is probably a relationship between the number of tweets per period and
#      the Event Type
# DO NOT UNCOMMENT df = df.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

In [11]:
# we should incorporate a certain percentage of EACH match into training data
# so we have to group by the MatchId

training_data = 0.8 # first 80% of each match is part of training data

def get_training_data(group):
    n = len(group)
    return group.iloc[:int(n * training_data)]

def get_test_data(group):
    n = len(group)
    return group.iloc[int(n * training_data):]

# groupby does not modify df
df_X_train = df.groupby('MatchID').apply(get_training_data, include_groups=False).reset_index(drop=True)


df_X_test = df.groupby('MatchID').apply(get_test_data, include_groups=False).reset_index(drop=True)

# df still unchanged

In [12]:
# groupby removed the MatchID, but the MatchID info is still within the ID column
# so only keep the MatchID part of the ID column and rename the column
df_X_train['ID'] = df_X_train['ID'].str.split('_').str[0]
df_X_train.rename(columns={'ID': 'MatchID'}, inplace=True)

In [13]:
df_X_test['ID'] = df_X_test['ID'].str.split('_').str[0]
df_X_test.rename(columns={'ID': 'MatchID'}, inplace=True)

In [14]:
df_y_train = df_X_train['EventType']
df_y_test = df_X_test['EventType']

In [15]:
# drop column EventType from df_X_train and df_X_test
df_X_train.drop('EventType', axis=1, inplace=True)
df_X_test.drop('EventType', axis=1, inplace=True)

# now df_X_train and df_X_test should have columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns
# df_y_train and df_y_test should have 1 column, EventType

In [16]:
# now we have df_X_train, df_X_test, df_y_train, df_y_test
# we no longer need df so we should free up the memory
del df  # remove reference to the original DataFrame
gc.collect()  # force garbage collection to free up memory

18

In [17]:
df_X_train

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0.210488,0.234915,0.113682,-0.212455,0.051463,-0.078776,0.274296,0.241165,...,-0.175124,-0.092599,-0.040588,0.055471,-0.156285,0.029899,0.242080,0.178201,0.154546,-0.006325
1,0,0,0.280262,-0.129196,0.219125,-0.169693,0.085696,0.203334,-0.172608,0.227091,...,0.133976,-0.187335,-0.009833,0.145971,-0.192845,0.327873,-0.000228,0.229148,0.094974,0.006005
2,0,0,0.169881,0.243936,-0.099449,-0.176362,0.012294,0.018425,-0.065941,-0.263776,...,0.155020,-0.117614,0.160361,-0.053464,-0.153413,0.312181,0.284142,0.199675,-0.147685,-0.086996
3,0,0,0.061512,0.164446,0.050236,-0.155936,-0.103754,0.086353,0.224470,-0.093764,...,0.028273,-0.038717,0.092868,0.031913,-0.065388,0.094298,0.242979,-0.003386,0.083000,0.194601
4,0,0,0.202479,0.414086,-0.157795,-0.091617,-0.232310,0.000699,0.344262,0.083014,...,-0.335044,-0.176817,0.149486,-0.075771,-0.067445,-0.186252,0.152675,0.135652,0.065966,0.188542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044831,19,123,-0.043257,0.169306,0.148447,-0.296970,0.308839,-0.008115,-0.012287,-0.038306,...,-0.243222,-0.030162,-0.002008,0.026914,0.044741,0.026673,0.196533,0.012689,0.143271,0.021022
4044832,19,123,0.144529,0.346696,0.092591,-0.173541,0.043152,0.022830,-0.529222,-0.041446,...,-0.270446,-0.047695,0.009524,0.092314,-0.172105,0.051558,0.189966,0.068056,-0.317608,0.154726
4044833,19,123,-0.017262,0.254473,0.080020,-0.296628,-0.027179,0.033323,-0.717284,-0.120736,...,-0.132913,-0.053128,-0.195119,0.108324,-0.215015,0.040794,0.391413,0.099575,-0.219631,0.135707
4044834,19,123,0.054889,0.082520,0.081246,-0.039071,0.081615,0.247454,0.467424,-0.041003,...,-0.094756,-0.038736,-0.043979,0.101145,-0.025771,0.018821,0.400688,-0.092449,-0.063518,0.067171


In [18]:
df_X_test

Unnamed: 0,MatchID,PeriodID,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,0,102,-0.056314,0.037909,0.044275,-0.237082,0.084704,-0.075604,0.158376,0.023405,...,-0.400022,0.084176,0.111584,0.095779,0.104440,-0.181766,0.154504,-0.141168,0.205919,0.285236
1,0,102,-0.002173,0.217547,0.140079,-0.110333,0.015278,-0.069817,0.070584,-0.045718,...,-0.009593,-0.164010,-0.062695,0.155267,-0.167550,0.091088,0.269260,0.080071,0.026874,-0.077225
2,0,102,-0.085605,0.195424,-0.009099,-0.073787,0.047427,0.307072,-0.053244,0.177024,...,-0.384082,0.028509,0.114817,-0.104560,0.007320,-0.049311,0.203293,-0.031129,0.126213,0.217485
3,0,102,0.001052,0.101094,-0.083223,-0.307029,-0.057864,-0.103560,-0.014954,0.072488,...,-0.217840,0.038688,0.025310,0.251143,0.175051,0.076183,0.323537,-0.048407,0.048399,0.150930
4,0,102,-0.117679,0.002618,0.095835,-0.031329,-0.008819,-0.058385,0.383248,0.076436,...,-0.035318,-0.207397,0.303460,-0.057142,0.042868,-0.006127,0.018996,0.315144,0.245720,0.320586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011209,19,129,-0.004142,-0.108566,0.332230,-0.405819,0.043165,0.028780,-0.126152,-0.055986,...,0.188838,0.208504,-0.095228,-0.040467,-0.538533,0.281736,-0.045097,-0.047416,-0.267493,0.223752
1011210,19,129,-0.055244,0.323583,0.113369,-0.315212,0.099712,0.049791,-0.076311,-0.022134,...,-0.028659,-0.113946,-0.038118,-0.083217,-0.187571,0.196560,0.212107,0.140499,-0.106197,0.239961
1011211,19,129,0.002005,0.255280,0.058871,-0.140130,0.108749,-0.030762,-0.440400,-0.064902,...,0.036584,-0.093623,-0.093828,0.023250,-0.067607,0.121196,0.203339,0.206202,-0.094206,0.081512
1011212,19,129,0.002005,0.255280,0.058871,-0.140130,0.108749,-0.030762,-0.440400,-0.064902,...,0.036584,-0.093623,-0.093828,0.023250,-0.067607,0.121196,0.203339,0.206202,-0.094206,0.081512


In [19]:
df_y_train

0          0
1          0
2          0
3          0
4          0
          ..
4044831    1
4044832    1
4044833    1
4044834    1
4044835    1
Name: EventType, Length: 4044836, dtype: int64

In [20]:
df_y_test

0          1
1          1
2          1
3          1
4          1
          ..
1011209    1
1011210    1
1011211    1
1011212    1
1011213    1
Name: EventType, Length: 1011214, dtype: int64

In [21]:
# For testing:
#filtered_df = df_X_train[df_X_train['MatchID'] == '2']
#filtered_df

In [22]:
X_train = df_X_train.to_numpy()
y_train = df_y_train.to_numpy()

In [24]:
X_test = df_X_test.to_numpy()
y_test = df_y_test.to_numpy()

In [None]:
## X_train has the first ~80% of every match, grouped by match
## X_test has the last ~20% of every match, grouped by match

# so X_train has columns MatchID, PeriodID, tweet_vector. Tweet_vector is just 200 columns (word vector embedding).
# and the matchids are grouped together so all the rows of the same
# match ids are grouped next to each other, and the periodID are ordered chronologically.

# Format data for PyTorch LSTM

In [None]:
# input tensor for a PyTorch LSTM should have the shape of (when setting batch_first=True)
# (batch_size, seq_len, num_features) when using the batch_first=True parameter
# batch_size is number of sequences processed at once
def create_sequences(data, seq_length):
    """
    Create overlapping sequences from the input data.

    Returns:
    - A NumPy array of shape (num_sequences, seq_length, num_features) containing the sequences.
    """
    sequences = []
    
    #TODO!
    
    return np.array(sequences)



# TODO!

In [None]:



# TODO MAKE SURE THIS IS THE RIGHT FORMAT WE WANT TO GIVE TO LSTM
## NOW TIME TO FORMAT X_TRAIN ETC INTO 3RD ORDER TENSOR TO INPUT TO PYTORCH LSTM!






In [None]:
# NOTES
# HOW TO MAKE SURE THAT we:
# 1. DO NOT ignore the order of the tweets -> (LSTM)
# 2. treat each time period as RELATED to the football match they belong to -> ??



# for LSTM: Each input sequence should consist of tweets from a specific match, ordered by Period ID.
#   tweets of different matches are unrelated, but tweets of a same match are related sequentially (chronologically)
#   structure training data such that tweets are grouped by match id, and ordered by period id
#   ?additional embedding layer for Match ID for LSTM model to distinguishing tweets between different matches.