# Import libraries and model

In [2]:
from os import listdir
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gensim.downloader as api

nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API - Twitter specific embedding
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

#To check that T4 GPU is connected
#!nvidia-smi

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victormicha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/victormicha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing

In [3]:
# Read all training files and concatenate them into one dataframe

#import os
#print(os.getcwd())

li = []
for filename in listdir("train_tweets"):
    print(filename)
    df = pd.read_csv("train_tweets/" + filename)
    li.append(df)
df = pd.concat(li, ignore_index=True)
#print(len(df))
df.head()

AustraliaSpain34.csv
PortugalGhana58.csv
CameroonBrazil36.csv
GermanyBrazil74.csv
BelgiumSouthKorea59.csv
NetherlandsChile35.csv
GermanyAlgeria67.csv
FranceGermany70.csv
MexicoCroatia37.csv
FranceNigeria66.csv
AustraliaNetherlands29.csv
HondurasSwitzerland54.csv
ArgentinaGermanyFinal77.csv
ArgentinaBelgium72.csv
USASlovenia2010.csv
GermanyUSA57.csv


Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,2_0,2,0,0,1403538600000,RT @soccerdotcom: If #ESP beats #AUS we'll giv...
1,2_0,2,0,0,1403538600000,Visit the #SITEP official web site here http:/...
2,2_0,2,0,0,1403538600000,RT @soccerdotcom: If #ESP beats #AUS we'll giv...
3,2_0,2,0,0,1403538600000,RT @worldsoccershop: If there is a winner in t...
4,2_0,2,0,0,1403538600000,RT @soccerdotcom: If #AUS beats #ESP we'll giv...


In [4]:
# Preprocessing of tweet
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [5]:
# Apply preprocessing to each tweet
df.head()
df['Tweet'] = df['Tweet'].apply(preprocess_text)
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,2_0,2,0,0,1403538600000,rt soccerdotcom esp beat au well give away spa...
1,2_0,2,0,0,1403538600000,visit sitep official web site httptcoehzkslan ...
2,2_0,2,0,0,1403538600000,rt soccerdotcom esp beat au well give away spa...
3,2_0,2,0,0,1403538600000,rt worldsoccershop winner au v esp match well ...
4,2_0,2,0,0,1403538600000,rt soccerdotcom au beat esp well give away aus...


# Tweet Embeddings

In [6]:
# Get vector tweet embeddings
# TODOOOOOOOOOOOOOOOO maybe instead of avg word embedding for each tweet can get sentence
#   embeddings to retain more information
#   -> can try more complex functions here
#   -> avg embedding of each word for a tweet is fine for now, maybe works well enough

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [7]:
# Crashes after using all available RAM :( on google colab
# 

# obtain vector tweet embeddings
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,0.244918,0.081042,0.236453,0.027198,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
1,-0.183972,0.119888,-0.25376,0.012623,0.012891,-0.120238,-0.026952,-0.339493,0.033273,0.106456,...,0.249775,-0.15252,0.006334,-0.085193,0.005175,0.456785,-0.064834,-0.083434,0.05472,0.030099
2,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,0.244918,0.081042,0.236453,0.027198,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
3,0.209126,0.390986,-0.130056,-0.068354,-0.096441,0.010439,0.074133,0.04572,0.215201,0.200725,...,-0.235941,-0.005941,0.070192,0.024676,0.003736,0.074399,0.169565,0.024788,0.028519,0.177178
4,0.16164,0.308513,-0.093269,0.001645,-0.071475,0.003183,0.22516,0.069612,0.229182,0.051714,...,-0.164617,-0.078824,0.064404,-0.035373,-0.01658,0.003644,0.010155,0.036428,-0.095518,0.084394


In [8]:
# Attach the vectors into the original dataframe
df = pd.concat([df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
# no need for Tweet column since we have its corresponding vector embedding
df = df.drop(columns=['Timestamp', 'Tweet'])

# by now should have df with columns: id, match id, period id, Event Type, tweet_vector. Tweet_vector is just 200 columns
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,2_0,2,0,0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
1,2_0,2,0,0,-0.183972,0.119888,-0.25376,0.012623,0.012891,-0.120238,...,0.249775,-0.15252,0.006334,-0.085193,0.005175,0.456785,-0.064834,-0.083434,0.05472,0.030099
2,2_0,2,0,0,0.148069,0.342504,-0.097915,0.002166,-0.05984,0.025755,...,-0.202918,-0.076171,0.066193,0.010218,-0.020414,0.010595,0.00493,-0.005967,-0.108431,0.07064
3,2_0,2,0,0,0.209126,0.390986,-0.130056,-0.068354,-0.096441,0.010439,...,-0.235941,-0.005941,0.070192,0.024676,0.003736,0.074399,0.169565,0.024788,0.028519,0.177178
4,2_0,2,0,0,0.16164,0.308513,-0.093269,0.001645,-0.071475,0.003183,...,-0.164617,-0.078824,0.064404,-0.035373,-0.01658,0.003644,0.010155,0.036428,-0.095518,0.084394


# TODO!

In [None]:
# NOTES
# HOW TO MAKE SURE THAT we:
# 1. DO NOT ignore the order of the tweets -> (LSTM)
# 2. treat each time period as RELATED to the football match they belong to -> ??



# for LSTM: Each input sequence should consist of tweets from a specific match, ordered by Period ID.
#   tweets of different matches are unrelated, but tweets of a same match are related sequentially (chronologically)
#   structure training data such that tweets are grouped by match id, and ordered by period id
#   ?additional embedding layer for Match ID for LSTM model to distinguishing tweets between different matches.