# Preprocessing of the training and test data to get averaged GloVe tweet vectors (text cleaned with NLTK)

### Definition of the paths

In [None]:
# Root folder of the project. It is the only machine-specific value:
# edit this single line to run the notebook on another computer.
project_dir = "C:/Users/victo/OneDrive/Desktop/INF554/Projet Twitter"

# Training set: destination of the generated feature table, and folder of raw tweet CSV files.
path_training_dest = f"{project_dir}/df_train_NLTK.csv"
train_path = f"{project_dir}/challenge_data/train_tweets"

# Evaluation set: destination of the generated feature table, and folder of raw tweet CSV files.
path_test_dest = f"{project_dir}/df_test_NLTK.csv"
test_path = f"{project_dir}/challenge_data/eval_tweets"

### Import of useful libraries and modules

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import seaborn as sb
import openpyxl
import plotly.express as px
import plotly.graph_objects as go
import torch
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import os
import re
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK resources needed by the text-cleaning step
# (stop-word list and WordNet data for the lemmatizer), optional
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Pre-trained 200-dimensional GloVe Twitter embeddings, loaded through Gensim's downloader API
embeddings_model = api.load("glove-twitter-200")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Functions from the teachers' code

In [None]:
# Average word vector of a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the mean embedding of the whitespace-separated words of ``tweet``.

    Args:
        tweet: Text to embed; it is split on whitespace.
        model: Mapping-like embedding lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            tweet is found in ``model``.

    Returns:
        The element-wise mean of the vectors of the known words, or
        ``np.zeros(vector_size)`` when none of the words is known.
    """
    known_vectors = []
    for token in tweet.split():
        # Out-of-vocabulary tokens are skipped
        if token in model:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before caching so a failure leaves the cache empty
        # and the next call retries.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalize a tweet before the embedding lookup.

    Steps, in order: lowercase, remove every character that is neither a word
    character nor whitespace, remove digit runs, split on whitespace, drop
    English stop words, lemmatize each remaining word.

    Args:
        text: Raw text (a ``str``).

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    # The stop-word set and the lemmatizer do not depend on the input, so they
    # are built once and reused instead of being recreated for every tweet.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization, stop-word removal and lemmatization
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

### Preprocessing of the training data

In [None]:
# Build the training table: for every CSV file of train_path, one row per
# PeriodID holding tweet-volume features, the binary label and the mean
# embedding of the period's de-duplicated tweets.
dataframes = []

for fichier in os.listdir(train_path):
    if fichier.endswith('.csv'):
        chemin_complet = os.path.join(train_path, fichier)
        df = pd.read_csv(chemin_complet)

        # A period is labelled 1 as soon as the sum of its EventType values is positive
        labels = (df.groupby('PeriodID')["EventType"].sum() > 0).astype(int)

        # One row per period, from 0 up to the largest PeriodID of the file
        period_max = df["PeriodID"].max()
        df_by_period = pd.DataFrame()
        df_by_period["PeriodID"] = np.arange(0, period_max + 1, 1)

        # For the growth rate and the tweet count, only the tweets ACTUALLY
        # published during the period are considered
        tweets_per_period = df.groupby('PeriodID').size()
        growth_rate = tweets_per_period.pct_change() * 100
        growth_rate.iloc[0] = 0  # the first observed period has no predecessor

        # Every per-period aggregate is looked up BY PeriodID, not by row
        # position: a period without any row in the aggregate gets NaN instead
        # of shifting the values of all the following periods.
        df_by_period["rate_increase"] = df_by_period["PeriodID"].map(growth_rate)
        df_by_period["nb_tweets"] = df_by_period["PeriodID"].map(tweets_per_period)
        df_by_period["labels"] = df_by_period["PeriodID"].map(labels)

        # Remove the leading "RT ... : " of retweets
        df['Tweet'] = df['Tweet'].apply(lambda x: x.split(": ", 1)[1] if x.startswith("RT") and ": " in x else x)
        # Number of occurrences of each text in the file, then keep one copy of each text
        df['tweet_occ'] = df['Tweet'].map(df['Tweet'].value_counts())
        df = df.drop_duplicates(subset=['Tweet']).reset_index(drop=True)

        tweets_RT_per_period = df.groupby('PeriodID')["tweet_occ"].sum()
        df_by_period["nb_tweets_RT"] = df_by_period["PeriodID"].map(tweets_RT_per_period)

        df['Tweet'] = df['Tweet'].apply(preprocess_text)

        # Apply preprocessing to each tweet and obtain vectors
        vector_size = 200  # Adjust based on the chosen GloVe model
        tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
        tweet_df = pd.DataFrame(tweet_vectors)

        # Attach the vectors into the original dataframe
        period_features = pd.concat([df, tweet_df], axis=1)
        # Drop the columns that are not useful anymore
        period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
        # Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
        period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

        # Keep PeriodID as the join key next to the embedding columns
        drop = period_features.drop(['MatchID', 'ID', 'EventType', 'tweet_occ'], axis=1)

        # Left join on PeriodID: a period whose tweets were all removed as
        # duplicates (or that has no tweet) gets NaN embeddings rather than the
        # embeddings of the next period.
        df_preproc = df_by_period.merge(drop, on='PeriodID', how='left')

        dataframes.append(df_preproc)

df_train = pd.concat(dataframes)
df_train.to_csv(path_training_dest, index=False)

### Preprocessing of the test data

In [None]:
# Build the evaluation table: for every CSV file of test_path, one row per
# PeriodID holding tweet-volume features, the period ID and the mean embedding
# of the period's de-duplicated tweets (no label column here).
dataframes_test = []

for fichier in os.listdir(test_path):
    if fichier.endswith('.csv'):
        chemin_complet = os.path.join(test_path, fichier)
        df = pd.read_csv(chemin_complet)

        # One row per period, from 0 up to the largest PeriodID of the file
        period_max = df["PeriodID"].max()
        df_by_period = pd.DataFrame()
        df_by_period["PeriodID"] = np.arange(0, period_max + 1, 1)

        # For the growth rate and the tweet count, only the tweets ACTUALLY
        # published during the period are considered
        tweets_per_period = df.groupby('PeriodID').size()
        growth_rate = tweets_per_period.pct_change() * 100
        growth_rate.iloc[0] = 0  # the first observed period has no predecessor

        # Every per-period aggregate is looked up BY PeriodID, not by row
        # position: a period without any row in the aggregate gets NaN instead
        # of shifting the values of all the following periods.
        df_by_period["rate_increase"] = df_by_period["PeriodID"].map(growth_rate)
        df_by_period["nb_tweets"] = df_by_period["PeriodID"].map(tweets_per_period)

        # Remove the leading "RT ... : " of retweets
        df['Tweet'] = df['Tweet'].apply(lambda x: x.split(": ", 1)[1] if x.startswith("RT") and ": " in x else x)
        # Number of occurrences of each text in the file, then keep one copy of each text
        df['tweet_occ'] = df['Tweet'].map(df['Tweet'].value_counts())
        df = df.drop_duplicates(subset=['Tweet']).reset_index(drop=True)

        tweets_RT_per_period = df.groupby('PeriodID')["tweet_occ"].sum()
        df_by_period["nb_tweets_RT"] = df_by_period["PeriodID"].map(tweets_RT_per_period)

        df['Tweet'] = df['Tweet'].apply(preprocess_text)

        # Apply preprocessing to each tweet and obtain vectors
        vector_size = 200  # Adjust based on the chosen GloVe model
        tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
        tweet_df = pd.DataFrame(tweet_vectors)

        # Attach the vectors into the original dataframe
        period_features = pd.concat([df, tweet_df], axis=1)
        # Drop the columns that are not useful anymore
        period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
        # Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
        period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

        # Keep PeriodID as the join key, next to ID and the embedding columns
        drop = period_features.drop(['MatchID', 'tweet_occ'], axis=1)

        # Left join on PeriodID: a period whose tweets were all removed as
        # duplicates (or that has no tweet) gets NaN ID/embeddings rather than
        # those of the next period.
        df_preproc = df_by_period.merge(drop, on='PeriodID', how='left')

        dataframes_test.append(df_preproc)

df_test = pd.concat(dataframes_test)
df_test.to_csv(path_test_dest, index=False)