**Imports**

In [15]:
# 1. Data Handling and Preprocessing
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from tqdm import tqdm
import inflect
from inflect import NumOutOfRangeError  # Import the specific exception

# 2. Machine Learning Models and Evaluation
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 3. Natural Language Processing (NLP)
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# 4. Deep Learning (TensorFlow / Keras)
import tensorflow
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 5. Word Embeddings
import gensim.downloader as api

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Useful functions** (Not all functions are implemented in this notebook however they are kept in as they are useful to explore)

In [12]:
# Create dataframe with train or eval data

def load_data(folder = 'train_tweets', chunksize = 30000, drop_columns = []):
  li = []
  for filename in os.listdir(folder):
      for chunk in pd.read_csv(folder + "/" + filename, chunksize=chunksize):
          chunk = chunk.drop(columns=drop_columns)
          li.append(chunk)
  df = pd.concat(li, ignore_index=True)
  return df

In [4]:
# Checking data frame

def check(df, rows=5):
  print("df shape:", df.shape)
  print("null values:", df.isnull().sum())
  display(df.head(rows))

# Add a column to a dataframe with number of words per tweet - useful for choosing max padding length
def add_tweet_word_count(df, display = True, update = False):
  # If 'word_count' already exists, it will just display the word_count stats, unless update is set to true
  if 'word_count' not in df.columns or update == True:
    df['word_count'] = df['Tweet'].apply(lambda x: len(str(x).split()))
  if display:
    display(df['word_count'].describe())
  return df

# Find words with highest frequency in both classes
def top_words(df, event_type_column='EventType', tweet_column='Tweet', max_features=20):

    # For event tweets
    event_tweets = df[df[event_type_column] == 1][tweet_column]
    event_vectorizer = CountVectorizer(stop_words='english', max_features=max_features)
    X_event = event_vectorizer.fit_transform(event_tweets)

    # Calculate frequencies and percentages
    event_frequencies = X_event.toarray().sum(axis=0)
    total_event_words = event_frequencies.sum()
    top_event_words = pd.DataFrame({
        'word': event_vectorizer.get_feature_names_out(),
        'frequency': event_frequencies,
        'percentage': (event_frequencies / total_event_words) * 100
    }).sort_values(by='frequency', ascending=False)

    # For non-event tweets
    non_event_tweets = df[df[event_type_column] == 0][tweet_column]
    non_event_vectorizer = CountVectorizer(stop_words='english', max_features=max_features)
    X_non_event = non_event_vectorizer.fit_transform(non_event_tweets)

    # Calculate frequencies and percentages
    non_event_frequencies = X_non_event.toarray().sum(axis=0)
    total_non_event_words = non_event_frequencies.sum()
    top_non_event_words = pd.DataFrame({
        'word': non_event_vectorizer.get_feature_names_out(),
        'frequency': non_event_frequencies,
        'percentage': (non_event_frequencies / total_non_event_words) * 100
    }).sort_values(by='frequency', ascending=False)

    return top_event_words, top_non_event_words

In [7]:
# Expanding common contractions - not used ultimately as it significantly slows down preprocessing and doesn't induce better results

contractions_dict = {
    "worldcup": "world cup", "worldcup2014": "world cup", "worldcup2018": "world cup", "fifa": "fifa world cup",
    "ger": "germany", "bra": "brazil", "arg": "argentina", "fra": "france", "usa": "united states",
    "por": "portugal", "bel": "belgium", "alg": "algeria", "gha": "ghana", "ned": "netherlands",
    "chi": "chile", "mex": "mexico", "aus": "australia", "esp": "spain", "cmr": "cameroon", "nga": "nigeria",
    "dont": "do not", "cant": "can not", "wont": "will not", "didnt": "did not", "its": "it is", "hes": "he is",
    "shes": "she is", "thats": "that is", "whats": "what is", "wheres": "where is", "theres": "there is",
    "whos": "who is", "theyre": "they are", "were": "we are", "youre": "you are", "itll": "it will", "ill": "i will",
    "well": "we will", "theyll": "they will", "youll": "you will", "ive": "i have", "weve": "we have",
    "theyve": "they have", "youve": "you have", "isnt": "is not", "arent": "are not", "wasnt": "was not",
    "werent": "were not", "havent": "have not", "hasnt": "has not", "hadnt": "had not", "shouldnt": "should not",
    "wouldnt": "would not", "couldnt": "could not", "mightnt": "might not", "mustnt": "must not",
    "im": "i am", "vs": "versus"}

def expand_contractions(text, contractions=contractions_dict):
    # Replace contractions with their expanded form using word boundaries
    for contraction, expanded in contractions.items():
        text = re.sub(rf'\b{re.escape(contraction)}\b', expanded, text)
    return text

In [6]:
# Preprocessing

stop_words = set(stopwords.words('english'))

def preprocess_text_vectorized(df, text_column):

    # Step 1: Clean the text column with regex replacements (vectorized)
    df[text_column] = df[text_column].str.replace(r'^RT @\w+: ', '', regex=True)# Clean retweet prefix
    df[text_column] = df[text_column].str.replace(r'\n|\r|\t', ' ', regex=True)
    df[text_column] = df[text_column].str.replace(r'\bRT @\w+\b', '', regex=True)  # Remove mentions
    df[text_column] = df[text_column].str.replace(r'@\w+', '', regex=True)  # Remove all @mentions
    df[text_column] = df[text_column].str.lower() # Convert to lowercase
    df[text_column] = df[text_column].str.replace(r'[^\w\s]', '', regex=True) # Remove punctuation
    df[text_column] = df[text_column].str.replace(r'http\S+|www\S+', '', regex=True) # Remove URLs
    df[text_column] = df[text_column].str.strip() # Remove leading/trailing spaces

    lemmatizer = WordNetLemmatizer()
    p = inflect.engine()

    def process_words(text):
        words = text.split()  # Tokenize
        # words = [expand_contractions(word) for word in words]  # Expand contractions
        words = [word for word in words if word not in stop_words]  # Remove stopwords
        processed_words = []
        for word in words:
            if word.isdigit():
                try:
                    word = p.number_to_words(word)  # Convert numbers to words
                except NumOutOfRangeError:
                    continue  # Skip the number if it's out of range
            if word.isdigit():
                continue
            processed_words.append(lemmatizer.lemmatize(word))  # Lemmatize ##
        return ' '.join(words)

    df[text_column] = df[text_column].apply(process_words)
    df[text_column] = df[text_column].apply(lambda tweet: re.sub(r'[^a-zA-Z0-9\s]', '', tweet))
    return df

In [8]:
# Format to pass through model


# Split testing and training data by game
def train_test_game_split(df, test_size=0.2, random_state=42):
  unique_match_ids = df['MatchID'].unique()
  train_match_ids, test_match_ids = train_test_split(unique_match_ids, test_size=test_size, random_state=42)  # 25% test size
  train_df = df[df['MatchID'].isin(train_match_ids)]
  test_df = df[df['MatchID'].isin(test_match_ids)]

  return train_df, test_df


# Randomly sample a specified percentage from each bin - useful for hyperparameter tuning
def five_percent(df, groupby='ID', percentage=0.05):
    df_subset = df.groupby(groupby, group_keys=False).apply(
        lambda group: group.sample(frac=percentage)
    )
    df_subset = df_subset.reset_index(drop=True)
    return df_subset


# Sample the specified percentage from the last (time-wise) tweets in each bin - to test influence of 'Timestamp' feature
def last_five_percent(df, groupby='ID', percentage=0.05):
    df_subset = (df.groupby(groupby, group_keys=False).apply(lambda group: group.sort_values('Timestamp').iloc[int(len(group) * (1 - percentage)):]))
    df_subset = df_subset.reset_index(drop=True)  # Reset index after sampling
    return df_subset


# Putting X and y into arrays for model and accuracy and storing the IDs (necessary for voting process)
def ID_X_y(df, tokeniser, pad_max_len=60):
  IDs = df['ID'].values
  X = tokenizer.texts_to_sequences(df['Tweet']) # Converting tweets to sequences from tokeniser fit on all training tweets
  X = pad_sequences(X, maxlen=pad_max_len) # Padding as LSTM and CNN requires fixed input length
  if 'EventType' in df.columns:
    y = df['EventType'].values
    return IDs, X, y
  else:
    return IDs, X

In [9]:
# Make predictions per bin by voting for most frequent EventType

# Vote on prediction for each bin - returns predictions and accuracy before and after voting
def vote(IDs, X, y=None):

  loss_fn = tensorflow.keras.losses.BinaryCrossentropy(from_logits=False)

  predictions = modelCNN.predict(X)
  predictions1 = (predictions >= 0.5).astype(int) # 0.5 threshold for binary classification

  pred_df = pd.DataFrame({'ID': IDs, 'EventType': predictions1.flatten()})   # Create a new dataframe for predictions
  pred_df['EventType'] = pred_df.groupby('ID')['EventType'].transform(lambda x: x.mode()[0]) # Assign each tweet's EventType as most frequent in bin
  voted_predictions = pred_df['EventType'].values

  if y is not None:
    accuracy1 = np.mean(predictions1.flatten() == y)
    loss1 = loss_fn(y, predictions).numpy()
    voted_accuracy = np.mean(voted_predictions == y)
    return pred_df, accuracy1, loss1, voted_accuracy

  else:
    return pred_df

In [10]:
# Compute average embeddings for all tweets - not implemented for LSTMs and CNNs

def get_avg_embeddings(tweets, model, vector_size=60):

    result = np.zeros((len(tweets), vector_size))
    for i, tweet in enumerate(tweets): # Iterate over tweets
        words = tweet.split()  # Tokenize
        word_vectors = np.array([model[word] for word in words if word in model])  # Collect valid word vectors
        if len(word_vectors) > 0:
            result[i] = np.mean(word_vectors, axis=0) # Compute the average for valid word vectors

    return result

**Building the models on train-test split of `train_tweets/`** - explore here for hyperparameter tuning

In [13]:
# Import data from directory

# Loading training + creating dataframe
df1 = load_data(drop_columns = ['PeriodID', 'Timestamp'])
check(df1)

# Dropping duplicates
df_dropped_duplicates = df1.drop_duplicates(subset='Tweet').copy()
check(df_dropped_duplicates)

df shape: (5056050, 4)
null values: ID           0
MatchID      0
EventType    0
Tweet        0
dtype: int64


Unnamed: 0,ID,MatchID,EventType,Tweet
0,11_0,11,0,RT @2014WorIdCup: Argentina vs Belgium\n\nWho ...
1,11_0,11,0,@elijahman_ time to focus on Belgium winning t...
2,11_0,11,0,RT @FIFAWorldCup: GLOBAL STADIUM: #Joinin with...
3,11_0,11,0,RT @CatholicNewsSvc: #PopeFrancis. Uh-oh. Arge...
4,11_0,11,0,RT @soccerdotcom: If he scores vs #BEL we'll a...


df shape: (2819989, 4)
null values: ID           0
MatchID      0
EventType    0
Tweet        0
dtype: int64


Unnamed: 0,ID,MatchID,EventType,Tweet
0,11_0,11,0,RT @2014WorIdCup: Argentina vs Belgium\n\nWho ...
1,11_0,11,0,@elijahman_ time to focus on Belgium winning t...
2,11_0,11,0,RT @FIFAWorldCup: GLOBAL STADIUM: #Joinin with...
3,11_0,11,0,RT @CatholicNewsSvc: #PopeFrancis. Uh-oh. Arge...
4,11_0,11,0,RT @soccerdotcom: If he scores vs #BEL we'll a...


In [14]:
# Preprocess each tweet (4 min)
df = preprocess_text_vectorized(df_dropped_duplicates, 'Tweet')
check(df)

# Optional for understanding the data:
# print(top_words(df))
# add_tweet_word_count(df)

df shape: (2819989, 4)
null values: ID           0
MatchID      0
EventType    0
Tweet        0
dtype: int64


Unnamed: 0,ID,MatchID,EventType,Tweet
0,11_0,11,0,argentina vs belgium wins
1,11_0,11,0,time focus belgium winning world cup
2,11_0,11,0,global stadium joinin worldcup coverage argbel...
3,11_0,11,0,popefrancis uhoh argentina vs belgium 30 mins ...
4,11_0,11,0,scores vs bel well award messisigned one lucky...


In [16]:
# Fit tokenizeer on all the tweets
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Tweet'])

In [17]:
# CNN model

pad_max_len = 60

modelCNN = Sequential()

# input_dim is vocabulary size, output_dim is the size of word embeddings
modelCNN.add(Embedding(input_dim=10000, output_dim=128, input_length=pad_max_len))
modelCNN.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
modelCNN.add(GlobalMaxPooling1D())
modelCNN.add(Dense(128, activation='relu'))
modelCNN.add(Dropout(0.5)) # Higher dropout used for hyperparameter tuning on small sample to avoid overfitting
modelCNN.add(Dense(1, activation='sigmoid'))

# Compile the model
modelCNN.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy']) # Smaller lr worked better for small sample



In [113]:
# Training the CNN model on small batches to speed up tuning

# Splitting test and training by game
train_df, test_df = train_test_game_split(df, test_size=0.2, random_state=42)
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

# Working with small subsets of data - tried increasing sizes
test_df_subset = five_percent(test_df, percentage = 0.1)
train_df_subset = five_percent(train_df, percentage= 0.05)

IDs_test, X_test, y_test = ID_X_y(test_df_subset, tokenizer)
IDs_train, X_train, y_train = ID_X_y(train_df_subset, tokenizer)

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3)

# train model
history = modelCNN.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Voting final predictions
pred_df, test_accuracy1, test_loss1, voted_test_accuracy = vote(IDs_test, X_test, y_test)
print(f'Accuracy before voting: {test_accuracy1}')
print(f'Accuracy after voting: {voted_test_accuracy}')

train_df shape: (1971574, 4)
test_df shape: (848415, 4)


  df_subset = df.groupby(groupby, group_keys=False).apply(


X_test shape: (84831, 60)
y_test shape: (84831,)
Epoch 1


  df_subset = df.groupby(groupby, group_keys=False).apply(


Epoch 1/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 44ms/step - accuracy: 0.6002 - loss: 0.6706 - val_accuracy: 0.5170 - val_loss: 0.6995
Epoch 2/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 43ms/step - accuracy: 0.6245 - loss: 0.6375 - val_accuracy: 0.5792 - val_loss: 0.6841
Epoch 3/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 62ms/step - accuracy: 0.6690 - loss: 0.6069 - val_accuracy: 0.5628 - val_loss: 0.6982
Epoch 4/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 43ms/step - accuracy: 0.7186 - loss: 0.5599 - val_accuracy: 0.5614 - val_loss: 0.7138
Epoch 5/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 51ms/step - accuracy: 0.7780 - loss: 0.4869 - val_accuracy: 0.5578 - val_loss: 0.7591
[1m2651/2651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 11ms/step
Accuracy before voting: 0.5577560090061416
Accuracy after voting: 0.6852447807994719


In [18]:
# LSTM model

pad_max_len = 60

modelLSTM = Sequential()

modelLSTM.add(Embedding(input_dim=10000, output_dim=128, input_length=pad_max_len)) # Embedding layer 128
modelLSTM.add(LSTM(units = 128, return_sequences=False)) #128
# modelLSTM.add(GRU(units = 64, return_sequences=False)) #128
modelLSTM.add(Dense(256, activation='relu'))
modelLSTM.add(Dropout(0.2)) #to avoid overfitting
modelLSTM.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

modelLSTM.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy']) #Higher learning rate for LSTM

In [20]:
# Training the LSTM model on small batches to speed up tuning

# Splitting test and training by game
train_df, test_df = train_test_game_split(df, test_size=0.2, random_state=42)
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

# Working with small subsets of data - tried increasing sizes
test_df_subset = five_percent(test_df, percentage = 0.1)
train_df_subset = five_percent(train_df, percentage= 0.05)

IDs_test, X_test, y_test = ID_X_y(test_df_subset, tokenizer)
IDs_train, X_train, y_train = ID_X_y(train_df_subset, tokenizer)

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3)

# train model
history = modelCNN.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Voting final predictions
pred_df, test_accuracy1, test_loss1, voted_test_accuracy = vote(IDs_test, X_test, y_test)
print(f'Accuracy before voting: {test_accuracy1}')
print(f'Accuracy after voting: {voted_test_accuracy}')

train_df shape: (1971574, 4)
test_df shape: (848415, 4)


  df_subset = df.groupby(groupby, group_keys=False).apply(
  df_subset = df.groupby(groupby, group_keys=False).apply(


X_test shape: (84831, 60)
y_test shape: (84831,)
Epoch 1/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 44ms/step - accuracy: 0.5989 - loss: 0.6710 - val_accuracy: 0.5007 - val_loss: 0.7109
Epoch 2/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 42ms/step - accuracy: 0.6244 - loss: 0.6387 - val_accuracy: 0.5406 - val_loss: 0.7065
Epoch 3/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 42ms/step - accuracy: 0.6640 - loss: 0.6084 - val_accuracy: 0.5531 - val_loss: 0.7081
Epoch 4/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 42ms/step - accuracy: 0.7159 - loss: 0.5571 - val_accuracy: 0.5409 - val_loss: 0.7539
Epoch 5/5
[1m3081/3081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 42ms/step - accuracy: 0.7811 - loss: 0.4752 - val_accuracy: 0.5544 - val_loss: 0.7789
[1m2651/2651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step
Accuracy before voting: 0.5544317525432919


**Training for Kaggle predictions**

In [19]:
# Import data from directory
# Importing and preprocessing validation

# Load evel_tweets/ and make a dataframe
df1_val = load_data(folder = 'eval_tweets', drop_columns = ['PeriodID', 'Timestamp'])

#drop duplicates
df_dropped_duplicates_val = df1_val.drop_duplicates(subset='Tweet').copy()

# Preprocess
df_val = preprocess_text_vectorized(df_dropped_duplicates_val, 'Tweet')
check(df_val)

# Convert to sequences
IDs_val, X_val = ID_X_y(df_val, tokenizer)

df shape: (621958, 3)
null values: ID         0
MatchID    0
Tweet      0
dtype: int64


Unnamed: 0,ID,MatchID,Tweet
0,6_0,6,finally get see germany play ger
1,6_0,6,boateng brothers score today well give away pa...
2,6_0,6,fascinated gervsgha match tell us lot chances ...
3,6_0,6,ger gha
4,6_0,6,boateng grudge match 212 jermaine score ger ke...


**CNN model trained on 25% of the dataset** (same results as with 100%)

In [27]:
# Building the CNN model

pad_max_len = 60

modelCNN = Sequential()

# `input_dim` is vocabulary size, `output_dim` is the size of word embeddings
modelCNN.add(Embedding(input_dim=10000, output_dim=128, input_length=pad_max_len))
modelCNN.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
modelCNN.add(GlobalMaxPooling1D())
modelCNN.add(Dense(256, activation='relu'))
modelCNN.add(Dropout(0.3)) # Smaller dropout when training with more data
modelCNN.add(Dense(1, activation='sigmoid'))

# Compile the model
modelCNN.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])



In [152]:
# Training on 25% of all the training data, all games included

# Sampling random 25% of each bin of every training game
df = preprocess_text_vectorized(df_dropped_duplicates, 'Tweet')
df_full = five_percent(df, groupby = 'ID', percentage = 0.25)
IDs, X, y = ID_X_y(df_full, tokenizer)
check(df_full)

# Training the model
history = modelCNN.fit(X, y, epochs=2, batch_size=32)

# Making predictions
pred_df_full= vote(IDs_val, X_val) # Making predictions based on vote

# Keeping only 1 prediction per bin
pred_df_kaggle = pred_df_full.groupby('ID', as_index=False)['EventType'].agg(lambda x: x.mode()[0])

# Saving predictions in csv
pred_df_kaggle.to_csv('C_predictions.csv', index=False)

  df_subset = df.groupby(groupby, group_keys=False).apply(


df shape: (704990, 4)
null values: ID           0
MatchID      0
EventType    0
Tweet        0
dtype: int64


Unnamed: 0,ID,MatchID,EventType,Tweet
0,0_0,0,0,retweet to wish ecu hon good luck in todays...
1,0_0,0,0,hon still has a slim chance of qualifying agai...
2,0_0,0,0,match updates world cup group e honduras line...
3,0_0,0,0,lineup honduras vs swiss
4,0_0,0,0,honduras v switzerland teams announcement lin...


Epoch 1/2
[1m22031/22031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m816s[0m 37ms/step - accuracy: 0.5980 - loss: 0.6584
Epoch 2/2
[1m22031/22031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m849s[0m 36ms/step - accuracy: 0.6335 - loss: 0.6295
[1m19437/19437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 9ms/step


**Training CNN on 100% of the data** (same results on Kaggle but much slower)

In [28]:
# Training on all the training data, all games included (C_predictions_2.csv)
# Small error at the end due to not running a previous cell before training (easy fix). The output was left in to avoid training again for 2 hours.

# Convert all tweets to sequences
IDs, X, y = ID_X_y(df, tokenizer)
check(df)

# train model
history = modelCNN.fit(X, y, epochs=2, batch_size=32)

# Making predictions based on vote
pred_df_full= vote(IDs_val, X_val)

# Keeping only 1 prediction per bin
pred_df_kaggle = pred_df_full.groupby('ID', as_index=False)['EventType'].agg(lambda x: x.mode()[0])

# Saving predictions in csv
pred_df_kaggle.to_csv('C_predictions_2.csv', index=False) # Putting into csv for Kaggle


df shape: (2819989, 4)
null values: ID           0
MatchID      0
EventType    0
Tweet        0
dtype: int64


Unnamed: 0,ID,MatchID,EventType,Tweet
0,0_0,0,0,i hope honduras win today
1,0_0,0,0,france needs to demolish ecuador while hondura...
2,0_0,0,0,world cup games at 4 pm et franceecuador on ...
3,0_0,0,0,also at 4 pm est hon v sui will sui advance t...
4,0_0,0,0,lineup honduras vs swiss


Epoch 1/2
[1m88125/88125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3812s[0m 43ms/step - accuracy: 0.6153 - loss: 0.6436
Epoch 2/2
[1m88125/88125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3826s[0m 43ms/step - accuracy: 0.6358 - loss: 0.6252


NameError: name 'IDs_val' is not defined

**LSTM model trained on different sample sizes of the dataset**

In [22]:
pad_max_len = 60

modelLSTM = Sequential()

modelLSTM.add(Embedding(input_dim=10000, output_dim=128, input_length=pad_max_len))
modelLSTM.add(LSTM(units = 128, return_sequences=False))
modelLSTM.add(Dense(256, activation='relu'))
modelLSTM.add(Dropout(0.3)) # to avoid overfitting on small dataset
modelLSTM.add(Dense(1, activation='sigmoid'))

modelLSTM.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [23]:
# Training on subsets the training data, all games included (R_predictions_3.csv)

df_full = five_percent(df, groupby = 'ID', percentage = 0.15)
IDs, X, y = ID_X_y(df_full, tokenizer)
check(df_full)

# Training
history = modelCNN.fit(X, y, epochs=5, batch_size=32)

# Making predictions based on vote
pred_df_full= vote(IDs_val, X_val)

# Keeping only 1 prediction per bin
pred_df_kaggle = pred_df_full.groupby('ID', as_index=False)['EventType'].agg(lambda x: x.mode()[0])

# Saving predictions in csv
pred_df_kaggle.to_csv('R_predictions_3.csv', index=False) # Putting into csv for Kaggle

  df_subset = df.groupby(groupby, group_keys=False).apply(


df shape: (422998, 4)
null values: ID           0
MatchID      0
EventType    0
Tweet        0
dtype: int64


Unnamed: 0,ID,MatchID,EventType,Tweet
0,0_0,0,0,honduras fan kisses jersey amazonia arena mana...
1,0_0,0,0,looking prematch hon reading check piece
2,0_0,0,0,time haha good fancied flutter tonight bren ba...
3,0_0,0,0,good luck emilio izaguirre hon worldcup group ...
4,0_0,0,0,rt honduras fav switzerland wins


Epoch 1/5
[1m13219/13219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 35ms/step - accuracy: 0.6046 - loss: 0.6583
Epoch 2/5
[1m13219/13219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m476s[0m 36ms/step - accuracy: 0.6603 - loss: 0.6052
Epoch 3/5
[1m13219/13219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 36ms/step - accuracy: 0.7234 - loss: 0.5373
Epoch 4/5
[1m13219/13219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 36ms/step - accuracy: 0.7732 - loss: 0.4647
Epoch 5/5
[1m13219/13219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 36ms/step - accuracy: 0.8060 - loss: 0.4048
[1m19437/19437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 9ms/step
