## Basic Setup
It is advisable to mount a dedicated Google Drive folder so that all inputs and outputs of this notebook are read from and written to one place.

In [1]:
from google.colab import drive
from pathlib import Path
import os

# Attach Google Drive to the Colab filesystem
drive.mount('/content/drive')

# Working folder on Drive that every later cell reads from and writes to;
# with exist_ok=True nothing happens when the folder is already present
base_path = Path('/content/drive/My Drive/ColabOutputs')
os.makedirs(base_path, exist_ok=True)



Mounted at /content/drive


### Read File

In [2]:
import pandas as pd
# Tweet-level train / dev / test splits, all stored under base_path
train_df, dev_df, test_df = (
    pd.read_csv(base_path / csv_name)
    for csv_name in (
        "new_train_cleaned_tweet.csv",
        "new_dev_cleaned.csv",
        "new_test_cleaned.csv",
    )
)

In [None]:
# Keep only training tweets that have a value in the io_flag label column
train_df = train_df.dropna(subset=['io_flag'])

In [None]:
# Cast the label column to integers on both labelled splits.
# NOTE(review): only train_df had rows with a missing io_flag dropped in the
# visible cells; astype(int) raises if dev_df contains NaN labels — confirm.
for labelled_df in (train_df, dev_df):
    labelled_df["io_flag"] = labelled_df["io_flag"].astype(int)

### User Metrics Extraction

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

In [None]:
# User-level training file: add a running row id, blank out missing tweets,
# parse the timestamp column, then keep only labelled rows
df = pd.read_csv(base_path / "new_train_cleaned_user.csv")
df = df.assign(
    row_id=np.arange(len(df)),
    Tweets=df['Tweets'].fillna(''),
    datex=pd.to_datetime(df['datex']),
)
df = df.dropna(subset=['io_flag'])

In [None]:
# SentenceBERT embedding: one document per user (all tweets joined by spaces),
# encoded one user at a time, then stacked into a (n_users, dim) matrix
user_grouped = df.groupby('User')['Tweets'].apply(' '.join).reset_index()
model = SentenceTransformer('all-MiniLM-L6-v2')
user_grouped['embedding'] = user_grouped['Tweets'].apply(model.encode)
embedding_matrix = np.vstack(user_grouped['embedding'].to_numpy())

In [None]:
# Post count per active day
def active_days(dates):
    """Count the distinct calendar days present in a series of timestamps.

    Args:
        dates: pandas Series of datetimes (read through the ``.dt`` accessor).

    Returns:
        Number of unique ``date`` values in ``dates``.
    """
    calendar_days = dates.dt.date
    return len({*calendar_days})

# Posts per active day: (number of rows for the user) / (distinct posting days)
posts_by_user = df.groupby('User')
user_dates = posts_by_user['datex'].apply(list).reset_index()
user_dates['active_days'] = user_dates['datex'].apply(
    lambda stamps: active_days(pd.Series(stamps))
)
# size() comes back in the same sorted-by-User order as the frame above,
# so the counts can be attached positionally
user_dates['post_count'] = posts_by_user.size().values
user_dates['avg_post_per_day'] = user_dates['post_count'] / user_dates['active_days']
posting_rate = user_dates[['User', 'avg_post_per_day']]
user_grouped = user_grouped.merge(posting_rate, on='User')

# Burstiness and average tweet length
def compute_burstiness(dates):
    """Standard deviation of the gaps (in hours) between consecutive timestamps.

    Args:
        dates: sized iterable of datetimes (a pandas Series when called via groupby).

    Returns:
        0 when fewer than two timestamps are given, otherwise ``np.std`` of the
        hour gaps between chronologically adjacent timestamps.
    """
    ordered = sorted(dates)
    if len(ordered) < 2:
        return 0
    gaps_in_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return np.std(gaps_in_hours)

# Per-user burstiness and mean tweet length (in characters), merged into the user table
tweets_by_user = df.groupby('User')
burstiness_df = (
    tweets_by_user['datex']
    .apply(compute_burstiness)
    .reset_index(name='burstiness')
)
avg_len_df = (
    tweets_by_user['Tweets']
    .apply(lambda tweets: np.mean(tweets.str.len()))
    .reset_index(name='avg_tweet_length')
)
behavior_df = burstiness_df.merge(avg_len_df, on='User')
user_grouped = user_grouped.merge(behavior_df, on='User')



In [None]:
# Persist the per-user artifacts: embedding matrix, feature table without the
# embedding column, and the behaviour-only table
embeddings_file = base_path / 'new_user_embeddings.npy'
np.save(embeddings_file, embedding_matrix)

features_without_embedding = user_grouped.drop(columns='embedding')
features_without_embedding.to_csv(base_path / 'new_user_features.csv', index=False)

behavior_df.to_csv(base_path / 'new_user_behavior_features.csv', index=False)


In [None]:
import pandas as pd
import numpy as np

# Rebuild the per-user feature table from the raw user-level training file,
# this time attaching the io_flag label. Rows with a missing label are NOT
# dropped before the features are computed in this cell.
df = pd.read_csv(base_path / "new_train_cleaned_user.csv")
df = df.assign(
    Tweets=df['Tweets'].fillna(''),
    datex=pd.to_datetime(df['datex']),
)

user_grouped = df.groupby('User')['Tweets'].apply(' '.join).reset_index()


# avg_post_per_day
def active_days(dates):
    """Return how many distinct calendar days appear in a datetime Series."""
    calendar_days = dates.dt.date
    return len({*calendar_days})


posts_by_user = df.groupby('User')
user_dates = posts_by_user['datex'].apply(list).reset_index()
user_dates['active_days'] = user_dates['datex'].apply(
    lambda stamps: active_days(pd.Series(stamps))
)
user_dates['post_count'] = posts_by_user.size().values
user_dates['avg_post_per_day'] = user_dates['post_count'] / user_dates['active_days']
user_grouped = user_grouped.merge(user_dates[['User', 'avg_post_per_day']], on='User')


# Burstiness & avg_len
def compute_burstiness(dates):
    """Return the std of hour gaps between consecutive timestamps (0 if fewer than two)."""
    ordered = sorted(dates)
    if len(ordered) < 2:
        return 0
    gaps_in_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return np.std(gaps_in_hours)


burstiness_df = (
    posts_by_user['datex']
    .apply(compute_burstiness)
    .reset_index(name='burstiness')
)
avg_len_df = (
    posts_by_user['Tweets']
    .apply(lambda tweets: np.mean(tweets.str.len()))
    .reset_index(name='avg_tweet_length')
)
behavior_df = burstiness_df.merge(avg_len_df, on='User')
user_grouped = user_grouped.merge(behavior_df, on='User')

# One label per user: the maximum io_flag over that user's labelled rows
labels = (
    df[['User', 'io_flag']]
    .dropna()
    .drop_duplicates()
    .groupby('User')['io_flag']
    .max()
    .reset_index()
)
user_grouped = user_grouped.merge(labels, on='User')




In [None]:
# Write the labelled per-user feature table under base_path (row index is not saved)
user_grouped.to_csv(base_path / "new_user_features_labeled.csv", index=False)

In [None]:
# Attach the three user-behaviour features to every training tweet.
# Left join: tweets whose User is absent from the feature table keep NaN features.
train = pd.read_csv(base_path / "new_user_features_labeled.csv")
user_feature_columns = ['User', 'avg_post_per_day', 'burstiness', 'avg_tweet_length']
user_train = train[user_feature_columns]
combined_train = train_df.merge(user_train, on='User', how='left')

In [None]:
# Install emoji package
# !pip3 install emoji==0.6.0

### Load Tweet Level Model

In [None]:
# Install and Import
# !pip install transformers --quiet

import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam


In [4]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from pathlib import Path

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The fine-tuned model lives in a sub-folder of base_path
model_name = "bertweet_metrics_f1_threshold70"
model_path = base_path / model_name

# Tokenizer and weights are read from the local folder only (no hub download)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

# AutoModel (no classification head) is loaded because the hidden states are
# used as embeddings; AutoModelForSequenceClassification would be the choice
# for classification
model = AutoModel.from_pretrained(model_path, local_files_only=True)
model = model.to(device)
model.eval()


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


KeyboardInterrupt: 

### Define Function to Generate Embeddings for Tweet Level Model

In [None]:
from tqdm.notebook import tqdm

def get_bertweet_embeddings(texts, batch_size=64):
    """Embed texts with the loaded BERTweet encoder and return the first-token vectors.

    Reads the notebook-level ``tokenizer``, ``model`` and ``device``.
    NOTE: the validation section rebinds ``model`` to a SentenceTransformer
    before reloading BERTweet; call this only while ``model`` is the BERTweet
    AutoModel.

    Args:
        texts: sequence of strings (list, tuple, pandas Series, ...).
        batch_size: number of texts tokenized and encoded per forward pass.

    Returns:
        2-D numpy array with one row per text. For empty input an array of
        shape ``(0, model.config.hidden_size)`` is returned instead of raising.
    """
    # Materialise once so positional slicing works for any sequence type
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises ValueError; return a well-shaped empty matrix
        # (float32 assumed to match the model output — TODO confirm)
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Extracting BERTweet embeddings"):
        batch_texts = texts[start:start + batch_size]
        # Pad to the longest text in the batch, truncate to 128 tokens
        encoded = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model(**encoded)
            cls_embeddings = output.last_hidden_state[:, 0, :]  # CLS token
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)


In [None]:
# Embed every training tweet; the rows follow the order of combined_train
texts = list(combined_train['Tweets'])
bertweet_embeddings = get_bertweet_embeddings(texts)  # shape (n, 768)

In [None]:
# Cache the training-tweet embedding matrix on Drive so it can be reloaded without re-encoding
np.save(base_path /'bertweet_embeddings.npy', bertweet_embeddings)

In [None]:
# Use this if needed
# bertweet_embeddings = np.load(base_path /'bertweet_embeddings.npy')

### RF User Behaviour Prediction and Projection

In [None]:
# Project RF
import joblib

# User-behaviour features attached to each training tweet by the left merge
rf_features = combined_train[["avg_post_per_day", "burstiness", "avg_tweet_length"]]

# 3. Load saved RF model
# The path is relative to base_path, exactly as in the validation cell
# (the previous line here was a syntax error: stray quote + duplicated absolute path).
rf_model_path = base_path / "user-model/user_model_v1"
rf_model = joblib.load(rf_model_path)

# 4. Get predicted probabilities
# Column index 1 of predict_proba, reshaped to a column vector (n, 1)
rf_probs = rf_model.predict_proba(rf_features)[:, 1].reshape(-1, 1)

# Project to 128D
from tensorflow.keras.layers import Input, Dense

# NOTE: this projector is never compiled or fitted in this notebook, so its
# weights are whatever the initializer produced. The very same `rf_projector`
# object must be used for every split fed to the stacked model.
rf_projector = Sequential([
    Input(shape=(1,)),
    Dense(128, activation='relu')
])
rf_proj = rf_projector.predict(rf_probs)

## Validation Set Preprocessing

#### User Metrics

In [None]:
import pandas as pd
# User-level validation file: add a running row id, blank out missing tweets,
# parse the timestamp column, then keep only labelled rows
df = pd.read_csv(base_path / "new_dev_cleaned_user.csv")
df = df.assign(
    row_id=np.arange(len(df)),
    Tweets=df['Tweets'].fillna(''),
    datex=pd.to_datetime(df['datex']),
)
df = df.dropna(subset=['io_flag'])

In [None]:
# SentenceBERT embedding: one document per user (all tweets joined by spaces).
# NOTE: this rebinds `model`, the name get_bertweet_embeddings reads; the
# BERTweet encoder is loaded again in a later cell of this section.
user_grouped = df.groupby('User')['Tweets'].apply(' '.join).reset_index()
model = SentenceTransformer('all-MiniLM-L6-v2')
user_grouped['embedding'] = user_grouped['Tweets'].apply(model.encode)
embedding_matrix = np.vstack(user_grouped['embedding'].to_numpy())


# Post count per active day
def active_days(dates):
    """Return how many distinct calendar days appear in a datetime Series."""
    calendar_days = dates.dt.date
    return len({*calendar_days})


posts_by_user = df.groupby('User')
user_dates = posts_by_user['datex'].apply(list).reset_index()
user_dates['active_days'] = user_dates['datex'].apply(
    lambda stamps: active_days(pd.Series(stamps))
)
user_dates['post_count'] = posts_by_user.size().values
user_dates['avg_post_per_day'] = user_dates['post_count'] / user_dates['active_days']
user_grouped = user_grouped.merge(user_dates[['User', 'avg_post_per_day']], on='User')


# Burstiness and average tweet length
def compute_burstiness(dates):
    """Return the std of hour gaps between consecutive timestamps (0 if fewer than two)."""
    ordered = sorted(dates)
    if len(ordered) < 2:
        return 0
    gaps_in_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return np.std(gaps_in_hours)


burstiness_df = (
    posts_by_user['datex']
    .apply(compute_burstiness)
    .reset_index(name='burstiness')
)
avg_len_df = (
    posts_by_user['Tweets']
    .apply(lambda tweets: np.mean(tweets.str.len()))
    .reset_index(name='avg_tweet_length')
)
behavior_df = burstiness_df.merge(avg_len_df, on='User')
user_grouped = user_grouped.merge(behavior_df, on='User')



In [None]:
# Persist the validation per-user artifacts: embedding matrix, feature table
# without the embedding column, and the behaviour-only table
dev_embeddings_file = base_path / 'new_user_embeddings_dev.npy'
np.save(dev_embeddings_file, embedding_matrix)

dev_features_without_embedding = user_grouped.drop(columns='embedding')
dev_features_without_embedding.to_csv(base_path / 'new_user_features_dev.csv', index=False)

behavior_df.to_csv(base_path / 'new_user_behavior_features_dev.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

# Rebuild the per-user feature table from the raw user-level validation file,
# this time attaching the io_flag label. Rows with a missing label are NOT
# dropped before the features are computed in this cell.
df = pd.read_csv(base_path / "new_dev_cleaned_user.csv")
df = df.assign(
    Tweets=df['Tweets'].fillna(''),
    datex=pd.to_datetime(df['datex']),
)

user_grouped = df.groupby('User')['Tweets'].apply(' '.join).reset_index()


# avg_post_per_day
def active_days(dates):
    """Return how many distinct calendar days appear in a datetime Series."""
    calendar_days = dates.dt.date
    return len({*calendar_days})


posts_by_user = df.groupby('User')
user_dates = posts_by_user['datex'].apply(list).reset_index()
user_dates['active_days'] = user_dates['datex'].apply(
    lambda stamps: active_days(pd.Series(stamps))
)
user_dates['post_count'] = posts_by_user.size().values
user_dates['avg_post_per_day'] = user_dates['post_count'] / user_dates['active_days']
user_grouped = user_grouped.merge(user_dates[['User', 'avg_post_per_day']], on='User')


# Burstiness & avg_len
def compute_burstiness(dates):
    """Return the std of hour gaps between consecutive timestamps (0 if fewer than two)."""
    ordered = sorted(dates)
    if len(ordered) < 2:
        return 0
    gaps_in_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return np.std(gaps_in_hours)


burstiness_df = (
    posts_by_user['datex']
    .apply(compute_burstiness)
    .reset_index(name='burstiness')
)
avg_len_df = (
    posts_by_user['Tweets']
    .apply(lambda tweets: np.mean(tweets.str.len()))
    .reset_index(name='avg_tweet_length')
)
behavior_df = burstiness_df.merge(avg_len_df, on='User')
user_grouped = user_grouped.merge(behavior_df, on='User')

# One label per user: the maximum io_flag over that user's labelled rows
labels = (
    df[['User', 'io_flag']]
    .dropna()
    .drop_duplicates()
    .groupby('User')['io_flag']
    .max()
    .reset_index()
)
user_grouped = user_grouped.merge(labels, on='User')


In [None]:
# Write the labelled per-user validation feature table under base_path (row index is not saved)
user_grouped.to_csv(base_path / "new_user_features_labeled_dev.csv", index=False)

In [None]:
# Attach the three user-behaviour features to every validation tweet.
# Left join: tweets whose User is absent from the feature table keep NaN features.
dev = pd.read_csv(base_path / "new_user_features_labeled_dev.csv")
user_feature_columns = ['User', 'avg_post_per_day', 'burstiness', 'avg_tweet_length']
user_dev = dev[user_feature_columns]
combined_dev = dev_df.merge(user_dev, on='User', how='left')

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from pathlib import Path

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The fine-tuned model lives in a sub-folder of base_path
model_name = "bertweet_metrics_f1_threshold70"
model_path = base_path / model_name

# Tokenizer and weights are read from the local folder only (no hub download)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

# Rebind `model` to the BERTweet encoder (it was pointing at the
# SentenceTransformer created earlier in this section); AutoModel is used
# because the hidden states serve as embeddings
model = AutoModel.from_pretrained(model_path, local_files_only=True)
model = model.to(device)
model.eval()


In [None]:
# Embed every validation tweet; the rows follow the order of combined_dev
texts_dev = list(combined_dev['Tweets'])
bertweet_embeddings_dev = get_bertweet_embeddings(texts_dev)  # shape (n, 768)

In [None]:
# Cache the validation-tweet embedding matrix on Drive so it can be reloaded without re-encoding
np.save(base_path / 'bertweet_embeddings_dev.npy', bertweet_embeddings_dev)

In [None]:
# Reload the cached validation embeddings (lets later cells run without re-encoding the tweets)
bertweet_embeddings_dev = np.load(base_path / 'bertweet_embeddings_dev.npy')

In [None]:
# Project RF
import joblib  # repeated here so this cell does not rely on the training cell's import

# User-behaviour features attached to each validation tweet by the left merge
rf_features_dev = combined_dev[["avg_post_per_day", "burstiness", "avg_tweet_length"]]

# 3. Load saved RF model
rf_model_path = base_path / "user-model/user_model_v1"
rf_model = joblib.load(rf_model_path)

# 4. Get predicted probabilities
# Column index 1 of predict_proba, reshaped to a column vector (n, 1)
rf_probs_dev = rf_model.predict_proba(rf_features_dev)[:, 1].reshape(-1, 1)

# Project to 128D
from tensorflow.keras.layers import Input, Dense

# The projector is an unfitted Dense layer, so its weights are random. Building
# a second Sequential here (as this cell used to) gives the validation set a
# DIFFERENT random projection than the training set, which makes the last 128
# feature columns incomparable between X_combined and X_val. Reuse the projector
# created in the training "Project RF" cell instead.
if 'rf_projector' not in globals():
    raise RuntimeError(
        "rf_projector is not defined: run the training-set 'Project RF' cell first "
        "so that train and validation share the same projection."
    )
rf_proj_dev = rf_projector.predict(rf_probs_dev)

### Stacked Model Training

In [None]:
# Labels come from the same frames the tweet texts were taken from, so rows
# line up with the embedding matrices.
labels_train = combined_train['io_flag']
# `labels_dev` was referenced below but never defined anywhere in the notebook
# (NameError); derive it the same way as labels_train.
labels_dev = combined_dev['io_flag']

# Concat and Split
X_combined = np.concatenate([bertweet_embeddings, rf_proj], axis=1)  # shape: (n, 896)
y_combined = labels_train  # binary label (0/1)

X_val = np.concatenate([bertweet_embeddings_dev, rf_proj_dev], axis=1)  # shape: (n, 896)
y_val = labels_dev  # binary label (0/1)

# Fail early if features and labels went out of step (e.g. stale cached embeddings)
assert len(X_combined) == len(y_combined), "train features/labels row count mismatch"
assert len(X_val) == len(y_val), "validation features/labels row count mismatch"

In [None]:
# Train Final MLP
# Input width 896 = BERTweet embedding (768) concatenated with the RF projection (128)
meta_model = Sequential()
meta_model.add(Dense(512, activation='relu', input_shape=(896,)))
meta_model.add(Dense(256, activation='relu'))
meta_model.add(Dense(1, activation='sigmoid'))

tracked_metrics = [Precision(name='precision'), Recall(name='recall'), AUC(name='auc')]
meta_model.compile(
    optimizer=Adam(1e-4),
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

# Save the full model (not just the weights) after every epoch, into the
# current working directory, one file per epoch number
checkpoint = ModelCheckpoint(
    filepath='new_mlp_concat_{epoch:02d}.keras',
    save_weights_only=False,
    save_freq='epoch',
    verbose=1,
)

fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[checkpoint],
)
meta_model.fit(X_combined, y_combined, **fit_options)


In [None]:
import shutil
# Author's note: best epoch = 10 (high precision, moderate recall), epoch 8 (balanced precision and recall)
# NOTE(review): the file copied below is the epoch-03 checkpoint, which matches neither epoch named above — confirm which checkpoint is intended.


# Define source and destination paths
# `source` is relative to the current working directory, where the ModelCheckpoint callback wrote its per-epoch files
source = 'new_mlp_concat_03.keras'
destination = base_path / 'new_mlp_stacked_03.keras'

# Copy the file
shutil.copy(source, destination)

## Further Analysis
Below is a further analysis to investigate why the BERTweet models experienced a drop in recall on the test set.

### Read Unmasked Test Dataset

In [None]:
import pandas as pd
# Reload the test split, then keep only the rows that have a value in io_flag
test_df = pd.read_csv(base_path / "new_test_cleaned.csv")
test = test_df.dropna(subset=['io_flag'])

### Load Embedding for Validation Set and Test Set
Although this notebook does not include the generation of embeddings for the test dataset, they can be produced by following the same flow as for the train/validation datasets, using "new_test_cleaned.csv" as the data source.




In [None]:
import numpy as np
from keras.models import load_model

# Cached BERTweet embeddings for the validation and test tweets.
# NOTE(review): the test embeddings are produced outside this notebook; their
# rows presumably match the labelled `test` frame one-to-one — verify.
X_val, X_test = (
    np.load(base_path / embedding_file)
    for embedding_file in ("bertweet_embeddings_dev.npy", "bertweet_embeddings_test.npy")
)

# Saved Keras model used for the analysis below (rebinds the name `model`)
model = load_model(base_path / "new_mlp_baseline_09.keras")

### Produce Prediction Using Model 2

In [None]:
# Predicted probability per tweet, flattened from the model's output to 1-D,
# for validation first and test second
proba_val, proba_test = (
    model.predict(split_features).flatten()
    for split_features in (X_val, X_test)
)

In [None]:
# True labels and timestamps for both splits
y_val, y_test = dev_df['io_flag'], test['io_flag']  # binary true labels
date_val, date_test = (
    pd.to_datetime(split_frame['datex'])  # or your date column
    for split_frame in (dev_df, test)
)

# One row per tweet: timestamp, predicted probability, true label, split name
df_val = pd.DataFrame(dict(date=date_val, proba=proba_val, true=y_val, set="val"))
df_test = pd.DataFrame(dict(date=date_test, proba=proba_test, true=y_test, set="test"))

# Stack both splits, threshold the probability at 0.5, flag correct predictions,
# and order everything chronologically
df_all = (
    pd.concat([df_val, df_test], ignore_index=True)
    .assign(pred=lambda frame: (frame["proba"] >= 0.5).astype(int))
    .assign(correct=lambda frame: frame["pred"] == frame["true"])
    .sort_values("date")
)


### Plot The Prediction Probability Over Validation and Test

In [None]:
import matplotlib.pyplot as plt

# Daily mean / median predicted probability, separately for each true class
# Prepare IO
df_io = df_all[df_all["true"] == 1].assign(day=lambda frame: frame["date"].dt.date)
daily_stats_io = df_io.groupby("day")["proba"].agg(["mean", "median"])

# Prepare Non-IO
df_nonio = df_all[df_all["true"] == 0].assign(day=lambda frame: frame["date"].dt.date)
daily_stats_nonio = df_nonio.groupby("day")["proba"].agg(["mean", "median"])

# Plot: one panel per class, sharing the y axis
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

panels = (
    (axes[0], daily_stats_io, "IO (true=1) Prediction Confidence"),
    (axes[1], daily_stats_nonio, "Non-IO (true=0) Prediction Confidence"),
)
for panel_ax, daily_stats, panel_title in panels:
    panel_ax.plot(daily_stats.index, daily_stats["mean"], marker='o', color='purple', label="Mean")
    panel_ax.plot(daily_stats.index, daily_stats["median"], marker='s', color='orange', label="Median")
    # Dashed vertical line marks the first day of the test period
    panel_ax.axvline(pd.to_datetime("2016-11-06"), color="black", linestyle="--", label="Test Start")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Date")
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.legend()

# Only the left panel carries the y label (the axis is shared)
axes[0].set_ylabel("Predicted Probability")

plt.tight_layout()
plt.show()


### Check Count IO
We check this to determine whether the drop in predicted probability is caused by instability from a small number of observations.

In [None]:
# Dip due to small number?
# Count IO (true=1) and Non-IO (true=0) samples per calendar day.
# The "date" column holds full timestamps, so grouping on it directly would
# give one group per distinct timestamp instead of one per day. Normalising
# to midnight keeps a datetime index (so it still lines up with the
# Timestamp used for the "Test Start" line in the bar chart) while bucketing
# by day. If the column has no time part, normalising changes nothing.
daily = df_all.assign(day=df_all["date"].dt.normalize())
counts_io = (
    daily[daily["true"] == 1].groupby("day").size().rename("IO_count").rename_axis("date")
)
counts_nonio = (
    daily[daily["true"] == 0].groupby("day").size().rename("NonIO_count").rename_axis("date")
)

# Combine and fill missing dates (a day present for only one class gets 0 for the other)
counts_df = pd.concat([counts_io, counts_nonio], axis=1).fillna(0).astype(int)

# Print the result
print(counts_df)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Moderate-width figure with a single axis
fig, ax = plt.subplots(figsize=(7, 4))  # moderate width

# One red bar per index entry of counts_df, height = number of IO tweets
ax.bar(counts_df.index, counts_df['IO_count'], color='red', label='IO Count (true=1)')

# Dashed vertical line at the start of the test period
test_start_date = pd.to_datetime('2016-11-06')
ax.axvline(test_start_date, color='black', linestyle='--', label='Test Start')

# ISO-formatted, rotated date ticks
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=45)

# Title, axis labels and legend
ax.set(
    title='Daily IO (true=1) Tweet Count',
    xlabel='Date',
    ylabel='Tweet Count',
)
ax.legend()

plt.tight_layout()
plt.show()


### Checking Change of Vocabularies
We perform this check to assess the possibility of a semantic shift during the test period.

In [None]:
# Test tweets used for the vocabulary check.
# NOTE(review): this reads 'test_cleaned.csv', not the 'new_test_cleaned.csv' used elsewhere in this notebook — presumably the unmasked version; confirm.
test_unmasked = pd.read_csv(base_path / 'test_cleaned.csv')

In [None]:
# step 1 : copy the unmasked TEST data (despite its name, `train_copy` holds a copy of `test_unmasked`)
train_copy = test_unmasked.copy()

# step 2: Preprocess text for word frequency analysis
import nltk
import string
from collections import Counter
from nltk.corpus import stopwords

# Download the NLTK stopword corpus and build the English stopword set that preprocess_for_freq filters against
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_for_freq(text, stopword_set=None):
    """Tokenize one tweet for word-frequency counting.

    The text is lower-cased, ASCII punctuation is removed, the result is split
    on whitespace, and stopwords are dropped.

    Args:
        text: the tweet text. Non-string values (e.g. NaN from an empty CSV
            cell, or None) yield an empty token list instead of raising.
        stopword_set: optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used, as before.

    Returns:
        List of remaining tokens, in their original order.
    """
    if not isinstance(text, str):
        # Missing tweets are read by pandas as float NaN, which has no .lower()
        return []
    if stopword_set is None:
        stopword_set = stop_words
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    return [word for word in text.split() if word not in stopword_set]

# Tokenize every tweet in the copied frame, then flatten the per-tweet token
# lists into one long list
all_tokens = train_copy['Tweets'].apply(preprocess_for_freq)
flat_tokens = []
for tweet_tokens in all_tokens:
    flat_tokens.extend(tweet_tokens)



In [None]:
# Step 3: count every token and keep the 50 most frequent words (most frequent first)
word_freq = Counter(flat_tokens)
top_words = [entry[0] for entry in word_freq.most_common(50)]

print(top_words)


In [None]:
## Topic-specific words that were identified on the train dataset
topic_words = ['trump', 'clinton', 'hillary', 'donald', 'debate', 'vote', 'debatenight', 'trumps', 'gop', 'president', 'obama', 'election', 'bill', 'america', 'campaign', 'maga']


In [None]:
# Work with sets so the comparison is a plain set difference
top_words_set, topic_words_set = set(top_words), set(topic_words)

# Frequent words in this corpus that are not in the train-derived topic list
unrelated_words = top_words_set.difference(topic_words_set)

# Show them alphabetically
print(sorted(unrelated_words))


### Plot The Overlap Between Topic Specific Words in Test vs Train Dataset

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# Extra filler / artefact tokens to exclude on top of the NLTK stopwords
custom_stopwords = {
    'dont', 'like', 'user', 'im', 'go', 'get', 'know', 'think', 'going', 'make', 'would',
    'cant', 'one', 'today', 'us', 'rt', 'see', 'want', 'time', 'u', '2', '4', 'day', 'country',
    'tomorrow', 'amp', 'httpurl', 'lets', 'got', 'new',
}

# Drop the custom stopwords, then keep the 20 most frequent remaining words
filtered_word_freq = {}
for word, freq in word_freq.items():
    if word not in custom_stopwords:
        filtered_word_freq[word] = freq
top_words_sorted = sorted(
    filtered_word_freq.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:20]

words, freqs = zip(*top_words_sorted)

# Gray = word is in the train-derived topic list, red = it is not
colors = []
for word in words:
    colors.append('gray' if word in topic_words else 'red')

# Plot bar chart
plt.figure(figsize=(10, 5))
plt.bar(words, freqs, color=colors)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Frequency')
plt.title('Top 20 Topical Words in Test Set')

# Legend explaining the two bar colours
legend_handles = [
    Patch(color='gray', label='Identified/masked in train set'),
    Patch(color='red', label='Not identified in train set'),
]
plt.legend(handles=legend_handles)

plt.tight_layout()
plt.show()
