## Basic Setup
Mount a Google Drive folder first so that the input CSVs, training checkpoints, and saved models persist across Colab sessions.

In [None]:
from google.colab import drive
from pathlib import Path
import os

# Attach Google Drive to the Colab filesystem
drive.mount('/content/drive')

# Working folder on Drive that later cells read from and write to;
# edit the path below to relocate it
base_path = Path('/content/drive/My Drive/ColabOutputs')
os.makedirs(base_path, exist_ok=True)  # no-op when the folder is already there



## BERTweet Classifier (Model 1)

### Load Datasource

In [None]:
import pandas as pd
# Read the cleaned train / dev tweet tables from the Drive working folder
train_df = pd.read_csv(base_path.joinpath('train_cleaned_tweet.csv'))
dev_df = pd.read_csv(base_path.joinpath('new_dev_cleaned.csv'))

In [None]:
# Cast the io_flag label column to int in both splits
for _split_df in (train_df, dev_df):
    _split_df["io_flag"] = _split_df["io_flag"].astype(int)

### Setup Data, Tokenize, and Load Model

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
import torch
import numpy as np

# 1. Define data
# Keep only the id, text and label columns, renamed to "text" / "label"
hf_source_columns = ["row_id", "Tweets", "io_flag"]
hf_column_names = {"Tweets": "text", "io_flag": "label"}

df_for_hf = train_df[hf_source_columns].rename(columns=hf_column_names)
dev_df2 = dev_df[hf_source_columns].rename(columns=hf_column_names)

# preserve_index=False keeps the pandas index out of the resulting datasets
dataset = Dataset.from_pandas(df_for_hf, preserve_index=False)
eval_dataset = Dataset.from_pandas(dev_df2, preserve_index=False)

In [None]:
# 2. Tokenizer & Model
# Both are loaded from the same pretrained checkpoint name
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    normalization=True,
)
# Two output labels, matching the 0/1 "label" column built from io_flag
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# 3. Tokenization function
def tokenize_function(example):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    encoding = tokenizer(
        example["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoding

# Apply the tokenizer to both splits; batched=True passes groups of rows per call
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(function=tokenize_function, batched=True)

### Define Training Arguments

In [None]:
# 5. Training Arguments
# https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/trainer#transformers.TrainingArguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Trainer checkpoints go under base_path (Google Drive). output_dir is
    # documented as a str, so the Path is converted explicitly.
    output_dir=str(base_path / "bertweet-trainer-checkpoints"),
    # Alternative for local runs: output_dir="./results"

    # Evaluate and save on the same step schedule so that every saved
    # checkpoint has a matching evaluation for load_best_model_at_end.
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=6063,
    save_steps=6063,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    save_total_limit=1,

    # Optimisation hyper-parameters
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,

    # Logging
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)



### Define Evaluation Metrics

In [None]:
from sklearn.metrics import (
    f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
)
import numpy as np

def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (indexed here as a 2-D array
            with one column per class) and ``label_ids`` (true labels).

    Returns:
        dict with the keys "f1", "roc_auc", "accuracy", "precision" and
        "recall".
    """
    labels = pred.label_ids
    logits = np.asarray(pred.predictions)
    preds = np.argmax(logits, axis=1)

    # Score for ROC-AUC: the class-1 minus class-0 logit margin. The softmax
    # probability of class 1 equals sigmoid(margin), so ranking by the margin
    # is the same as ranking by probability. The raw class-1 logit on its own
    # ignores the class-0 logit and can rank examples differently.
    scores = logits[:, 1] - logits[:, 0]

    return {
        "f1": f1_score(labels, preds),
        "roc_auc": roc_auc_score(labels, scores),
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds)
    }


### Callback to Save the Model Every n Steps

In [None]:

from transformers import TrainerCallback
import os

class GoogleDriveSaverCallback(TrainerCallback):
    """Trainer callback that snapshots the model every ``save_every_steps`` steps.

    Each snapshot is written with ``save_pretrained`` to
    ``<save_path>/step-<global_step>``.
    """

    def __init__(self, save_path, save_every_steps):
        """Store the settings and create the snapshot folder.

        Args:
            save_path: Folder (str or path-like) under which snapshots are
                written; created if it does not exist.
            save_every_steps: Positive number of steps between snapshots.

        Raises:
            ValueError: If ``save_every_steps`` is not positive.
        """
        # Fail here rather than with a ZeroDivisionError in the middle of training.
        if save_every_steps <= 0:
            raise ValueError(
                f"save_every_steps must be a positive integer, got {save_every_steps!r}"
            )
        self.save_path = save_path
        self.save_every_steps = save_every_steps
        os.makedirs(save_path, exist_ok=True)

    def on_step_end(self, args, state, control, **kwargs):
        """Save the model when ``state.global_step`` hits the save interval."""
        step = state.global_step
        if step <= 0 or step % self.save_every_steps != 0:
            return

        model = kwargs.get("model")
        if model is None:
            # Nothing to save if no model was passed in with this event.
            return

        save_dir = os.path.join(self.save_path, f"step-{step}")
        model.save_pretrained(save_dir)
        print(f"\n Model saved to {save_dir}")


In [None]:
# 6. Trainer
from transformers import Trainer


# Extra model snapshots on Drive, independent of the Trainer's own checkpoints
checkpoint_dir = base_path.joinpath("bertweet-checkpoints")
gd_callback = GoogleDriveSaverCallback(
    save_every_steps=4000,  # adjust as needed
    save_path=checkpoint_dir,
)




### Train the Model

In [None]:
# 7. Train
# Collect the Trainer inputs first, then build the Trainer and start training
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[gd_callback],
)
trainer = Trainer(**trainer_config)
trainer.train()

In [None]:
# Run this instead if needed, e.g. when Colab disconnects in the middle of training
# trainer.train(resume_from_checkpoint=True)

### Final Evaluation of Best Model on Validation Set

In [None]:
# 8. Final evaluation on dev
final_metrics = trainer.evaluate(eval_dataset=tokenized_eval_dataset)

# Print each metric on its own line, rounded to four decimals
print("🔍 Final metrics on dev set:")
for metric_name, metric_value in final_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

### Save Best Model to Gdrive

In [None]:
from pathlib import Path

# Destination folder for the fine-tuned classifier, inside the Drive working folder
model_name = "bertweet_metrics_f1_v1"
save_path = base_path.joinpath(model_name)

# Write the model, then the tokenizer files, into the same folder
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")


## BERTweet Embedding + MLP (Model 2)
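Run this section only after Model 1 has been trained and saved: it loads the fine-tuned model from Drive and reuses `dev_df` from the Model 1 section.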

In [None]:
# Install and Import
!pip install transformers --quiet

import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam


### Load Model 1

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from pathlib import Path

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define model location using base path.
# This must be the folder written by the "Save Best Model to Gdrive" step of
# Model 1 ("bertweet_metrics_f1_v1"); change it only if you saved the
# fine-tuned model under a different name.
model_name = "bertweet_metrics_f1_v1"
model_path = base_path / model_name

# Fail early with a clear message: with local_files_only=True nothing is downloaded.
if not model_path.is_dir():
    raise FileNotFoundError(
        f"Fine-tuned model folder not found: {model_path}. Run the Model 1 section first."
    )

# Load tokenizer and model from local path
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

# Use AutoModel if extracting embeddings; use AutoModelForSequenceClassification for classification
model = AutoModel.from_pretrained(model_path, local_files_only=True).to(device)
model.eval()


### Define Function to Generate Embeddings

In [None]:
from tqdm.notebook import tqdm

def get_bertweet_embeddings(texts, batch_size=64):
    """Encode texts in batches and return the first-token embedding of each.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...).
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        2-D numpy array with one row per input text. For empty input the
        array has zero rows and ``model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    # Materialise to a plain list: slicing a pandas Series yields a Series,
    # which the tokenizer does not accept as a batch.
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises, so return an explicitly empty matrix instead.
        # float32 is assumed to match the model output dtype.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BERTweet embeddings"):
            batch_texts = texts[i:i+batch_size]
            encoded = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=128,
            ).to(device)
            output = model(**encoded)
            cls_embeddings = output.last_hidden_state[:, 0, :]  # CLS token
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)


### Generate Embeddings on Train and Validation Set

In [None]:
# Training tweets for the embedding model, read from the Drive working folder
train_full = pd.read_csv(base_path.joinpath('new_train_cleaned_tweet.csv'))

# One embedding row per tweet
texts = train_full['Tweets'].to_list()
bertweet_embeddings = get_bertweet_embeddings(texts)  # shape (n, 768)

In [None]:
# Same extraction for the dev split (dev_df comes from the Model 1 section)
texts = dev_df['Tweets'].to_list()
bertweet_embeddings_dev = get_bertweet_embeddings(texts)  # shape (n, 768)

### Train MLP Using Embeddings

In [None]:
train_label = train_full['io_flag']
dev_label = dev_df['io_flag']

# The dev set is already a separate validation split, so the arrays are
# assigned directly; train_test_split is not needed here.
X_train = bertweet_embeddings
X_val = bertweet_embeddings_dev

# Cast labels to int (same cast as applied to io_flag for Model 1) and hand
# plain numpy arrays to Keras.
y_train = train_label.astype(int).to_numpy()
y_val = dev_label.astype(int).to_numpy()

# Every embedding row must have exactly one label.
assert len(X_train) == len(y_train), "train embeddings and labels differ in length"
assert len(X_val) == len(y_val), "dev embeddings and labels differ in length"


In [None]:
# Train Final MLP
# The input width is taken from the embedding matrix instead of hard-coding 768,
# and declared with an explicit Input layer (passing input_shape= to the first
# Dense layer is deprecated in Keras 3).
meta_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label
])

meta_model.compile(
    optimizer=Adam(1e-4),
    loss='binary_crossentropy',
    metrics=[Precision(name='precision'), Recall(name='recall'), AUC(name='auc')]
)

# Writes one full-model file per epoch into the current working directory,
# e.g. new_mlp_baseline_01.keras, new_mlp_baseline_02.keras, ...
checkpoint = ModelCheckpoint(
    filepath='new_mlp_baseline_{epoch:02d}.keras',
    save_weights_only=False,
    save_freq='epoch',
    verbose=1
)

meta_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[checkpoint]
)


### Save Best Model

In [None]:
import shutil


# Define source and destination paths.
# The source name must follow the ModelCheckpoint pattern used during training
# ('new_mlp_baseline_{epoch:02d}.keras', written to the working directory).
# Epoch 9 is hard-coded here; change it to the epoch with the best validation
# metrics in the training log.
source = 'new_mlp_baseline_09.keras'
destination = base_path / 'new_mlp_baseline_09.keras'

# Copy the file
shutil.copy(source, destination)