In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell execution, so edits to project source files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pandas as pd
import numpy as np
# Read the dataset CSV from the Kaggle input directory into a DataFrame.
gpt_df = pd.read_csv(filepath_or_buffer="/kaggle/input/gpt-dataset/gpt_dataset.csv")


In [None]:
from kaggle_secrets import UserSecretsClient
import subprocess

# Credentials come from Kaggle's secret store, so no token is hard-coded here.
user_secrets = UserSecretsClient()
PAT = user_secrets.get_secret("pat")


GITHUB_USERNAME = "vladkisin"
REPO_NAME = "workmind-dev"
REPO_URL = f"https://{GITHUB_USERNAME}:{PAT}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
REPO_DIR = f"/kaggle/working/{REPO_NAME}"

# Clone into an explicit target directory and skip the clone when the checkout
# already exists. Without this, re-running the cell (after the chdir below has
# moved the working directory into the repo) would nest a second clone inside it.
if not os.path.isdir(REPO_DIR):
    # Argument list instead of a shell string: the token-bearing URL is never
    # parsed by a shell.
    clone_result = subprocess.run(["git", "clone", REPO_URL, REPO_DIR])
    if clone_result.returncode != 0:
        # Deliberately omit the command/URL from the message: it contains the PAT.
        raise RuntimeError(
            f"git clone of {GITHUB_USERNAME}/{REPO_NAME} failed "
            f"with exit code {clone_result.returncode}"
        )
os.chdir(REPO_DIR)

In [None]:
! pip install -U -r requirements.txt --quiet

In [None]:
import wandb
# Fetch the Weights & Biases API key from the secret store, then authenticate.
wandb_api_key = user_secrets.get_secret("wandb_pat")
wandb.login(key=wandb_api_key)

In [None]:
from sklearn.model_selection import KFold
from adapters import AutoAdapterModel, AdapterTrainer
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
)
from datasets import Dataset
from experiment.wandb.sentiment import SentimentExperiment
from tuners.adapter import AdapterFineTuner

In [None]:
# Prepare numeric labels
map2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: idx for idx, name in map2label.items()}

gpt_df['text'] = gpt_df['text'].astype(str)
label_ids = gpt_df['sentiment_label'].map(label2id)

# Series.map yields NaN for values missing from the mapping. Fail fast here
# instead of carrying NaN labels into training and the final metric lookup.
unmapped_labels = gpt_df.loc[label_ids.isna(), 'sentiment_label'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected sentiment_label values (expected one of {sorted(label2id)}): {unmapped_labels}"
    )
gpt_df['label'] = label_ids.astype(int)

def preprocess_data(batch, tokenizer, max_length_cap=1024):
    """Tokenize the texts of a batch, padding/truncating to one fixed length.

    Args:
        batch: Mapping whose "text" entry holds the texts to tokenize.
        tokenizer: Callable tokenizer exposing a `model_max_length` attribute.
        max_length_cap: Upper bound applied to the sequence length. The length
            used is the smaller of this value and `tokenizer.model_max_length`.
            Defaults to 1024.

    Returns:
        The result of calling `tokenizer` on the batch texts with
        `padding="max_length"` and `truncation=True`.
    """
    # Cap the length: the tokenizer's own limit may exceed what we want to pad to.
    max_length = min(tokenizer.model_max_length, max_length_cap)
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
import functools

In [None]:
## PARAMETRIZE HERE ##
model_name = "roberta-base" #"microsoft/deberta-v3-large"
n_splits = 5  # number of cross-validation folds; also used in the run name
experiment_prefix = f"{n_splits}-fold lora adapter for {model_name}"
experiment_suffix = " ChatGPT-o1 Generated Data"
PROJECT_NAME = "workmind-email-data"
## PARAMETRIZE HERE ##


# A single tokenizer instance serves both preprocessing and every fold's tuner.
tokenizer = AutoTokenizer.from_pretrained(model_name)
map_func = functools.partial(preprocess_data, tokenizer=tokenizer)

data = gpt_df[['text', 'label', 'user_id']].to_dict(orient="list")
y_gpt_email = gpt_df["sentiment_label"].tolist()

dataset = Dataset.from_dict(data)
texts = dataset["text"]
labels = dataset["label"]
# Convert to arrays once; each fold only fancy-indexes into them.
texts_arr = np.array(texts)
labels_arr = np.array(labels)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

with SentimentExperiment(
    None, # No analyzer provided since we collect predictions across the folds
    experiment_prefix + experiment_suffix,
    y_gpt_email,
    project_name=PROJECT_NAME
) as experiment:
    # One slot per sample, filled by the fold in which that sample is held out,
    # so predictions end up in the original dataset order.
    all_predictions = [None] * len(dataset)

    # K-Fold Cross-Validation
    for fold, (train_idx, test_idx) in enumerate(kf.split(texts)):
        print(f"Processing Fold {fold + 1}...")

        # Create tokenized train and test datasets for this fold
        train_dataset = Dataset.from_dict(
            {"text": texts_arr[train_idx], "label": labels_arr[train_idx]}
        ).map(map_func, batched=True)
        test_dataset = Dataset.from_dict(
            {"text": texts_arr[test_idx], "label": labels_arr[test_idx]}
        ).map(map_func, batched=True)

        # A fresh tuner is built for every fold; the held-out fold doubles as
        # the evaluation set during training.
        adapter_tuner = AdapterFineTuner(
            model_name_or_path=model_name,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            adapter_name="sentiment-head",
            num_labels=3,
            id2label=map2label,
            learning_rate=5e-4,
            num_train_epochs=2,
            train_batch_size=16
        )
        adapter_tuner.prepare_model()
        adapter_tuner.train(trainer_class=AdapterTrainer)

        # Highest-scoring column per row is taken as the predicted class id.
        predictions = adapter_tuner.trainer.predict(test_dataset)
        preds = np.argmax(predictions.predictions, axis=1)

        for idx, pred in zip(test_idx, preds):
            all_predictions[idx] = pred  # Assign prediction to the original order

    # Every sample must have been held out exactly once; surface a clear error
    # rather than a KeyError on a leftover None placeholder below.
    missing = [i for i, pred in enumerate(all_predictions) if pred is None]
    if missing:
        raise RuntimeError(
            f"{len(missing)} samples received no prediction (first indices: {missing[:5]})"
        )

    experiment.log_metrics([map2label[x] for x in dataset['label']],
                           [map2label[x] for x in all_predictions],
                           user_ids=dataset['user_id'])
