In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pandas as pd
import numpy as np
# Load the Glassdoor reviews CSV from the attached Kaggle input dataset.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/glassdoor-job-reviews/glassdoor_reviews.csv",
)


In [None]:
import subprocess
from pathlib import Path

from kaggle_secrets import UserSecretsClient

# Secrets come from the Kaggle secrets store; the same client is reused by later cells.
user_secrets = UserSecretsClient()
PAT = user_secrets.get_secret("pat")

GITHUB_USERNAME = "vladkisin"
REPO_NAME = "workmind-dev"
REPO_URL = f"https://{GITHUB_USERNAME}:{PAT}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
# Clone into an explicit absolute target so re-running this cell (when the cwd is
# already inside the repo) cannot create a nested clone.
REPO_DIR = Path("/kaggle/working") / REPO_NAME

if not REPO_DIR.exists():
    # Argument list instead of a shell string: no shell parsing of the token-bearing URL.
    clone = subprocess.run(
        ["git", "clone", REPO_URL, str(REPO_DIR)],
        capture_output=True,
        text=True,
    )
    if clone.returncode != 0:
        # Fail loudly, but scrub the token from anything echoed into the notebook output.
        stderr = clone.stderr.replace(PAT, "***") if PAT else clone.stderr
        raise RuntimeError(f"git clone of {REPO_NAME} failed:\n{stderr}")

# Later cells (requirements.txt, workmind imports) expect the repo root as cwd.
os.chdir(REPO_DIR)

In [None]:
! pip install -U -r requirements.txt --quiet

In [None]:
from workmind.data_processing.utils import preprocess_and_split_gd

# Split the raw Glassdoor frame into train / validation / test partitions.
train_df, val_df, test_df = preprocess_and_split_gd(df)

# Report how many rows ended up in each partition.
for split_label, split_frame in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_label} size: {len(split_frame)}")

In [None]:
import wandb
# Authenticate with Weights & Biases using the key stored under the Kaggle
# secret "wandb_pat" (user_secrets is created in the repo-clone cell).
wandb.login(key=user_secrets.get_secret("wandb_pat"))

In [None]:
from datasets import Dataset
from workmind.tuners.partial import PartiallyUnfrozenClsFineTuner
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer
)

In [None]:
model_name = "roberta-large"

# Fixed seed so the sampled subsets (and therefore the run) are reproducible.
SEED = 42
TRAIN_FRACTION = 0.3
EVAL_SAMPLE_SIZE = 1500

train_sample = train_df.sample(frac=TRAIN_FRACTION, random_state=SEED)
# Cap the request at the split size: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
eval_sample = val_df.sample(n=min(EVAL_SAMPLE_SIZE, len(val_df)), random_state=SEED)

# Only the "text" and "label" columns are handed to the Hugging Face datasets.
train_dataset = Dataset.from_dict(train_sample[["text", "label"]].to_dict(orient="list"))
eval_dataset = Dataset.from_dict(eval_sample[["text", "label"]].to_dict(orient="list"))

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_data(batch):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded to a fixed length: the tokenizer's
    ``model_max_length``, capped at 1024 tokens.

    Args:
        batch: Mapping with a "text" entry holding the texts to tokenize.

    Returns:
        The tokenizer's output for the batch.
    """
    sequence_length = min(tokenizer.model_max_length, 1024)
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=sequence_length,
    )

# Tokenize both splits in batched mode (train first, then eval).
train_dataset, eval_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token stored under the Kaggle
# secret "hf_pat"; the notebook later pushes the model and tokenizer to the Hub.
login(token=user_secrets.get_secret("hf_pat"))

**Fine-tune**

In [None]:
# Integer class id -> sentiment name.
# NOTE(review): id2label is not passed to the tuner in this cell — confirm
# whether it should be, or whether it is unused.
id2label = {0: "negative", 1: "neutral", 2: "positive"}

# Gather the fine-tuning configuration in one place before building the tuner.
tuner_kwargs = dict(
    model_name_or_path=model_name,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Name fragments of the parts to unfreeze — presumably matched against
    # parameter names; verify in PartiallyUnfrozenClsFineTuner.
    layers_to_unfreeze=("layer.21", "layer.22", "layer.23", "classifier"),
    learning_rate=1e-4,
    num_train_epochs=1,
    train_batch_size=16,
    val_batch_size=16,
)

tuner = PartiallyUnfrozenClsFineTuner(**tuner_kwargs)
tuner.prepare_model()
tuner.train(trainer_class=Trainer)


In [None]:
# Hub repository that receives the fine-tuned weights and the tokenizer.
repo_name = f"uladzislauk/{model_name}-unfreeze-ft-glassdoor-60k"

# Upload the model first, then the tokenizer, to the same repository.
for artifact in (tuner.model, tokenizer):
    artifact.push_to_hub(repo_name)

**Evaluate on the same dataset**

In [None]:
from workmind.analyzers.sentiment.classification import ClassificationSentimentAnalyzer
from workmind.analyzers.constants import BaseSentiment
from workmind.experiment.wandb.sentiment import SentimentExperiment


In [None]:
# Ground-truth sentiment labels derived from the "recommend" column codes.
y_glassdoor = test_df["recommend"].map({"v": "positive", "x": "negative", "o": "neutral"}).tolist()
reviews = test_df["review"].tolist()
PROJECT_NAME = "workmind-glassdoor"

# Evaluate the checkpoint this notebook fine-tuned and pushed (repo_name, built
# from model_name) instead of a hard-coded roberta-base repository, so the
# logged metrics belong to the model that was actually trained here.
analyzer = ClassificationSentimentAnalyzer(
    model_name=repo_name,
    class_labels=[
        BaseSentiment.NEGATIVE,
        BaseSentiment.NEUTRAL,
        BaseSentiment.POSITIVE,
    ],
    batch_size=16,
    hypothesis_template=None,
)

# The experiment name carries the real base model name rather than a stale literal.
with SentimentExperiment(
    analyzer,
    f" unfrozen 3 layers on 60k Glassdoor 1 epoch for {model_name} on Glassdoor Data",
    y_glassdoor,
    project_name=PROJECT_NAME,
) as experiment:
    experiment.evaluate(reviews)

**Evaluate cross-dataset**

In [None]:
# Load the generated e-mail dataset used for the cross-dataset evaluation.
gpt_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/gpt-dataset/gpt_dataset.csv",
)

In [None]:
# Inputs for the cross-dataset run: e-mail texts, their labels and the user ids.
gpt_emails = gpt_df["text"].tolist()
y_gpt_email = gpt_df["sentiment_label"].tolist()
user_ids = gpt_df["user_id"].tolist()
PROJECT_NAME = "workmind-email-data"

# Evaluate the checkpoint this notebook fine-tuned and pushed (repo_name, built
# from model_name) instead of a hard-coded roberta-base repository, so the
# logged metrics belong to the model that was actually trained here.
analyzer = ClassificationSentimentAnalyzer(
    model_name=repo_name,
    class_labels=[
        BaseSentiment.NEGATIVE,
        BaseSentiment.NEUTRAL,
        BaseSentiment.POSITIVE,
    ],
    batch_size=16,
    hypothesis_template=None,
)

# NOTE(review): this call passes `project=` while the Glassdoor evaluation passes
# `project_name=` — confirm which keyword SentimentExperiment actually accepts.
with SentimentExperiment(
    analyzer,
    f" unfrozen 3 layers on 60k Glassdoor 1 epoch for {model_name} on ChatGPT-o1 Generated Data",
    y_gpt_email,
    project=PROJECT_NAME,
) as experiment:
    experiment.evaluate(gpt_emails, user_ids)