In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pandas as pd
import numpy as np
# Raw Glassdoor reviews export from the attached Kaggle input dataset.
GLASSDOOR_CSV_PATH = '/kaggle/input/glassdoor-job-reviews/glassdoor_reviews.csv'
df = pd.read_csv(GLASSDOOR_CSV_PATH)


In [None]:
import subprocess

from kaggle_secrets import UserSecretsClient

# The GitHub personal access token comes from Kaggle secrets so it never
# appears in the notebook source.
user_secrets = UserSecretsClient()
PAT = user_secrets.get_secret("pat")

GITHUB_USERNAME = "vladkisin"
REPO_NAME = "workmind-dev"
REPO_URL = f"https://{GITHUB_USERNAME}:{PAT}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
# Clone into an explicit directory so the clone target and the chdir target
# always agree, whatever the current working directory is.
REPO_DIR = os.path.join("/kaggle/working", REPO_NAME)

# Skip the clone when the checkout already exists so the cell can be re-run.
if not os.path.isdir(REPO_DIR):
    # Pass an argument list (no shell) so the token-bearing URL is never parsed
    # by a shell. The exit status is checked by hand rather than with
    # check=True, because CalledProcessError would echo the full command
    # (including the token) into the notebook output.
    clone_result = subprocess.run(["git", "clone", REPO_URL, REPO_DIR])
    if clone_result.returncode != 0:
        raise RuntimeError(
            f"git clone of {GITHUB_USERNAME}/{REPO_NAME} failed "
            f"with exit code {clone_result.returncode}"
        )

# Later cells import project packages relative to the repository root.
os.chdir(REPO_DIR)

In [None]:
! pip install -U -r requirements.txt --quiet

In [None]:
from data.utils import preprocess_and_split_gd

# Split the raw Glassdoor frame into train / validation / test partitions.
train_df, val_df, test_df = preprocess_and_split_gd(df)

# Report the size of each partition, one per line.
print(
    f"Train size: {len(train_df)}",
    f"Validation size: {len(val_df)}",
    f"Test size: {len(test_df)}",
    sep="\n",
)

In [None]:
# Experiment tooling from the cloned repository plus Weights & Biases.
# NOTE(review): later cells also use `ConfigKeys`,
# `AdapterClassificationSentimentAnalyzer` and `BaseSentiment`, none of which is
# imported in the visible cells — add imports from their defining modules.
from sentiment import get_analyzer
from experiment.config import MODELS_CONFIG
from experiment.wandb.sentiment import SentimentExperiment
import wandb
# Log in to W&B with the key stored in Kaggle secrets; the secret is passed
# straight through rather than bound to a variable.
wandb.login(key=user_secrets.get_secret("wandb_pat"))

In [None]:
# Ground-truth sentiment labels for the Glassdoor test split, translated from
# the 'recommend' codes, and the review texts to classify.
y_glassdoor = test_df['recommend'].map({'v': 'positive', 'x': 'negative', 'o': 'neutral'}).tolist()
reviews = test_df['review'].tolist()
PROJECT_NAME = "workmind-glassdoor"

# Run every configured model over the same test reviews and log one experiment
# per model.
# NOTE(review): `ConfigKeys` is not imported in any visible cell — import it
# from its defining module before running this loop.
for model_card, config in MODELS_CONFIG.items():
    analyzer = get_analyzer(
        inference_type=config[ConfigKeys.INFERENCE_TYPE],
        model_name=model_card,
        class_labels=config[ConfigKeys.CLASS_LABELS],
        batch_size=config[ConfigKeys.BATCH_SIZE],
        hypothesis_template=config[ConfigKeys.HYPOTHESIS_TEMPLATE]
    )

    print(f"Running {model_card} on Glassdoor data")

    with SentimentExperiment(
        analyzer,
        config[ConfigKeys.EXPERIMENT_NAME] + " on Glassdoor Data",
        y_glassdoor,
        project_name=PROJECT_NAME
    ) as experiment:
        # Only the reviews are passed: `user_ids` is first defined in the
        # cross-dataset cell from the GPT email data, so using it here raises
        # NameError on a fresh kernel and would not line up with these reviews.
        experiment.evaluate(reviews)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
from tuners.adapter import AdapterFineTuner
from adapters import AdapterTrainer

In [None]:
model_name = "roberta-large"
SEED = 42  # fixed so the sub-samples below are identical on every run
EVAL_SAMPLE_SIZE = 1500

# Fine-tune on a 20% sub-sample of the training split. Both samples are seeded
# so a Restart & Run All reproduces the same rows.
train_sample = train_df.sample(frac=0.2, random_state=SEED)
# Cap the request at the split size: DataFrame.sample raises when asked for
# more rows than exist (without replacement).
val_sample = val_df[['text', 'label']].sample(
    n=min(EVAL_SAMPLE_SIZE, len(val_df)),
    random_state=SEED,
)

train_dataset = Dataset.from_dict(train_sample[['text', 'label']].to_dict(orient="list"))
eval_dataset = Dataset.from_dict(val_sample.to_dict(orient="list"))


tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_data(batch):
    """Tokenize the ``text`` field of a batch to a fixed sequence length.

    Every example is padded and truncated to the tokenizer's
    ``model_max_length``, capped at 1024 tokens. Returns whatever the
    module-level ``tokenizer`` returns for the batch.
    """
    sequence_length = min(tokenizer.model_max_length, 1024)
    encoded = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=sequence_length,
    )
    return encoded

# Tokenize both splits in batches; each call replaces the dataset with its
# mapped counterpart.
train_dataset = train_dataset.map(function=preprocess_data, batched=True)
eval_dataset = eval_dataset.map(function=preprocess_data, batched=True)

**Fine-tune**

In [None]:
# Class index -> sentiment name passed to the fine-tuner.
id2label = dict(enumerate(['negative', 'neutral', 'positive']))

adapter_tuner = AdapterFineTuner(
    model_name_or_path=model_name,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    adapter_name="sentiment-head",
    num_labels=len(id2label),
    id2label=id2label,
    learning_rate=3e-4,
    num_train_epochs=1,
    train_batch_size=8,
    eval_steps=200,
)
adapter_tuner.prepare_model()
adapter_tuner.train(trainer_class=AdapterTrainer)

# Publish the trained adapter. The "glasdoor" spelling is kept on purpose: the
# evaluation cells load the adapter under exactly this name.
hub_adapter_repo = f"{model_name.replace('/', '-')}-glasdoor-cls-40k"
adapter_tuner.model.push_adapter_to_hub(hub_adapter_repo, 'sentiment-head')

**Evaluate on the same dataset (Glassdoor test split)**

In [None]:
# Ground-truth labels and review texts for the Glassdoor test split
# (recomputed here so this cell does not depend on the zero-shot loop cell).
y_glassdoor = test_df['recommend'].map({'v': 'positive', 'x': 'negative', 'o': 'neutral'}).tolist()
reviews = test_df['review'].tolist()
PROJECT_NAME = "workmind-glassdoor"


# Load the adapter published by the fine-tuning cell on top of roberta-large.
# NOTE(review): `AdapterClassificationSentimentAnalyzer` and `BaseSentiment`
# are not imported in any visible cell — import them before running.
analyzer = AdapterClassificationSentimentAnalyzer(
    model_name='roberta-large',
    adapter_name='uladzislauk/roberta-large-glasdoor-cls-40k',
    class_labels=[BaseSentiment.NEGATIVE,
                  BaseSentiment.NEUTRAL,
                  BaseSentiment.POSITIVE],
    batch_size=16
)

# NOTE(review): this call passes `project=` while the zero-shot loop passes
# `project_name=` — confirm which keyword SentimentExperiment accepts.
with SentimentExperiment(
    analyzer,
    "adapter tuned on 40k Glassdoor 1 epoch roberta-large" + " on Glassdoor Data",
    y_glassdoor,
    project=PROJECT_NAME
) as experiment:
    # Pass the `reviews` list defined above; the original referenced the
    # undefined name `review`, which raises NameError.
    experiment.evaluate(reviews)

**Evaluate cross-dataset (GPT-generated email data)**

In [None]:
# GPT-generated dataset used for the cross-dataset evaluation.
GPT_DATASET_CSV_PATH = '/kaggle/input/gpt-dataset/gpt_dataset.csv'
gpt_df = pd.read_csv(GPT_DATASET_CSV_PATH)

In [None]:
# Texts, ground-truth labels and per-row user ids from the GPT-generated data.
gpt_emails = gpt_df['text'].tolist()
y_gpt_email = gpt_df['sentiment_label'].tolist()
user_ids = gpt_df['user_id'].tolist()
# Must be a string literal: unquoted, `workmind-email-data` is parsed as a
# subtraction of three undefined names and raises NameError.
PROJECT_NAME = "workmind-email-data"

# Same Glassdoor-tuned adapter as in the in-domain evaluation, now applied to
# a different dataset.
# NOTE(review): `AdapterClassificationSentimentAnalyzer` and `BaseSentiment`
# are not imported in any visible cell — import them before running.
analyzer = AdapterClassificationSentimentAnalyzer(
    model_name='roberta-large',
    adapter_name='uladzislauk/roberta-large-glasdoor-cls-40k',
    class_labels=[BaseSentiment.NEGATIVE,
                  BaseSentiment.NEUTRAL,
                  BaseSentiment.POSITIVE],
    batch_size=16
)

# NOTE(review): this call passes `project=` while the zero-shot loop passes
# `project_name=` — confirm which keyword SentimentExperiment accepts.
with SentimentExperiment(
    analyzer,
    "adapter tuned on 40k Glassdoor 1 epoch roberta-large" + " on ChatGPT-o1 Generated Data",
    y_gpt_email,
    project=PROJECT_NAME
) as experiment:
    experiment.evaluate(gpt_emails, user_ids)