In [None]:
!pip install evaluate peft==0.8.2 accelerate bitsandbytes -q

In [None]:
import re, regex, os, sys, warnings, random, gc, logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

import evaluate
from datasets import Dataset

import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    TaskType,
    get_peft_model
)

# --- Notebook-wide configuration ---

# Path prefix of the Kaggle input dataset
# (presumably consumed by the dataset reader — confirm).
INPUT_PATH = '/kaggle/input/translit-datasets/'

# Seed used as the default for every training run below.
SEED = 42
transformers.set_seed(SEED)

# Keep the output quiet: set TOKENIZERS_PARALLELISM to "false", ignore Python
# warnings, and raise the root logger threshold to WARNING.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.WARNING)

In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

# Log in to the Hugging Face Hub (needed for gemma-2b); the token is read
# from the Kaggle secret named "hf" and is never stored in a variable.
login(token=UserSecretsClient().get_secret("hf"))

In [None]:
# Number of training epochs; passed as `num_train_epochs` to each train_model call below.
EPOCHS = 5

# Dataset

## Transliterated Hindi Sentiment

In [None]:
# Insert appropriate dataset reading code from dataset_readers.py

def read_dataset():
    """Placeholder for a dataset reader copied from ``dataset_readers.py``.

    The rest of this notebook expects the real implementation to return a
    5-tuple ``(train_df, test_df, label_names, dataset_name, text_col)``
    where both frames carry a ``sentence`` text column and a categorical
    ``label`` column (its ``.cat.codes`` are used as class ids).

    Raises:
        NotImplementedError: always, until this placeholder is replaced.
            Failing here is clearer than returning ``None`` and letting the
            tuple unpacking in the next cell fail.
    """
    raise NotImplementedError(
        "read_dataset() is a placeholder: paste the appropriate reader from "
        "dataset_readers.py into this cell before running the notebook."
    )

In [None]:
train_df, test_df, label_names, dataset_name, text_col = read_dataset()

# Replace the categorical labels by their integer codes.
# NOTE(review): assumes train and test share the same category order, otherwise
# the codes of the two frames would not line up — confirm in the reader.
train_df['label'] = train_df['label'].cat.codes
test_df['label'] = test_df['label'].cat.codes

# "balanced" class weights keyed by the actual label code. Zipping with the
# classes (instead of enumerate) keeps the keys right even when some code is
# absent from the training split.
present_classes = np.unique(train_df['label'])
class_weights = dict(zip(
    present_classes.tolist(),
    compute_class_weight(
        class_weight="balanced",
        classes=present_classes,
        y=train_df['label']
    )
))

pd.set_option('max_colwidth', 200)
display(train_df.head())
display(test_df.head())

print(f'{len(train_df)=}, {len(test_df)=}')
print(label_names)

# minlength guarantees one bar per label name even if the highest-coded
# classes never occur in the training split.
plt.figure(figsize=(6,2))
plt.bar(x=label_names, height=np.bincount(train_df['label'], minlength=len(label_names)))
plt.title('Training label distribution')

# Gemma-2B

In [None]:
def train_model( 
    model_name,
    train_args: dict,
    seed: int = SEED,
    train_df: pd.DataFrame = train_df, 
    test_df: pd.DataFrame = test_df, 
    label_names: list[str] = label_names,
    save_model: bool = True,
    text_col: str = "sentence",
):
    """Fine-tune an 8-bit sequence classifier with LoRA and evaluate it.

    The model is loaded in 8-bit, wrapped with LoRA adapters (r=64, alpha=32,
    dropout=0.1, ``target_modules='all-linear'``), trained with the Hugging
    Face ``Trainer`` and scored on ``test_df``.

    Args:
        model_name: Hub id or path passed to ``from_pretrained``.
        train_args: Extra keyword arguments forwarded to ``TrainingArguments``.
        seed: Seed passed to ``transformers.set_seed``.
        train_df: Training frame with ``text_col`` and an integer ``label`` column.
        test_df: Test frame with the same columns.
        label_names: Class names; index ``i`` is the name of label id ``i``.
        save_model: If true, save the trained model under ``/kaggle/working/``.
        text_col: Name of the text column to lower-case and tokenize.

    Returns:
        ``(clf_report, conf_matrix)``: the text report from
        ``classification_report`` and the array from ``confusion_matrix``,
        both computed on ``test_df``.

    Side effects:
        Displays the evaluation metrics before and after training, writes
        checkpoints to ``./checkpoints/`` and removes that directory at the
        end. The input frames are not modified.
    """
    import shutil  # local import: only needed for the checkpoint cleanup

    checkpoint_dir = './checkpoints/'

    # Setup
    transformers.set_seed(seed)
    n_labels = len(label_names)
    label_ids = list(range(n_labels))
    id2label = {i:name for i, name in enumerate(label_names)}
    label2id = {name:i for i, name in enumerate(label_names)}
      
    ## Evaluation Metric
    metric = evaluate.load("f1")
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        return metric.compute(predictions=predictions, references=labels, average='weighted')  
    
    ## Get tokenizer and base model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=n_labels, id2label=id2label, label2id=label2id, load_in_8bit=True
    )
    # Padded batches need a pad token; fall back to EOS when the checkpoint
    # does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if base_model.config.pad_token_id is None:
        base_model.config.pad_token_id = tokenizer.pad_token_id

    ## Get LoRA model
    lora_model = get_peft_model(
        prepare_model_for_int8_training(base_model), 
        LoraConfig(
            r=64,
            lora_alpha=32,
            lora_dropout=0.1,
            task_type=TaskType.SEQ_CLS,
            target_modules='all-linear'
        )
    )
    
    # Make Dataset and tokenize    
    def preprocess_function(example):
        return tokenizer(example[text_col], truncation=True)
 
    # Lower-case on copies so the caller's frames (by default the notebook
    # globals) are left untouched.
    train_lower = train_df.assign(**{text_col: train_df[text_col].str.lower()})
    test_lower  =  test_df.assign(**{text_col:  test_df[text_col].str.lower()})
    
    tokenized_train = Dataset.from_pandas(train_lower, split='train').map(preprocess_function, batched=True)
    tokenized_test  = Dataset.from_pandas( test_lower, split='test' ).map(preprocess_function, batched=True)
        
    # Train and evaluate
    # NOTE(review): the test split doubles as eval_dataset and
    # load_best_model_at_end=True, so checkpoint selection sees the test data;
    # consider carving a validation split out of train_df instead.
    trainer = Trainer(
        model = lora_model,
        tokenizer = tokenizer,
        train_dataset = tokenized_train, 
        eval_dataset = tokenized_test,
        compute_metrics = compute_metrics,
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer),
        args = TrainingArguments(
            output_dir = checkpoint_dir,
            report_to = 'none',
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            save_total_limit = 1,
            load_best_model_at_end = True,
            **train_args
        )
    )
    
    display(trainer.evaluate())  # baseline before any training
    trainer.train()
    display(trainer.evaluate())
    
    predictions  = trainer.predict(tokenized_test)
    
    y_test, y_pred = test_df['label'], predictions.predictions.argmax(1)
    
    # Passing the full label set keeps the report rows and matrix shape
    # aligned with label_names even if a class is missing from the test data.
    clf_report  = classification_report(
        y_test, y_pred, labels=label_ids, target_names=label_names, digits=5
    )
    conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)
    
    if save_model:
        trainer.save_model(f"/kaggle/working/model_{model_name.replace('/', '-')}/")
        
    # Intermediate checkpoints are no longer needed once the model is saved.
    shutil.rmtree(checkpoint_dir, ignore_errors=True)
    return clf_report, conf_matrix

In [None]:
# Fine-tune Gemma-2B; keeps the text report and confusion matrix of this run.
clf_report, conf_matrix = train_model(
    model_name='google/gemma-2b',
    train_args={
        'num_train_epochs': EPOCHS,
        'learning_rate': 2e-5,
        'weight_decay': 0.01,
        'per_device_train_batch_size': 4,
        'logging_steps': 10,
    },
)

In [None]:
# Text report first, then the confusion-matrix plot of the latest run.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

# TinyLlama

In [None]:
def train_model( 
    model_name,
    train_args: dict,
    seed: int = SEED,
    train_df: pd.DataFrame = train_df, 
    test_df: pd.DataFrame = test_df, 
    label_names: list[str] = label_names,
    save_model: bool = True,
    text_col: str = "sentence",
):
    """Fine-tune an 8-bit LLaMA-style sequence classifier with LoRA and evaluate it.

    This redefines (and therefore shadows) the ``train_model`` from the
    Gemma-2B section. It additionally sets EOS as the padding token on both
    the tokenizer and the model config.

    Args:
        model_name: Hub id or path passed to ``from_pretrained``.
        train_args: Extra keyword arguments forwarded to ``TrainingArguments``.
        seed: Seed passed to ``transformers.set_seed``.
        train_df: Training frame with ``text_col`` and an integer ``label`` column.
        test_df: Test frame with the same columns.
        label_names: Class names; index ``i`` is the name of label id ``i``.
        save_model: If true, save the trained model under ``/kaggle/working/``.
        text_col: Name of the text column to lower-case and tokenize.

    Returns:
        ``(clf_report, conf_matrix)``: the text report from
        ``classification_report`` and the array from ``confusion_matrix``,
        both computed on ``test_df``.

    Side effects:
        Displays the evaluation metrics before and after training, writes
        checkpoints to ``./checkpoints/`` and removes that directory at the
        end. The input frames are not modified.
    """
    import shutil  # local import: only needed for the checkpoint cleanup

    checkpoint_dir = './checkpoints/'

    # Setup
    transformers.set_seed(seed)
    n_labels = len(label_names)
    label_ids = list(range(n_labels))
    id2label = {i:name for i, name in enumerate(label_names)}
    label2id = {name:i for i, name in enumerate(label_names)}
      
    ## Evaluation Metric
    metric = evaluate.load("f1")
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        return metric.compute(predictions=predictions, references=labels, average='weighted')  
    
    ## Get model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=n_labels, id2label=id2label, label2id=label2id, load_in_8bit=True
    )
    
    # Make Dataset and tokenize    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Reuse EOS as the padding token so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token  # llama only
    model.config.pad_token_id = model.config.eos_token_id
        
    def preprocess_function(example):
        return tokenizer(example[text_col], truncation=True)
    
    # Lower-case on copies so the caller's frames (by default the notebook
    # globals) are left untouched.
    train_lower = train_df.assign(**{text_col: train_df[text_col].str.lower()})
    test_lower  =  test_df.assign(**{text_col:  test_df[text_col].str.lower()})
    
    tokenized_train = Dataset.from_pandas(train_lower, split='train').map(preprocess_function, batched=True)
    tokenized_test  = Dataset.from_pandas( test_lower, split='test' ).map(preprocess_function, batched=True)
        
    # Train and evaluate
    lora_model = get_peft_model(
        prepare_model_for_int8_training(model), 
        LoraConfig(
            r=64,
            lora_alpha=32,
            lora_dropout=0.1,
            task_type=TaskType.SEQ_CLS,
            target_modules='all-linear'
        )
    )
    # NOTE(review): the test split doubles as eval_dataset and
    # load_best_model_at_end=True, so checkpoint selection sees the test data;
    # consider carving a validation split out of train_df instead.
    trainer = Trainer(
        model = lora_model,
        tokenizer = tokenizer,
        train_dataset = tokenized_train, 
        eval_dataset = tokenized_test,
        compute_metrics = compute_metrics,
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer),
        args = TrainingArguments(
            output_dir = checkpoint_dir,
            report_to = 'none',
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            save_total_limit = 1,
            load_best_model_at_end = True,
            **train_args
        )
    )
    
    display(trainer.evaluate())  # baseline before any training
    trainer.train()
    display(trainer.evaluate())
    
    predictions  = trainer.predict(tokenized_test)
    
    y_test, y_pred = test_df['label'], predictions.predictions.argmax(1)
    
    # Passing the full label set keeps the report rows and matrix shape
    # aligned with label_names even if a class is missing from the test data.
    clf_report  = classification_report(
        y_test, y_pred, labels=label_ids, target_names=label_names, digits=5
    )
    conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)
    
    if save_model:
        trainer.save_model(f"/kaggle/working/model_{model_name.replace('/', '-')}/")
        
    # Intermediate checkpoints are no longer needed once the model is saved.
    shutil.rmtree(checkpoint_dir, ignore_errors=True)
    return clf_report, conf_matrix

In [None]:
# Fine-tune TinyLlama; overwrites clf_report / conf_matrix with this run's results.
clf_report, conf_matrix = train_model(
    model_name="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    train_args={
        'num_train_epochs': EPOCHS,
        'learning_rate': 2e-5,
        'weight_decay': 0.01,
        'per_device_train_batch_size': 4,
        'logging_steps': 10,
    },
)

In [None]:
# Text report first, then the confusion-matrix plot of the latest run.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()