Thank you for visiting my notebook!<br>

This notebook is a DeBERTa starter and a simple demonstration of fine-tuning for those new to Hugging Face language models. If you have any questions, please comment!<br>

#### Summary:
* MODEL: [deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base)
* PROBLEM: 6-class classification
* TRAINING: Stratified K-fold split (4 folds); uncomment `break` in the training loop to fall back to a single hold-out fold

#### Notes:
The following code is probably not the best way to go because it's just a simple baseline. As described in the last section (*What You Can Do Next*), there is plenty of room to improve the score.<br>


Enjoy, kagglers!

# Prepare for Offline Training

In [None]:
# Install the required packages from wheel files stored under /kaggle/input,
# so that no package index has to be reached.
# Alternative (older) PEFT wheel, currently disabled:
# !pip install /kaggle/input/lal-scoring-wheels/peft-0.10.0-py3-none-any.whl
!pip install -q /kaggle/input/bnb-to-load-transformers-models/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
!pip install -q /kaggle/input/bnb-to-load-transformers-models/accelerate-0.30.1-py3-none-any.whl
!pip install /kaggle/input/bnb-to-load-transformers-models/peft-0.11.1-py3-none-any.whl
# Pinned transformers wheel, currently disabled:
#!pip install -q /kaggle/input/bnb-to-load-transformers-models/transformers-4.41.2-py3-none-any.whl

# Import Libraries

In [None]:
import os
import random
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import torch
import datasets
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

# Only needed when CFG.use_peft is True; comment out the line below if PEFT is not installed.
from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model, PeftModel, PeftConfig

# Config

In [None]:
class CFG:
    """Central place for every tunable setting used by the notebook."""

    # ----- General -----
    seed = 1        # shared by the fold split, the RNG seeding helper and the Trainer
    n_labels = 6    # number of classes (one per essay score)
    # Prefer the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ----- Model checkpoint -----
    # Alternatives that can be swapped in:
    #   '/kaggle/input/deberta-v3-for-offline/base'
    #   'microsoft/deberta-v3-base'   (when 'INTERNET ON')
    model_ckpt = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base'

    # ----- Tokenization -----
    max_input_length = 2000   # texts are truncated to this many tokens

    # ----- Model surgery -----
    use_peft = False   # wrap the model in a LoRA adapter when True
    n_freeze = None    # number of leading encoder layers to freeze (None: freeze nothing)

    # ----- Cross-validation -----
    n_folds = 4

    # ----- Optimisation -----
    n_epochs = 2
    learning_rate = 5e-5
    warmup_ratio = 0.1
    train_batch_size = 4
    eval_batch_size = 1
    grad_accum_steps = 4
    fp16 = True

    # ----- Logging / evaluation / checkpoint interval (in steps) -----
    steps = 200


# Prepare Data

In [None]:
# Root folder of the competition files.
DATA_DIR = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
df = pd.read_csv(f'{DATA_DIR}train.csv')

# Shift the scores down by one so class ids start at zero:
# score [1,2,3,4,5,6] -> label [0,1,2,3,4,5]
df['label'] = df['score'].map(lambda score: int(score - 1)).astype('uint8')

In [None]:
# Show how many essays fall into each label.
df['label'].value_counts()

# Train Model

In [None]:
# Tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_ckpt)
# Collator handed to the Trainer; it pads the examples of a batch, since
# tokenize() below is called with padding=False.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize(batch):
    """Tokenize the ``full_text`` field of ``batch``.

    Sequences are cut off at ``CFG.max_input_length`` tokens and are not
    padded here.

    Args:
        batch: Mapping with a ``'full_text'`` entry holding the raw text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    return tokenizer(
        batch['full_text'],
        max_length=CFG.max_input_length,
        truncation=True,
        padding=False,
    )


def model_init():
    """Build the classification model described by ``CFG``.

    Steps:
      1. Load ``CFG.model_ckpt`` with a ``CFG.n_labels``-way classification
         head and move it to ``CFG.device``.
      2. If ``CFG.n_freeze`` is set, freeze the embedding layer and the first
         ``CFG.n_freeze`` encoder layers.
      3. If ``CFG.use_peft`` is set, wrap the model in a LoRA adapter.

    Returns:
        The model, PEFT-wrapped when ``CFG.use_peft`` is true.
    """
    net = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_ckpt, num_labels=CFG.n_labels
    )
    net = net.to(CFG.device)

    # --- Optional layer freezing ---
    if CFG.n_freeze is not None:
        backbone = net.base_model
        # The embedding layer first, then each of the first n_freeze encoder layers.
        frozen_modules = [backbone.embeddings]
        frozen_modules += [backbone.encoder.layer[idx] for idx in range(CFG.n_freeze)]
        for module in frozen_modules:
            for weight in module.parameters():
                weight.requires_grad = False

    if not CFG.use_peft:
        return net

    # --- Optional PEFT (LoRA) wrapper ---
    # LoRA weights are initialised with LoftQ (4 bits) and rank-stabilised scaling.
    # Note: target_modules='all-linear' is a further option that is left disabled.
    lora_settings = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        init_lora_weights='loftq',
        loftq_config=LoftQConfig(loftq_bits=4),
        use_rslora=True,
        r=16,
        lora_alpha=8,
        lora_dropout=0,
    )
    return get_peft_model(net, lora_settings)


def compute_metrics(outputs):
    """Score an evaluation pass with quadratic weighted kappa (QWK).

    Args:
        outputs: Pair ``(predictions, labels)``; the class with the highest
            value along the last axis of ``predictions`` is taken as the
            predicted label.

    Returns:
        ``{'qwk': <score>}`` computed over the label set
        ``0 .. CFG.n_labels - 1``.
    """
    logits, y_true = outputs
    y_pred = np.argmax(logits, axis=-1)
    score = cohen_kappa_score(
        y_true,
        y_pred,
        labels=range(CFG.n_labels),
        weights='quadratic',
    )
    return {'qwk': score}


def seed_everything(seed: int):
    """Seed Python, NumPy and PyTorch and request deterministic cuDNN.

    Args:
        seed: Value applied to every random number generator touched here;
            it is also exported as ``PYTHONHASHSEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every one of these takes the seed as its only argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Turn off the cuDNN autotuner and ask for deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


In [None]:

def print_trainable_params(model):
    """Print the number of trainable parameters, the total, and their ratio.

    Args:
        model: Object exposing ``named_parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    The ratio is reported as ``0.0`` for a model without any parameters
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division so an empty model does not crash the report.
    ratio = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable parameters: {trainable_params}, all parameters: {all_params}, ratio: {ratio}%")



In [None]:
# Build the model according to CFG and report how many of its parameters
# will be updated during training.
model = model_init()
print_trainable_params(model)

In [None]:
#print(model)

In [None]:
### Set seed
seed_everything(CFG.seed)

### Cross Validation
# Stratified split on the label so every fold keeps the class proportions.
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for fold, (tr_idx, va_idx) in enumerate(skf.split(df, df['label'])):
    # Split train/valid
    df_train = df.loc[tr_idx, ['full_text', 'label']].copy()
    df_valid = df.loc[va_idx, ['full_text', 'label']].copy()
    print('#'*25, f"Fold {fold}", '#'*25)
    # Start every fold from a freshly loaded model. Reusing the model trained
    # in an earlier fold would mean it has already been fitted on rows that
    # belong to this fold's validation split, which inflates the fold score.
    # Drop the previous model/trainer first so their memory can be released
    # before the new model is allocated.
    model = trainer = None
    torch.cuda.empty_cache()
    # Re-seed so each fold's initialisation does not depend on how many
    # folds ran before it.
    seed_everything(CFG.seed)
    model = model_init()
    # Prepare PyArrow dataset
    ds_train = datasets.Dataset.from_pandas(df_train)
    ds_valid = datasets.Dataset.from_pandas(df_valid)
    # Tokenize
    tokenized_ds_train = ds_train.map(tokenize, batched=True, batch_size=None)
    tokenized_ds_valid = ds_valid.map(tokenize, batched=True, batch_size=None)
    # Convert dataset's format: List -> Torch
    tokenized_ds_train.set_format('torch')
    tokenized_ds_valid.set_format('torch')
    # Train
    # Logging, evaluation and checkpointing all happen every CFG.steps steps.
    training_args = TrainingArguments(
        output_dir='/kaggle/temp/',
        overwrite_output_dir=True,
        learning_rate=CFG.learning_rate,
        warmup_ratio=CFG.warmup_ratio,
        num_train_epochs=CFG.n_epochs,
        per_device_train_batch_size=CFG.train_batch_size,
        per_device_eval_batch_size=CFG.eval_batch_size,
        gradient_accumulation_steps=CFG.grad_accum_steps,
        gradient_checkpointing=True,
        fp16=CFG.fp16,
        logging_strategy='steps',
        logging_steps=CFG.steps,
        evaluation_strategy='steps',
        eval_steps=CFG.steps,
        save_strategy='steps',
        save_steps=CFG.steps,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to='none',
        seed=CFG.seed,
        )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds_train,
        eval_dataset=tokenized_ds_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    #model.config.use_cache = False 
    trainer.train()
    
    # All folds are trained by default; `trainer` and `model` refer to the
    # last fold once the loop has finished.
    # Uncomment 'break' to train only the first fold and save time.
    #break

In [None]:
#trainer.save_model("peft_model")

# Infer Test Data

In [None]:
# Prepare test data
df_test = pd.read_csv(DATA_DIR + 'test.csv')
# Only the essay text is passed on to the tokenizer.
ds_test = datasets.Dataset.from_pandas(df_test[['full_text']])
# Same tokenize() function as for the training data.
tokenized_ds_test = ds_test.map(tokenize, batched=True, batch_size=None)
tokenized_ds_test.set_format('torch')



In [None]:
# Predict
# `trainer` is the one left over from the training loop above.
outputs = trainer.predict(tokenized_ds_test)
# Softmax turns the raw outputs into per-class probabilities; it does not
# change which class has the highest value.
predictions = torch.softmax(torch.from_numpy(outputs.predictions), dim=-1).numpy()
# Predicted label = index of the most probable class.
preds = np.argmax(predictions, axis=-1)

# Submit

In [None]:
# Don't forget to add 1 to preds
# label: [0,1,2,3,4,5] -> score: [1,2,3,4,5,6]
df_test['score'] = preds + 1
# The essay text is not part of the submission.
df_test.drop(columns=['full_text'],inplace=True)
# Write the essay_id / score pairs without the DataFrame index.
df_test[['essay_id', 'score']].to_csv('submission.csv', index=False)

In [None]:
# Check the column dtypes of the submission frame.
df_test.dtypes

In [None]:
# Display the submission frame.
df_test

# What You Can Do Next

#### Tips for improving score:
* Cross-validate and average those predictions
* Tune hyperparameters (epoch, batch size, learning rate, etc.)
* Freeze layers
* Try *PEFT* for fast/cost-efficient training
* Try a larger model such as deberta-v3-*large*
* Try a *regression* model instead of a classification model

etc.