### what we will do:
    - full fine tune 
    - analysis layers to  know which layers i can use LoRa with
    - test another ways.


## Note: this notebook for test and experiment different approaches not for deploy, when we decide which approach to take for deploying the code will be written in a different way

In [1]:
!pip install bitsandbytes==0.43.1
!pip install accelerate==0.30.1
!pip install peft==0.11.1
!pip install transformers==4.41.2

Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1
Collecting accelerate==0.30.1
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.30.1
Collecting peft==0.11.1
  Downloading p

# Import Libraries

In [2]:
import os
import random
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import torch
import datasets
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

# When using PEFT, comment out the below line.
from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model, PeftModel, PeftConfig

2024-06-23 20:32:27.643921: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-23 20:32:27.644074: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-23 20:32:27.767683: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Config

In [3]:
class CFG:
    n_labels = 6
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    
    model_ckpt = 'microsoft/deberta-v3-base' # When 'INTERNET ON'
    
    # ----- Training params -----
    max_input_length = 2000
    use_peft = False
    rank = 8
    
    freeze = {
        "embedding" : False,
        "n_layers_encoder" : 0 # max = 12
    }
    

    learning_rate = 5.0e-5
    warmup_ratio = 0.1
    grad_accum_steps = 4
    fp16 = True
    
    n_folds = 5
    n_epochs = 3
    train_batch_size = 4
    eval_batch_size = 2
    
    steps = 100
    


# Prepare Data

In [4]:
DATA_DIR = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
df = pd.read_csv(DATA_DIR + 'train.csv')

# score: [1,2,3,4,5,6] -> label: [0,1,2,3,4,5]
df['label'] = df['score'].apply(lambda x: int(x - 1)).astype('uint8')

In [5]:
# sample of the data for debuging

df = df.sample(10000).reset_index(drop=True)

In [6]:
df['label'].value_counts()

label
2    3568
1    2735
3    2337
0     729
4     543
5      88
Name: count, dtype: int64

# Train Model

In [7]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_ckpt)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize(batch):
    tokenized_inputs = tokenizer(
        batch['full_text'],
        padding=False,
        truncation=True,
        max_length=CFG.max_input_length,
    )
    return tokenized_inputs



# return the original model or model with freezed layers or peft or second and third
def model_init():
    ### Load model from checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_ckpt,
        num_labels=CFG.n_labels,
    ).to(CFG.device)
    
    # Freeze layers
    if CFG.freeze['embedding']:
        # embedding layer
        for param in model.base_model.embeddings.parameters():
            param.requires_grad = False
            
    
    if CFG.freeze['n_layers_encoder'] > 0:
        # each encoder layer
        for i in range(CFG.freeze['n_layers_encoder']):
            for param in model.base_model.encoder.layer[i].parameters():
                param.requires_grad = False
                
    
    
    # Create PEFT (LoRA) model
    if CFG.use_peft:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            use_rslora=True,
            r=CFG.rank,
            lora_alpha=8,
            lora_dropout=0,
           # target_modules = "classifier"
        )
        model = get_peft_model(model, peft_config)
        
    
    return model


def compute_metrics(outputs):
    predictions, labels = outputs
    preds = np.argmax(predictions, axis=-1)
#     print(f"Predictions: {preds[:10]}")
#     print(f"Labels: {labels[:10]}")
    qwk = cohen_kappa_score(
        y1=labels, y2=preds,
        labels=range(CFG.n_labels),
        weights='quadratic'
    )
    return {'qwk': qwk}


def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [8]:
def print_trainable_params(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad == True:
            trainable_params += param.numel()
    
    print(f"trainable parameters: {trainable_params}, all parameters: {all_params}, ratio: {100 * trainable_params / all_params}%")

In [9]:
model = model_init()
print_trainable_params(model)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 184426758, all parameters: 184426758, ratio: 100.0%


In [10]:
# test trainer with custom loss function /// comming versions
"""
def qwk_obj(y_true, y_pred):
    labels = y_true + a
    preds = y_pred + a
    preds = preds.clip(1, 6)
    f = 1/2*np.sum((preds-labels)**2)
    g = 1/2*np.sum((preds-a)**2+b)
    df = preds - labels
    dg = preds - a
    grad = (df/g - f*dg/g**2)*len(labels)
    hess = np.ones(len(labels))
    return grad, hess
a = 2.948
b = 1.092
"""

'\ndef qwk_obj(y_true, y_pred):\n    labels = y_true + a\n    preds = y_pred + a\n    preds = preds.clip(1, 6)\n    f = 1/2*np.sum((preds-labels)**2)\n    g = 1/2*np.sum((preds-a)**2+b)\n    df = preds - labels\n    dg = preds - a\n    grad = (df/g - f*dg/g**2)*len(labels)\n    hess = np.ones(len(labels))\n    return grad, hess\na = 2.948\nb = 1.092\n'

In [11]:
### Set seed
seed_everything(CFG.seed)

def train_loop():
    
    skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (tr_idx, va_idx) in enumerate(skf.split(df, df['label'])):
        # Split train/valid
        df_train = df.loc[tr_idx, ['full_text', 'label']].copy()
        df_valid = df.loc[va_idx, ['full_text', 'label']].copy()
        print('#'*25, f"Fold {fold}", '#'*25)
        # Prepare PyArrow dataset
        ds_train = datasets.Dataset.from_pandas(df_train)
        ds_valid = datasets.Dataset.from_pandas(df_valid)
        # Tokenize
        tokenized_ds_train = ds_train.map(tokenize, batched=True, batch_size=None)
        tokenized_ds_valid = ds_valid.map(tokenize, batched=True, batch_size=None)
        # Convert dataset's format: List -> Torch
        tokenized_ds_train.set_format('torch')
        tokenized_ds_valid.set_format('torch')
        
        # the model 
        model = model_init()
        print_trainable_params(model)
        print("----" * 50)
        
        # Train
        training_args = TrainingArguments(
            output_dir='/kaggle/temp/',
            overwrite_output_dir=True,
            learning_rate=CFG.learning_rate,
            warmup_ratio=CFG.warmup_ratio,
            num_train_epochs=CFG.n_epochs,
            per_device_train_batch_size=CFG.train_batch_size,
            per_device_eval_batch_size=CFG.eval_batch_size,
            gradient_accumulation_steps=CFG.grad_accum_steps,
            gradient_checkpointing=True,
            fp16=CFG.fp16,
            logging_strategy='steps',
            logging_steps=CFG.steps,
            evaluation_strategy='steps',
            eval_steps=CFG.steps,
            save_strategy='steps',
            save_steps=CFG.steps,
            save_total_limit=1,
            load_best_model_at_end=True,
            report_to='none',
            seed=CFG.seed,
            )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_ds_train,
            eval_dataset=tokenized_ds_valid,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
     
        trainer.train()

        # test in one fold with
        if fold == 0:
            break
            


In [12]:
# full fine tune
train_loop()

######################### Fold 0 #########################


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 184426758, all parameters: 184426758, ratio: 100.0%
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Step,Training Loss,Validation Loss,Qwk
100,1.507,1.157634,0.638907
200,1.0748,1.177194,0.648296
300,1.0714,0.986136,0.681944
400,1.0281,0.958416,0.712262
500,0.9417,0.909626,0.76046
600,0.9215,0.880848,0.759442
700,0.8776,0.905393,0.724378
800,0.8846,0.92636,0.747527
900,0.8149,0.91204,0.774003
1000,0.866,0.825389,0.802731


In [13]:
# LoRa
CFG.peft = True
CFG.rank = 24
train_loop()

######################### Fold 0 #########################


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 184426758, all parameters: 184426758, ratio: 100.0%
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Step,Training Loss,Validation Loss,Qwk
100,1.522,1.127205,0.68395
200,1.1134,1.125564,0.592252
300,1.0585,1.018996,0.647989
400,1.0293,0.93076,0.714882
500,0.9441,0.859924,0.774824
600,0.8792,0.859044,0.766808
700,0.8506,0.868631,0.760578
800,0.8331,0.888064,0.771955
900,0.8074,0.880019,0.770565
1000,0.8338,0.806544,0.804695


In [14]:
CFG.peft = False

# freeze embedding params
CFG.freeze['embedding'] = True
CFG.freeze['n_layers_encoder'] = 0

train_loop()

######################### Fold 0 #########################


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 86044422, all parameters: 184426758, ratio: 46.655064011915236%
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Step,Training Loss,Validation Loss,Qwk
100,1.5224,1.109545,0.686957
200,1.0925,1.230584,0.550941
300,1.0458,1.0192,0.662426
400,1.0468,0.96176,0.682913
500,0.9341,0.91071,0.760268
600,0.9167,0.866158,0.761999
700,0.8568,0.86992,0.762259
800,0.836,0.870345,0.779136
900,0.808,0.863416,0.77049
1000,0.8408,0.808451,0.796308


## which layers are used much in full fine-tune ?

In [15]:
# i ask the question to know which layers we can work with on fine tune with LoRa but now i think we do not need to answer it.
# when i test LoRa with all the layers and with Different rank i got a bad results, i think the reason is: 
# the LLM in our case is small and the task may be differ than the model train on and may be other reasons else so i think it's need the Full fine-tune.