### what we will do:
    - full fine tune 
    - analysis layers to  know which layers i can use LoRa with
    - test another ways.


In [1]:
!pip install bitsandbytes==0.43.1
!pip install accelerate==0.30.1
!pip install peft==0.11.1
!pip install transformers==4.41.2

Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1
Collecting accelerate==0.30.1
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.30.1
Collecting peft==0.11.1
  Dow

# Import Libraries

In [2]:
import os
import random
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import torch
import datasets
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

# When using PEFT, comment out the below line.
from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model, PeftModel, PeftConfig

2024-06-22 07:32:47.649548: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-22 07:32:47.649679: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-22 07:32:47.760641: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Config

In [3]:
class CFG:
    n_labels = 6
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    
    # ----- Model checkpoint -----
    #model_ckpt = '/kaggle/input/deberta-v3-for-offline/base'
    #model_ckpt = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base'
    model_ckpt = 'microsoft/deberta-v3-base' # When 'INTERNET ON'
    
    # ----- Training params -----
    max_input_length = 2000
    use_peft = False
    rank = 32
    n_freeze = False
    n_folds = 4 
    learning_rate = 5.0e-5
    warmup_ratio = 0.1
    n_epochs = 2
    train_batch_size = 4
    eval_batch_size = 2
    grad_accum_steps = 4
    steps = 200
    fp16 = True


# Prepare Data

In [4]:
DATA_DIR = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
df = pd.read_csv(DATA_DIR + 'train.csv')

# score: [1,2,3,4,5,6] -> label: [0,1,2,3,4,5]
df['label'] = df['score'].apply(lambda x: int(x - 1)).astype('uint8')

In [5]:
# sample of the data for debuging

df = df.sample(4000).reset_index(drop=True)

In [6]:
df.head()

Unnamed: 0,essay_id,full_text,score,label
0,a39e44d,Do you think that the face was created by alie...,2,1
1,250a96a,"In 1976, NASA's Viking 1 spacecraft snapped a ...",3,2
2,3b20650,There are numerous mysteries of the world that...,6,5
3,18327f9,I think that smart cars should not be on the r...,2,1
4,a3e37e8,It's come to my attention that more and more p...,4,3


In [7]:
df['label'].value_counts()

label
2    1468
1    1089
3     873
0     297
4     232
5      41
Name: count, dtype: int64

# Train Model

In [8]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_ckpt)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize(batch):
    tokenized_inputs = tokenizer(
        batch['full_text'],
        padding=False,
        truncation=True,
        max_length=CFG.max_input_length,
    )
    return tokenized_inputs


# return the original model or model with freezed layers or peft or second and third
def model_init():
    ### Load model from checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_ckpt,
        num_labels=CFG.n_labels,
    ).to(CFG.device)
    
    # Freeze layers
    if CFG.n_freeze:
        # embedding layer
        for param in model.base_model.embeddings.parameters():
            param.requires_grad = False
            
        # eack encoder layer
        for i in range(CFG.n_freeze):
            for param in model.base_model.encoder.layer[i].parameters():
                param.requires_grad = False
                
    
    
    # Create PEFT (LoRA) model
    if CFG.use_peft:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            use_rslora=True,
            r=CFG.rank,
            lora_alpha=8,
            lora_dropout=0,
           # target_modules = "classifier"
        )
        model = get_peft_model(model, peft_config)
        
    
    return model


def compute_metrics(outputs):
    predictions, labels = outputs
    preds = np.argmax(predictions, axis=-1)
#     print(f"Predictions: {preds[:10]}")
#     print(f"Labels: {labels[:10]}")
    qwk = cohen_kappa_score(
        y1=labels, y2=preds,
        labels=range(CFG.n_labels),
        weights='quadratic'
    )
    return {'qwk': qwk}


def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [9]:
def print_trainable_params(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad == True:
            trainable_params += param.numel()
    
    print(f"trainable parameters: {trainable_params}, all parameters: {all_params}, ratio: {100 * trainable_params / all_params}%")

In [10]:
model = model_init()
print_trainable_params(model)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 184426758, all parameters: 184426758, ratio: 100.0%


In [11]:
# Print all module names
for name, module in model.named_modules():
    print(name)


deberta
deberta.embeddings
deberta.embeddings.word_embeddings
deberta.embeddings.LayerNorm
deberta.embeddings.dropout
deberta.encoder
deberta.encoder.layer
deberta.encoder.layer.0
deberta.encoder.layer.0.attention
deberta.encoder.layer.0.attention.self
deberta.encoder.layer.0.attention.self.query_proj
deberta.encoder.layer.0.attention.self.key_proj
deberta.encoder.layer.0.attention.self.value_proj
deberta.encoder.layer.0.attention.self.pos_dropout
deberta.encoder.layer.0.attention.self.dropout
deberta.encoder.layer.0.attention.output
deberta.encoder.layer.0.attention.output.dense
deberta.encoder.layer.0.attention.output.LayerNorm
deberta.encoder.layer.0.attention.output.dropout
deberta.encoder.layer.0.intermediate
deberta.encoder.layer.0.intermediate.dense
deberta.encoder.layer.0.intermediate.intermediate_act_fn
deberta.encoder.layer.0.output
deberta.encoder.layer.0.output.dense
deberta.encoder.layer.0.output.LayerNorm
deberta.encoder.layer.0.output.dropout
deberta.encoder.layer.1
deb

In [None]:
### Set seed
seed_everything(CFG.seed)

### Cross Validation
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for fold, (tr_idx, va_idx) in enumerate(skf.split(df, df['label'])):
    # Split train/valid
    df_train = df.loc[tr_idx, ['full_text', 'label']].copy()
    df_valid = df.loc[va_idx, ['full_text', 'label']].copy()
    print('#'*25, f"Fold {fold}", '#'*25)
    # Prepare PyArrow dataset
    ds_train = datasets.Dataset.from_pandas(df_train)
    ds_valid = datasets.Dataset.from_pandas(df_valid)
    # Tokenize
    tokenized_ds_train = ds_train.map(tokenize, batched=True, batch_size=None)
    tokenized_ds_valid = ds_valid.map(tokenize, batched=True, batch_size=None)
    # Convert dataset's format: List -> Torch
    tokenized_ds_train.set_format('torch')
    tokenized_ds_valid.set_format('torch')
    # Train
    training_args = TrainingArguments(
        output_dir='/kaggle/temp/',
        overwrite_output_dir=True,
        learning_rate=CFG.learning_rate,
        warmup_ratio=CFG.warmup_ratio,
        num_train_epochs=CFG.n_epochs,
        per_device_train_batch_size=CFG.train_batch_size,
        per_device_eval_batch_size=CFG.eval_batch_size,
        gradient_accumulation_steps=CFG.grad_accum_steps,
        gradient_checkpointing=True,
        fp16=CFG.fp16,
        logging_strategy='steps',
        logging_steps=CFG.steps,
        evaluation_strategy='steps',
        eval_steps=CFG.steps,
        save_strategy='steps',
        save_steps=CFG.steps,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to='none',
        seed=CFG.seed,
        )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds_train,
        eval_dataset=tokenized_ds_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    #model.config.use_cache = False 
    trainer.train()
    
    # for debuaging and testing
    if fold > 2:
        break

######################### Fold 0 #########################


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Qwk
200,1.2016,0.975503,0.722482


######################### Fold 1 #########################


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Qwk
200,0.9695,0.882938,0.784317


######################### Fold 2 #########################


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Qwk
200,0.8436,0.817994,0.81445


######################### Fold 3 #########################


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


In [None]:
#trainer.save_model("peft_model")

In [None]:
trainer.save_model("full_finetuned_model")

## which layers are used much in full fine-tune ?