In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/testData.tsv
/kaggle/input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/labeledTrainData.tsv
/kaggle/input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/sampleSubmission.csv
/kaggle/input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/unlabeledTrainData.tsv


In [2]:
# Show the GPU attached to this session (model, driver/CUDA version, memory usage).
!nvidia-smi

Mon Jun  2 07:46:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             25W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [3]:
# Install the extra packages used by this notebook.
# %pip (instead of !pip) installs into the environment of the running kernel, and the
# versions are pinned to the ones this notebook was executed with so that a fresh
# session resolves the same packages.
%pip install -q bitsandbytes==0.46.0 wandb==0.19.9 evaluate==0.4.3

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.m

In [4]:
# Make sure `evaluate` is available to the kernel; pinned so a fresh session
# resolves the same version, and installed via %pip so it targets the kernel's environment.
%pip install -q evaluate==0.4.3



In [None]:
import os
import pandas as pd
import numpy as np
import datasets
import evaluate
import torch
import wandb

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split

# Placeholder for the Weights & Biases credential -- never commit a real key here.
# setdefault keeps a key that is already present in the environment instead of
# overwriting it with the empty placeholder.
os.environ.setdefault("WANDB_API_KEY", "")

def load_data(path):
    """Read a tab-separated, UTF-8 encoded file into a pandas DataFrame.

    Args:
        path: Location of the TSV file (anything ``pd.read_csv`` accepts).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, delimiter='\t', encoding='utf-8')
    return frame

# Read the labelled reviews and the test reviews from the Kaggle input folder.
labeled_path = "/kaggle/input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/labeledTrainData.tsv"
test_path = "/kaggle/input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/testData.tsv"
labeled_reviews = load_data(labeled_path)
test = load_data(test_path)

# Hold out 20% of the labelled reviews for validation (fixed random_state for a repeatable split).
train, val = train_test_split(labeled_reviews, test_size=0.2, random_state=42)


def _labeled_columns(frame):
    """Map a labelled review frame to the label/text columns used for the Dataset."""
    return {'label': frame["sentiment"], 'text': frame['review']}


train_dataset = datasets.Dataset.from_dict(_labeled_columns(train))
val_dataset = datasets.Dataset.from_dict(_labeled_columns(val))
# The test split carries no sentiment column, so it only gets a text column.
test_dataset = datasets.Dataset.from_dict({"text": test['review']})

# Checkpoint used for both the tokenizer here and the classifier built later.
model_id = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def preprocess_function(examples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Sequences longer than 512 tokens are truncated; no padding is applied here.

    Args:
        examples: Batch mapping that provides a ``'text'`` entry.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, max_length=512)

# Tokenize the three splits in batched mode, in the order train -> val -> test.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric consumed by compute_metrics.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the arg-max
            of the logits along the last axis.

    Returns:
        The result of ``metric.compute`` for the predicted classes and labels.
    """
    raw_scores, gold = eval_pred
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold,
    )

# --- LoRA hyper-parameter grid ------------------------------------------------
# Every (alpha, r, target_modules) combination is trained, evaluated on the
# validation split and used to write one submission file.
alpha_list = [8, 16, 32]
r_list = [4, 8, 16]
target_module_options = [
    ["query_proj"]
]

# Shared training settings, defined once so the values reported to W&B cannot
# drift from the ones actually handed to TrainingArguments.
SEED = 42
NUM_EPOCHS = 3
PER_DEVICE_BATCH_SIZE = 16
GRAD_ACCUM_STEPS = 4

for alpha in alpha_list:
    for r in r_list:
        for target_modules in target_module_options:
            exp_name = f"alpha_{alpha}_r_{r}_target_{'_'.join(target_modules)}"
            # NOTE(review): the W&B project and the output file names say "distilbert"
            # although the checkpoint in `model_id` is what gets trained; the strings are
            # left unchanged so existing runs and files keep their names.
            wandb.init(
                project="lora-distilbert-experiment",
                name=exp_name,
                config={
                    "model": model_id,
                    "lora_alpha": alpha,
                    "lora_r": r,
                    "target_modules": target_modules,
                    "epochs": NUM_EPOCHS,
                    # Per-device batch size times gradient-accumulation steps.
                    "batch_size": PER_DEVICE_BATCH_SIZE * GRAD_ACCUM_STEPS
                }
            )

            try:
                # Seed before the model is built so the newly initialised weights
                # (classification head, LoRA matrices) start from the same RNG state
                # for every configuration; otherwise runs are not comparable.
                torch.manual_seed(SEED)

                model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
                lora_config = LoraConfig(
                    r=r,
                    lora_alpha=alpha,
                    target_modules=target_modules,
                    lora_dropout=0.05,
                    bias="none",
                    task_type=TaskType.SEQ_CLS
                )
                model = get_peft_model(model, lora_config)

                training_args = TrainingArguments(
                    output_dir=f"./output/{exp_name}",
                    num_train_epochs=NUM_EPOCHS,
                    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
                    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
                    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
                    warmup_steps=50,
                    weight_decay=0.01,
                    logging_dir="./",
                    logging_steps=25,
                    save_strategy="no",
                    report_to="wandb",
                    fp16=True,
                    ddp_find_unused_parameters=False,
                    seed=SEED,
                    # The PEFT wrapper hides the base model's arguments, so Trainer
                    # cannot infer the label column and warns that it falls back to an
                    # empty list; name it explicitly. The padding collator hands the
                    # targets to the model under the key "labels".
                    label_names=["labels"],
                )

                # NOTE(review): newer transformers releases appear to prefer
                # `processing_class=` over `tokenizer=`; confirm against the installed
                # version before switching.
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_train,
                    eval_dataset=tokenized_val,
                    tokenizer=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
                )

                trainer.train()
                eval_results = trainer.evaluate()
                wandb.log({"validation_accuracy": eval_results["eval_accuracy"]})

                # Predict the test split and write one submission file per configuration.
                prediction_outputs = trainer.predict(tokenized_test)
                test_pred = np.argmax(prediction_outputs.predictions, axis=-1).flatten()
                result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
                output_file = f"distilbert_lora_{exp_name}.csv"
                # quoting=3 is csv.QUOTE_NONE: fields are written without quotes.
                result_output.to_csv(output_file, index=False, quoting=3)
            except BaseException:
                # Close the run as failed (also on a manual interrupt) so it is not
                # left open when the next configuration or a re-run starts.
                wandb.finish(exit_code=1)
                raise
            else:
                wandb.finish()


2025-06-02 07:48:08.461969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748850488.671599      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748850488.728346      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlxxing[0m ([33mlxxing-yunnan-university-of-finance-and-economics[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250602_074900-igydi8rc[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33malpha_8_r_4_target_query_proj[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/lxxing-yunnan-university-of-finance-and-economics/lora-distilbert-experiment[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/lxxing-yunnan-university-of-finance-and-economics/lora-distilbert-experiment/runs/igydi8rc[0m


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
25,0.6984
50,0.6943
75,0.6917
100,0.6941
125,0.6944
150,0.6949
175,0.6932
200,0.6934
225,0.6929
250,0.6936


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm █▂▂▃▁▁▃▅▄▃▂▆▃▂▇▁▄▂▁▅▄▅▃▄▃▄▄▄▄▄▅▄▆▅▄▄▆
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss ████████████▇█████▇▇▇▇▇▆▆▅▅▄▄▃▃▂▂▁▁▁▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6945
125,0.6953
150,0.6925
175,0.6946
200,0.6939
225,0.691
250,0.6939


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▄▄▂▃▁▁▃▅▄▄▂▆▄▃█▂▄▂▁▆▄▅▃▅▁▃▃▃▁▁▂▃▅▄▃▁▄
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █████▇█▇▆▇▇▇▂▇▇▇▇▇▇▇▇▇▆▆▆▁▆▅▅▅▄▅▅▃▃▄▃
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6945
125,0.6953
150,0.6925
175,0.6946
200,0.6939
225,0.691
250,0.6939


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▄▄▂▃▁▁▃▅▄▄▂▆▄▃█▂▄▂▁▆▄▅▃▅▁▃▃▃▁▁▂▃▅▄▃▁▄
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █████▇█▇▆▇▇▇▂▇▇▇▇▇▇▇▇▇▇▇▆▁▆▅▅▆▅▅▅▄▄▅▄
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6945
125,0.6952
150,0.6924
175,0.6944
200,0.6937
225,0.6908
250,0.6936


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▄▄▂▃▁▁▂▅▄▃▂▅▃▂▇▁▄▂▁▅▄▅▄▆▅▇█▇▇▇▇▇▆▅▆▆▆
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █████████████████████▇▇▇▆▅▄▄▃▂▂▂▂▁▁▁▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6945
125,0.6952
150,0.6924
175,0.6944
200,0.6937
225,0.6908
250,0.6936


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▄▄▂▃▁▁▂▅▄▃▂▆▄▂█▁▄▂▁▅▄▆▅▇█▇▇▇▇▇▇▆▅▅▅▆▆
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █████████████████████▇▇▆▅▄▃▃▂▂▂▁▁▁▁▁▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6945
125,0.6953
150,0.6924
175,0.6945
200,0.6938
225,0.6908
250,0.6936


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▄▄▂▃▁▁▂▅▄▃▂▆▄▂█▁▄▂▁▅▄▅▄▆▆▇▇▇▇█▇▇▆▅▆▅▆
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █████████████████████▇▇▇▆▅▄▃▃▂▂▂▁▁▁▁▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6944
125,0.6951
150,0.6922
175,0.6942
200,0.6934
225,0.6902
250,0.6927


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▃▃▂▂▁▁▂▃▃▂▂▄▃▂▅▃▅▆▅▄▅▄▄▃▄▄▅▆▅▃▃█▆▆▄ ▃
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss ███████████████▇▆▅▄▃▃▂▂▂▂▁▁▂▂▁▁▁▁▁▁▂▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6944
125,0.6951
150,0.6922
175,0.6942
200,0.6934
225,0.6902
250,0.6926


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▃▃▂▂▁▁▂▄▃▃▂▄▃▂▇▅▆▆▆▄▅▆▅▃▄▄▅▅▄▅▃█▆▆▄▆▄
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss ██████████████▇▆▅▄▃▃▂▂▂▁▂▁▁▁▂▁▁▁▁▁▁▂▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

Step,Training Loss
25,0.6949
50,0.6957
75,0.6957
100,0.6944
125,0.6952
150,0.6923
175,0.6943
200,0.6934
225,0.6903
250,0.6928


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:            test/runtime ▁
[34m[1mwandb[0m: test/samples_per_second ▁
[34m[1mwandb[0m:   test/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██████
[34m[1mwandb[0m:         train/grad_norm ▄▄▂▃▁▁▂▅▄▃▂▅▃▂█▄▇█▇▅▅▅▄▃▄▄▄▅▃▄▃▆▄▅▃▅▃
[34m[1mwandb[0m:     train/learning_rate ▄████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss ███████████████▇▆▅▄▃▂▂▂▁▂▁▁▁▂▁▁▁▁▁▁▂▁
[34m[1mwandb[0m:     validation_accuracy ▁
[34m[1mwandb[0

In [6]:
# Print the architecture of `model`. No model is built in this cell: `model` is the
# PEFT-wrapped classifier left bound by the last iteration of the grid-search loop,
# so this cell only works after that loop has run.
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DebertaV2ForSequenceClassification(
      (deberta): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(128100, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, bias=False)
      