# DeBERTa-base with LoRA (PEFT) for sequence classification, trained with a native PyTorch loop

In [1]:
!pip install datasets==2.15
!pip install peft

Collecting datasets==2.15
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.15)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.15)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.15)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.15)
  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K   

In [2]:
import pandas as pd

# Load the training data; the frame exposes a `class` label column and a `text` column.
train = pd.read_csv("/kaggle/input/nlp-week-10-rnn/train.csv")
# Preview the first rows.
train.head()

Unnamed: 0,class,text
0,12,Rules Changed Up is the debut studio album by...
1,14,Back is a novel written by British writer Hen...
2,14,Love and Glory (ISBN 0-385-29261-9) is a 1983...
3,13,Max Manus: Man of War is a 2008 Norwegian bio...
4,7,The former Ahavas Sholem Synagogue building w...


In [3]:
num_labels = 14

# The raw labels are 1-based (the largest id equals num_labels), while the
# classification head needs ids in [0, num_labels). Shift only while the
# labels are still 1-based, so re-running this cell cannot shift them twice.
if train['class'].max() >= num_labels:
    train['class'] -= 1

In [4]:
from datasets import Dataset

# Wrap the pandas frame in a Hugging Face `Dataset`.
train_df = Dataset.from_pandas(train)
# Disabled experiment, kept for reference:
# train_df = train_df.class_encode_column("class")

In [5]:
# Stratified variant, currently disabled:
# train_df = train_df.train_test_split(test_size=0.1, shuffle=True, stratify_by_column='class', seed=42)
# Hold out 10% of the rows for evaluation; the fixed seed keeps the shuffle reproducible.
# From here on `train_df` holds two splits, "train" and "test".
train_df = train_df.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the same "microsoft/deberta-base" checkpoint that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

def tokenization(example):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length,
    so every encoded example has the same number of tokens.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Tokenize both splits in batches and drop the raw text column afterwards.
train_df = train_df.map(tokenization, batched=True, remove_columns=['text'])
# Make the dataset return torch tensors when it is indexed.
train_df.set_format('torch')

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/90720 [00:00<?, ? examples/s]

Map:   0%|          | 0/10080 [00:00<?, ? examples/s]

In [7]:
# Rename the target column to "labels" so that `model(**batch)` receives it as the `labels` keyword.
train_df = train_df.rename_column("class", "labels")

In [8]:
# Display the splits and their columns as a sanity check.
train_df

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 90720
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10080
    })
})

In [9]:
from torch.utils.data import DataLoader
# Number of examples per batch; reused by every DataLoader in this notebook.
batch_size = 10

# Only the training split is shuffled; the held-out split keeps its order.
train_dataloader = DataLoader(train_df['train'], shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(train_df['test'], batch_size=batch_size)

In [10]:
# Sanity check: print the tensor shapes of the first training batch only.
for batch in train_dataloader:
    shapes = {name: tensor.shape for name, tensor in batch.items()}
    print(shapes)
    break

{'labels': torch.Size([10]), 'input_ids': torch.Size([10, 512]), 'token_type_ids': torch.Size([10, 512]), 'attention_mask': torch.Size([10, 512])}


In [11]:
from transformers import DebertaForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model
import torch

# LoRA settings for sequence classification: rank 8, alpha 32, dropout 0.1 on the adapter path.
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

# Load the base model with a `num_labels`-way classification head, then wrap it with the LoRA adapters.
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)
model = get_peft_model(model, peft_config)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device)

# Report how many parameters are trainable compared with the total.
model.print_trainable_parameters()

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda
trainable params: 305,678 || all params: 139,508,764 || trainable%: 0.21911024887296687


In [12]:
from sklearn.metrics import f1_score
from tqdm import tqdm

def eval(model, eval_dataloader):
    """Compute the macro-averaged F1 score of `model` over `eval_dataloader`.

    The model is put into eval mode for the duration of the pass and is then
    returned to whatever mode (train/eval) it was in on entry, so calling this
    in the middle of training does not silently disable dropout afterwards.

    Note: this function shadows Python's built-in `eval` inside the notebook;
    the name is kept because other cells call it.

    Args:
        model: Model whose forward output exposes `.logits`.
        eval_dataloader: Iterable of batches (dicts of tensors) that include a
            `labels` entry and can be passed to the model as keyword arguments.

    Returns:
        Macro F1 between the gold labels and the argmax predictions.
    """
    # Remember the caller's mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    preds, labels = [], []

    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for batch in tqdm(eval_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}

                logits = model(**batch).logits
                predicted_class_id = logits.argmax(dim=-1)

                labels.extend(batch['labels'].tolist())
                preds.extend(predicted_class_id.tolist())
    finally:
        model.train(was_training)

    return f1_score(labels, preds, average='macro')

In [13]:
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_scheduler

def train(train_dataloader, model, num_epochs = 5):
    """Fine-tune `model` with AdamW and a linear learning-rate schedule.

    After every epoch the macro F1 on the notebook-level `eval_dataloader`
    is printed and a checkpoint (epoch index, model state, optimizer state,
    last batch loss) is written to `checkpoint-epoch-{epoch}.pt`.

    Args:
        train_dataloader: Iterable of batches (dicts of tensors) that can be
            passed to the model as keyword arguments and include `labels`.
        model: Model whose forward output exposes `.loss`.
        num_epochs: Number of passes over `train_dataloader`.
    """
    num_training_steps = num_epochs * len(train_dataloader)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=5, num_training_steps=num_training_steps
    )

    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(num_epochs):
        # Re-enter training mode at the start of every epoch: the evaluation
        # at the end of the previous epoch may have left the model in eval
        # mode, which would disable dropout for all remaining epochs.
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Report validation quality, then checkpoint this epoch.
        print(eval(model, eval_dataloader))
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, f'checkpoint-epoch-{epoch}.pt')
# train(train_dataloader, model)

# Inference

In [14]:
# Remap the stored tensors onto the current device, so a checkpoint saved from
# a GPU run can also be loaded on a CPU-only kernel.
checkpoint = torch.load(
    '/kaggle/input/deberta-base-with-lora/pytorch/epoch-4/1/checkpoint-epoch-4.pt',
    map_location=device,
)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to eval mode for inference; as the last expression this also displays the model.
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DebertaForSequenceClassification(
      (deberta): DebertaModel(
        (embeddings): DebertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=0)
          (LayerNorm): DebertaLayerNorm()
          (dropout): StableDropout()
        )
        (encoder): DebertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaLayer(
              (attention): DebertaAttention(
                (self): DisentangledSelfAttention(
                  (in_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=2304, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
       

In [15]:
# Load the test data, using the first CSV column as the frame's index.
test = pd.read_csv('/kaggle/input/nlp-week-10-rnn/test.csv', index_col=0)
test_df = Dataset.from_pandas(test)


In [16]:
# Tokenize with the same function as the training data; dropping 'text' and 'id' leaves only model inputs.
test_df = test_df.map(tokenization, batched=True, remove_columns=['text', 'id'])
test_df.set_format('torch')

Map:   0%|          | 0/11200 [00:00<?, ? examples/s]

In [17]:
# Display the remaining columns and row count as a sanity check.
test_df

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 11200
})

In [18]:
# No shuffling here, so predictions come out in the same order as the rows of `test`.
test_dataloader = DataLoader(test_df, batch_size=batch_size)

In [19]:
from tqdm import tqdm

# Inference only: make sure dropout is disabled.
model.eval()

preds = []

# Gradients are never needed for prediction, so turn autograd off for the whole pass.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_logits = model(**inputs).logits
        # The highest-scoring class per example is the prediction.
        preds.extend(batch_logits.argmax(dim=-1).tolist())

100%|██████████| 1120/1120 [05:30<00:00,  3.38it/s]


In [20]:
# Attach the predictions as `class_id` and drop the raw text, which is not part of the submission.
test = test.assign(class_id=preds).drop(columns=['text'])

In [21]:
# Undo the earlier `train['class'] -= 1` shift so predictions use the original class ids.
# NOTE: in-place increment; running this cell more than once shifts the ids again.
test['class_id'] += 1

In [22]:
# Write the predictions, including the frame's index, to the submission file.
test.to_csv('submission.csv')