In [5]:
from datasets import load_dataset, DatasetDict, Dataset

In [6]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate 
import torch
import numpy as np

In [7]:
model_checkpoint = 'distilbert-base-uncased'

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative" : 0, "Positive" : 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels = 2, id2label=id2label, label2id=label2id
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from datasets import load_dataset
dataset = load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [9]:
AutoTokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)



In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

def tokenize_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    return tokenized_inputs

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
accuracy = evaluate.load("accuracy")

In [12]:

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [13]:
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("predictions sur le model non entrainé:")
print("----------------------------")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    print(text +" _ " + id2label[predictions.tolist()])

predictions sur le model non entrainé:
----------------------------
It was good. _ Positive
Not a fan, don't recommed. _ Negative
Better than the first one. _ Positive
This is not worth watching even once. _ Negative
This one is a pass. _ Positive


In [14]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha = 32,
                         lora_dropout = 0.01,
                         target_modules = ['q_lin'])


In [15]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [16]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [29]:

lr = 1e-3
batch_size = 3 
num_epochs = 10

In [30]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, 
    compute_metrics=compute_metrics,
)


trainer.train()

  0%|          | 0/3340 [00:00<?, ?it/s]

  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.1846506595611572, 'eval_accuracy': {'accuracy': 0.801}, 'eval_runtime': 62.6737, 'eval_samples_per_second': 15.956, 'eval_steps_per_second': 5.329, 'epoch': 1.0}
{'loss': 0.3428, 'grad_norm': 0.44068294763565063, 'learning_rate': 0.0008502994011976049, 'epoch': 1.5}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 0.5975099802017212, 'eval_accuracy': {'accuracy': 0.882}, 'eval_runtime': 61.7971, 'eval_samples_per_second': 16.182, 'eval_steps_per_second': 5.405, 'epoch': 2.0}
{'loss': 0.2533, 'grad_norm': 3.716527862707153e-05, 'learning_rate': 0.0007005988023952096, 'epoch': 2.99}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 0.8281964659690857, 'eval_accuracy': {'accuracy': 0.866}, 'eval_runtime': 61.6327, 'eval_samples_per_second': 16.225, 'eval_steps_per_second': 5.419, 'epoch': 3.0}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 0.9791224598884583, 'eval_accuracy': {'accuracy': 0.88}, 'eval_runtime': 62.1437, 'eval_samples_per_second': 16.092, 'eval_steps_per_second': 5.375, 'epoch': 4.0}
{'loss': 0.1283, 'grad_norm': 1.1976771354675293, 'learning_rate': 0.0005508982035928143, 'epoch': 4.49}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.0979851484298706, 'eval_accuracy': {'accuracy': 0.874}, 'eval_runtime': 60.5763, 'eval_samples_per_second': 16.508, 'eval_steps_per_second': 5.514, 'epoch': 5.0}
{'loss': 0.0668, 'grad_norm': 0.0001573127810843289, 'learning_rate': 0.0004011976047904192, 'epoch': 5.99}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.1339842081069946, 'eval_accuracy': {'accuracy': 0.891}, 'eval_runtime': 63.1133, 'eval_samples_per_second': 15.845, 'eval_steps_per_second': 5.292, 'epoch': 6.0}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.299228549003601, 'eval_accuracy': {'accuracy': 0.881}, 'eval_runtime': 60.5681, 'eval_samples_per_second': 16.51, 'eval_steps_per_second': 5.514, 'epoch': 7.0}
{'loss': 0.0254, 'grad_norm': 2.5495996425206613e-08, 'learning_rate': 0.00025149700598802393, 'epoch': 7.49}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.236066460609436, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 76.674, 'eval_samples_per_second': 13.042, 'eval_steps_per_second': 4.356, 'epoch': 8.0}
{'loss': 0.0143, 'grad_norm': 1.2234094626251135e-08, 'learning_rate': 0.00010179640718562875, 'epoch': 8.98}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.376738429069519, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 98.9158, 'eval_samples_per_second': 10.11, 'eval_steps_per_second': 3.377, 'epoch': 9.0}


  0%|          | 0/334 [00:00<?, ?it/s]

{'eval_loss': 1.3700569868087769, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 60.6614, 'eval_samples_per_second': 16.485, 'eval_steps_per_second': 5.506, 'epoch': 10.0}
{'train_runtime': 1903.4112, 'train_samples_per_second': 5.254, 'train_steps_per_second': 1.755, 'train_loss': 0.12470604349039272, 'epoch': 10.0}


TrainOutput(global_step=3340, training_loss=0.12470604349039272, metrics={'train_runtime': 1903.4112, 'train_samples_per_second': 5.254, 'train_steps_per_second': 1.755, 'total_flos': 1031411403880992.0, 'train_loss': 0.12470604349039272, 'epoch': 10.0})

In [20]:
import torch


device = torch.device("cuda")
model.to(device)

print("Prédictions sur le modèle entraîné :")
print("---------------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)

    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Prédictions sur le modèle entraîné :
---------------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


In [28]:
text_list2 = ["pas une mauvaise chose"]

print("Prédictions sur le modèle entraîné :")
print("---------------------------------")
for text in text_list2:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)

    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Prédictions sur le modèle entraîné :
---------------------------------
pas une mauvaise chose - Positive
