In [10]:
!pip install datasets



In [11]:
!pip install torch




In [12]:
!pip install torchvision



In [13]:
!pip install torchaudio



In [14]:
!pip install transformers




In [15]:
!pip install peft



In [16]:
!pip install evaluate



In [17]:
!pip install wandb



In [18]:
from google.colab import userdata
wnb_token = userdata.get('wnb')
hft = userdata.get('hft')

In [19]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import(
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

In [20]:
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
import torch
import numpy as np


In [21]:
import wandb
wandb.login(key=wnb_token)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtalhamahmood1261[0m ([33mtalhamahmood1261-air-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
dataset = load_dataset('shawhin/imdb-truncated')
dataset

In [24]:
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])


0.5

In [25]:
np.array(dataset['train'])

array([{'label': 1, 'text': '. . . or type on a computer keyboard, they\'d probably give this eponymous film a rating of "10." After all, no elephants are shown being killed during the movie; it is not even implied that any are hurt. To the contrary, the master of ELEPHANT WALK, John Wiley (Peter Finch), complains that he cannot shoot any of the pachyderms--no matter how menacing--without a permit from the government (and his tone suggests such permits are not within the realm of probability). Furthermore, the elements conspire--in the form of an unusual drought and a human cholera epidemic--to leave the Wiley plantation house vulnerable to total destruction by the Elephant People (as the natives dub them) to close the story. If you happen to see the current release EARTH, you\'ll detect the Elephant People are faring less well today.'},
       {'label': 1, 'text': "During 1933 this film had many cuts taken from it because it was very over the top for the story content and the fact tha

In [26]:
model_checkpoint = 'distilbert-base-uncased'
id2label = {0: "Negative", 1: "Positive"}
label2id= {"Negative":0, "Positive":1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space= True)

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [29]:
def tokenize_function(examples):
  text=examples["text"]

  tokenizer.truncation_side="left"
  tokenized_inputs= tokenizer(
      text,
      return_tensors="np",
      truncation=True,
      max_length=512
  )

  return tokenized_inputs

In [30]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [31]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [32]:
accuracy = evaluate.load("accuracy")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [33]:
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  return {"accuracy": accuracy.compute(predictions=predictions, references= labels)}

In [34]:
text_list = ["It was good", "Not a fan, don't recommend", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]
print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors="pt")
  logits = model(inputs).logits
  predictions = torch.argmax(logits)

  print(text + " - "+ id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good - Negative
Not a fan, don't recommend - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


In [35]:
peft_config = LoraConfig(
    task_type = "SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_lin"]
)

In [36]:
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [37]:
model= get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [38]:
lr = 1e-3
batch_size= 4
num_epochs = 10


In [39]:
training_args= TrainingArguments(
    output_dir = model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)



In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator= data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.368608,{'accuracy': 0.891}
2,0.426000,0.397958,{'accuracy': 0.889}
3,0.426000,0.586218,{'accuracy': 0.895}
4,0.164000,0.667261,{'accuracy': 0.902}
5,0.164000,0.790616,{'accuracy': 0.889}
6,0.048400,0.908504,{'accuracy': 0.887}
7,0.048400,0.936229,{'accuracy': 0.891}
8,0.017600,0.939271,{'accuracy': 0.886}
9,0.017600,0.952553,{'accuracy': 0.89}
10,0.013600,0.962549,{'accuracy': 0.889}


Trainer is attempting to log a value of "{'accuracy': 0.891}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.895}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.902}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.13389202270507813, metrics={'train_runtime': 441.0244, 'train_samples_per_second': 22.674, 'train_steps_per_second': 5.669, 'total_flos': 1112883852759936.0, 'train_loss': 0.13389202270507813, 'epoch': 10.0})

In [50]:
model.to('cpu')
print("Trained model predictions:")
print("---------------------------------------")
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors="pt").to('cpu')
  logits= model(inputs).logits
  predictions= torch.argmax(logits)

  print(text + " - " + id2label[predictions.item()])


Trained model predictions:
---------------------------------------
It was good - Positive
Not a fan, don't recommend - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


In [45]:
# optional push model to huggingface hub
# from huggingface_hub import notebook_login
# notebook_login() # ensure token gives write access

# option 2: key login
from huggingface_hub import login
login(hft)


In [46]:
hf_name = 'Talha1261' # your hf username or org name
model_id = hf_name + '/' + model_checkpoint + '-lora-text-classification' # you can name the model whatever you want

In [47]:
model.push_to_hub(model_id) # save model


adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Talha1261/distilbert-base-uncased-lora-text-classification/commit/9ed6b8cc11901275ef5e9bd247c5ef950ec62962', commit_message='Upload model', commit_description='', oid='9ed6b8cc11901275ef5e9bd247c5ef950ec62962', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Talha1261/distilbert-base-uncased-lora-text-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='Talha1261/distilbert-base-uncased-lora-text-classification'), pr_revision=None, pr_num=None)

In [48]:
trainer.push_to_hub(model_id) # save trainer


events.out.tfevents.1741602294.5625b5e41a67.894.0:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Talha1261/distilbert-base-uncased-lora-text-classification/commit/6d533a1ca6ca1be0fc7216f2275e347124e62628', commit_message='Talha1261/distilbert-base-uncased-lora-text-classification', commit_description='', oid='6d533a1ca6ca1be0fc7216f2275e347124e62628', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Talha1261/distilbert-base-uncased-lora-text-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='Talha1261/distilbert-base-uncased-lora-text-classification'), pr_revision=None, pr_num=None)

In [49]:
# optional: load peft model
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model= PeftModel.from_pretrained(inference_model, model_id)

adapter_config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]