# Dataset

In [None]:
# conda install -c huggingface datasets
# ! pip install evaluate
#! pip install peft

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

In [8]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [9]:
# Load the 'shawhin/imdb-truncated' dataset; the result is displayed below
# so the available splits and their columns can be inspected.
dataset = load_dataset('shawhin/imdb-truncated')
dataset

Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [10]:
model_checkpoint = 'distilbert-base-uncased'

# Label maps: class id -> readable name, plus the inverse built from it
id2label = {0: 'Negative', 1: 'Positive'}
label2id = {name: idx for idx, name in id2label.items()}

# Build a sequence-classification model on top of the checkpoint,
# with one output per entry in the label map.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.we

In [11]:
# Display the module tree of the loaded model (layer names such as q_lin
# shown here are what LoRA target_modules refer to later).
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [12]:
# Create the tokenizer that matches model_checkpoint.
# NOTE(review): add_prefix_space is typically a byte-level BPE option;
# confirm it has any effect for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize a batch of examples for use with ``Dataset.map(batched=True)``.

    Parameters
    ----------
    examples : mapping
        Batch with a ``'text'`` entry holding the raw strings.

    Returns
    -------
    The tokenizer output for the batch, truncated to at most 512 tokens per
    example. No padding is applied here (the data collator pads per batch),
    so plain Python lists are returned: requesting NumPy tensors for
    sequences of different lengths would build ragged arrays.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=512,
    )


# Truncate from the left so the end of an over-long text is what is kept.
# Set once here instead of on every batch inside the mapped function.
tokenizer.truncation_side = 'left'

# Make sure a padding token exists; the value must be a string, not a list.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # the vocabulary grew, so the embedding matrix has to grow with it
    model.resize_token_embeddings(len(tokenizer))

# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  tensor = as_tensor(value)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [None]:
# input_ids = the token id numbers, i.e. the number assigned to each token
# attention = determines the relationship between different words/tokens and their importance
# our model focuses on one word and determines which relationships exist between it and our main word
# text data should all have the same length; if a sample is too short, it is filled with padding

# the attention mask consists of 0s and 1s
# if a word or token has a 1, our model pays attention to that word
# if it has a 0, that word is ignored
# the padding positions are 0

In [14]:
# Create the data collator used for dynamic padding; it is handed the same
# tokenizer that produced the tokenized dataset and is passed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [15]:
# define Evaluation Metrics
accuracy = evaluate.load('accuracy')

# define an Evaluation Function to pass into Trainer later.
def compute_metrics(p):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Parameters
    ----------
    p : tuple
        ``(predictions, labels)`` where ``predictions`` holds per-class
        scores with the class axis at position 1.

    Returns
    -------
    dict
        The dict produced by the accuracy metric, ``{'accuracy': value}``.
        It is returned as-is; wrapping it under another ``'accuracy'`` key
        would nest the dicts and leave no scalar metric at the top level.
    """
    predictions, labels = p
    # pick the highest-scoring class for every example
    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)
                                          

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
# Untrained Model Performance
# Define list of examples
text_list = ["It was good." , "Not a fan don't recommed.", "Better than the first one.", "Greatest of all the time.",
             "Worst app evet...", "This is not worth watching even once.", "This one is a pass."]

print('Untrained model Predictions: ')
print('-'*50)

# Pure inference: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    for text in text_list:
        # tokenize text into a (1, seq_len) tensor of token ids
        inputs = tokenizer.encode(text, return_tensors='pt')
        # compute logits
        logits = model(inputs).logits
        # pick the best class along the class axis rather than over the
        # flattened tensor, then convert it to a plain int for the lookup
        predictions = torch.argmax(logits, dim=-1)

        print(text + ' - ' + id2label[predictions.item()])

# Really bad result with base model...

Untrained model Predictions: 
--------------------------------------------------
It was good. - Negative
Not a fan don't recommed. - Negative
Better than the first one. - Negative
Greatest of all the time. - Negative
Worst app evet... - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


In [17]:
# fine tuning with LoRA
peft_config = LoraConfig(
    task_type='SEQ_CLS',        # sequence classification
    target_modules=['q_lin'],   # apply LoRA to the query projection layers
    r=4,                        # rank of the trainable low-rank update matrices
    lora_alpha=32,              # scaling factor for the LoRA update (scaled relative to r)
    lora_dropout=0.01,          # dropout probability on the LoRA layers
)

In [18]:
# Wrap the base model according to peft_config and rebind `model` to the
# wrapped version, then report how many parameters remain trainable.
# NOTE(review): `model` is overwritten in place, so re-running this cell
# would wrap the already-wrapped model -- restart from the model-loading cell instead.
model = get_peft_model(model, peft_config)   # model is now the PEFT-wrapped model
model.print_trainable_parameters() # prints trainable vs. total parameter counts

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [20]:
# define Hyperparameters
lr = 1e-3
batch_size = 4
num_epoch = 10

# Define Training Arguments
training_args = TrainingArguments(
    # checkpoints are written to '<model_checkpoint>-lora-text-classification'
    output_dir = model_checkpoint + '-lora-text-classification',
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epoch,
    weight_decay= 0.01,
    # evaluate and save once per epoch; the two strategies are kept in step
    # so that the best checkpoint can be restored at the end
    evaluation_strategy= 'epoch',
    save_strategy= 'epoch',
    # a real boolean, not the string 'True' (which only worked by being truthy)
    load_best_model_at_end= True
)

In [None]:
# Build the Trainer from the pieces defined in the earlier cells, then train.
trainer = Trainer(
    model=model,                                   # the PEFT-wrapped model
    args=training_args,                            # hyperparameters / run settings
    data_collator=data_collator,                   # dynamic padding per batch
    tokenizer=tokenizer,                           # tokenizer used for the dataset
    compute_metrics=compute_metrics,               # evaluation metric function
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)
trainer.train()

# Generate predictions

In [None]:
# Pick the best available device instead of hard-coding 'mps', so the cell
# also runs on machines without Apple's Metal backend.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model.to(device)
model.eval()   # inference mode: disables dropout (including the LoRA dropout)

print('Trained model predictions:')
print('-'*50)

# No gradients are needed for prediction.
with torch.no_grad():
    for text in text_list:
        # token ids must live on the same device as the model
        inputs = tokenizer.encode(text, return_tensors= 'pt').to(device)

        logits = model(inputs).logits
        # index of the highest logit along the class axis, one per batch row
        predictions = torch.max(logits, 1).indices

        print(text + ' - ' + id2label[predictions.tolist()[0]])