Now we will perform transfer learning using our pre-trained model (IMDB), and fine tune it with Rotten Tomatoes dataset.

In [20]:
pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
from datasets import load_dataset

In [3]:
rotten_tomatoes = load_dataset('rotten_tomatoes')

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [4]:
rotten_tomatoes

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

Load our pre-trained model directly from huggingface.

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("tashrifmahmud/sentiment_analysis_model")
model = AutoModelForSequenceClassification.from_pretrained("tashrifmahmud/sentiment_analysis_model")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Preprocess the rotten tomatoes dataset with our tokenizer.

In [6]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

In [7]:
tokenized_rotten_tomatoes = rotten_tomatoes.map(tokenize_function, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [8]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [10]:
from transformers import Trainer, TrainingArguments

In [11]:
# Set up your training arguments
training_args = TrainingArguments(
    output_dir="sentiment_analysis_model",
    run_name="testing_rotten_tomatoes",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    push_to_hub=False,
)



In [12]:
# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_rotten_tomatoes["test"].select([i for i in range(1000)]),
)

In [13]:
# Evaluate the existing model on rotten tomatoes data
evaluation_results = trainer.evaluate()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [15]:
evaluation_results

{'eval_loss': 0.4267759621143341,
 'eval_model_preparation_time': 0.0029,
 'eval_accuracy': 0.821,
 'eval_precision': 0.8160714285714286,
 'eval_recall': 0.8574108818011257,
 'eval_f1': 0.8362305580969808,
 'eval_runtime': 17.7026,
 'eval_samples_per_second': 56.489,
 'eval_steps_per_second': 7.061}

In [16]:
# Freezing the first 3 layers
for i, layer in enumerate(model.distilbert.transformer.layer):
    if i < 3:
        for param in layer.parameters():
            param.requires_grad = False

In [17]:
# checking
for name, param in model.distilbert.named_parameters():
    print(f"{name}: {param.requires_grad}")

embeddings.word_embeddings.weight: True
embeddings.position_embeddings.weight: True
embeddings.LayerNorm.weight: True
embeddings.LayerNorm.bias: True
transformer.layer.0.attention.q_lin.weight: False
transformer.layer.0.attention.q_lin.bias: False
transformer.layer.0.attention.k_lin.weight: False
transformer.layer.0.attention.k_lin.bias: False
transformer.layer.0.attention.v_lin.weight: False
transformer.layer.0.attention.v_lin.bias: False
transformer.layer.0.attention.out_lin.weight: False
transformer.layer.0.attention.out_lin.bias: False
transformer.layer.0.sa_layer_norm.weight: False
transformer.layer.0.sa_layer_norm.bias: False
transformer.layer.0.ffn.lin1.weight: False
transformer.layer.0.ffn.lin1.bias: False
transformer.layer.0.ffn.lin2.weight: False
transformer.layer.0.ffn.lin2.bias: False
transformer.layer.0.output_layer_norm.weight: False
transformer.layer.0.output_layer_norm.bias: False
transformer.layer.1.attention.q_lin.weight: False
transformer.layer.1.attention.q_lin.bias

In [18]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [21]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
training_args = TrainingArguments(
    output_dir="sentiment_analysis_model_v2",
    run_name="finetune_run",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3, # we are increasing from our previous 2 to 3
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_rotten_tomatoes["train"],
    eval_dataset=tokenized_rotten_tomatoes["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Fine-tuning our model with rotten tomatoes data.

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.365,0.368247,0.839587,0.826715,0.859287,0.842686
2,0.2804,0.389193,0.845216,0.85249,0.834897,0.843602
3,0.2301,0.434206,0.844278,0.840445,0.849906,0.845149


TrainOutput(global_step=1602, training_loss=0.28703648261214315, metrics={'train_runtime': 1134.8722, 'train_samples_per_second': 22.549, 'train_steps_per_second': 1.412, 'total_flos': 3389840731607040.0, 'train_loss': 0.28703648261214315, 'epoch': 3.0})

Looks like after epoch 1 and epoch 2 it starts to overfit and thus the validation loss goes up from .37 to .39 and finally to .43 at last epoch. So we will use the model before it starts overfitting. Since we already had load best model at end for training argument we can check at which checkpoint model is being loaded.

In [26]:
print(f"Best model checkpoint is at: {trainer.state.best_model_checkpoint}")

Best model checkpoint is at: sentiment_analysis_model_v2/checkpoint-534


In [27]:
trainer.save_model()

events.out.tfevents.1732436245.37fede67bf22.177.1:   0%|          | 0.00/7.58k [00:00<?, ?B/s]

As push_to_hub was set to true, we can again load the best model and push back to hub.

In [29]:
checkpoint_path = "sentiment_analysis_model_v2/checkpoint-534"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

In [31]:
model.push_to_hub("sentiment_analysis_model_v2")
tokenizer.push_to_hub("sentiment_analysis_model_v2")

README.md:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tashrifmahmud/sentiment_analysis_model_v2/commit/9b213ead5a62ad8388263b2cb9acb4049474b815', commit_message='Upload tokenizer', commit_description='', oid='9b213ead5a62ad8388263b2cb9acb4049474b815', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tashrifmahmud/sentiment_analysis_model_v2', endpoint='https://huggingface.co', repo_type='model', repo_id='tashrifmahmud/sentiment_analysis_model_v2'), pr_revision=None, pr_num=None)