In [110]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch

In [77]:
raw_dataset = load_dataset("glue","sst2")
print("dataset type :",type(raw_dataset['train']))
print("train dataset :",raw_dataset['train'])
print("features in dataset :",raw_dataset['train'].features)

dataset type : <class 'datasets.arrow_dataset.Dataset'>
train dataset : Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})
features in dataset : {'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int32', id=None)}


In [78]:
raw_dataset['train'].features

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int32', id=None)}

In [79]:
from transformers import AutoTokenizer

In [80]:
type(raw_dataset['train'].data)

<class 'datasets.table.MemoryMappedTable'>

In [81]:
raw_dataset['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [82]:
checkpoint  = "distilbert-base-uncased"

In [83]:
# Laoding pretrained model for tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [84]:
# to see ids allocated to sentences
tokenized_sentence = tokenizer(raw_dataset['train'][0:3]['sentence'])

In [85]:
print(tokenized_sentence)

{'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], [101, 2008, 7459, 2049, 3494, 1998, 10639, 2015, 2242, 2738, 3376, 2055, 2529, 3267, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [86]:
def tokenize_func(batch):
    return tokenizer(batch['sentence'],truncation=True)


In [87]:
tokenized_dataset = raw_dataset.map(tokenize_func,batched=True)

In [88]:
from transformers import TrainingArguments

In [111]:
if torch.cuda.device_count() > 1:  # if multiple GPUs
       print("Let's use", torch.cuda.device_count(), "GPUs!")

In [113]:
training_args = TrainingArguments(
    output_dir="sentiment_analysis_trainer",
    evaluation_strategy='epoch',
    save_strategy="epoch",
    num_train_epochs=1,
)


In [114]:
from transformers import AutoModelForSequenceClassification

In [115]:
# Loading the pretrained model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels =2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
type(model)

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>

In [117]:
from torchinfo import summary

In [118]:
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [119]:
model
    

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [120]:
# Sanity check to see model parameters before the training
params_before = []
for name , p in model.named_parameters():
    params_before.append(p.detach().cpu().numpy())

In [121]:
from transformers import Trainer
from datasets import load_metric

In [122]:
metric = load_metric("glue","sst2")

In [123]:
# metric.compute(predictions=[1,0,1],references=[1,0,0])

In [124]:
def compute_metrics(logits_and_labels):
    logits,labels = logits_and_labels
    predictions = np.argmax(logits,axis=-1)
    return metric.compute(predictions=predictions,references=labels)

In [127]:
trainer = Trainer(
    model=model,
    args= training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset= tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

In [128]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2085,0.337862,0.909404


TrainOutput(global_step=8419, training_loss=0.22664838075666136, metrics={'train_runtime': 238.722, 'train_samples_per_second': 282.123, 'train_steps_per_second': 35.267, 'total_flos': 518596929468840.0, 'train_loss': 0.22664838075666136, 'epoch': 1.0})

In [134]:
trainer.save_model("sentiment_analysis_model")

In [131]:
from transformers import pipeline

In [151]:
our_model = pipeline("text-classification",model='/home/ubuntu/uzair/NLP/sentiment_analysis_model',device='cpu')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [144]:
sample_texts = [
    "I love this product! It's amazing.",
    "This movie was terrible, I hated it.",
    "The weather today is pleasant and sunny.",
    "The food at the restaurant was delicious.",
    "I'm feeling very happy and excited right now.",
    "This book is a masterpiece of literature.",
]

In [145]:
our_model(sample_texts)

[{'label': 'LABEL_1', 'score': 0.9988431930541992}, {'label': 'LABEL_0', 'score': 0.9981188178062439}, {'label': 'LABEL_1', 'score': 0.998943030834198}, {'label': 'LABEL_1', 'score': 0.9987308382987976}, {'label': 'LABEL_1', 'score': 0.9983748197555542}, {'label': 'LABEL_1', 'score': 0.998918890953064}]

In [148]:
# Inference

import json

config_path = "/home/ubuntu/uzair/NLP/sentiment_analysis_model/config.json"
with open(config_path) as f:
    d = json.load(f)
    # print(d)
    d['id2label'] = {0:'negative',1: 'positive'}

with open(config_path,'w') as f:
    json.dump(d,f,indent=2)



In [149]:
!cat /home/ubuntu/uzair/NLP/sentiment_analysis_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "vocab_size": 30522,
  "id2label": {
    "0": "negative",
    "1": "positive"
  }
}

In [152]:
our_model(sample_texts)

[{'label': 'positive', 'score': 0.9988431930541992}, {'label': 'negative', 'score': 0.9981188178062439}, {'label': 'positive', 'score': 0.998943030834198}, {'label': 'positive', 'score': 0.9987308382987976}, {'label': 'positive', 'score': 0.9983748197555542}, {'label': 'positive', 'score': 0.998918890953064}]

In [153]:
# Sanity check if params are changes during training or not
params_after = []
for name,p in model.named_parameters():
    params_after.append(p.detach().cpu().numpy())

In [154]:
for p1,p2 in zip(params_before,params_after):
    print(np.sum(np.abs(p1-p2)))

17931.072
110.000275
2.05167
1.2841101
1612.3315
2.1135306
1614.0089
0.0036122422
1468.5613
1.2392111
1409.5287
0.9639718
2.079465
0.9605099
6195.7256
6.9311614
5717.6377
0.890881
1.992882
0.86630344
1596.4199
1.9055786
1600.3307
0.0035844066
1406.9348
1.0768696
1369.3785
0.87917376
1.9602393
0.8766457
6094.748
6.4256124
5603.518
0.8368228
1.8786879
0.8925983
1573.2028
1.9689867
1583.6948
0.0032956232
1396.3035
0.9630036
1374.6539
0.8892582
1.9563774
0.95508695
6155.4814
7.050356
5513.952
0.8686353
1.7714043
0.8424982
1561.5447
1.7499917
1579.2345
0.0038739864
1428.6957
0.9432603
1380.324
0.9337509
1.8230643
0.91793245
5996.125
6.869168
5236.5376
0.8994397
1.691229
0.8050519
1482.8392
1.7797852
1482.8049
0.0022738744
1308.5636
0.86920226
1296.4889
1.0590011
1.7743952
1.1389029
5650.809
6.750546
4707.041
1.0823448
1.5869303
1.1002113
1386.1273
1.7020447
1414.8899
0.0016796269
1215.963
1.4130276
1201.0848
1.3824914
1.6803751
1.534333
4955.7896
6.573166
4486.851
1.3337083
2.193441
0.82992