### Application of Transformers (for Sentiment Analysis)
1. Pipeline('text-classification');
2. Fine-tuning the hyperparameters of the model;
3. Using CUDA instead of the CPU for faster training.

In [1]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from pprint import pprint
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from torchinfo import summary
from transformers import Trainer
from datasets import load_metric
from transformers import pipeline

In [2]:
# Setup note: the CUDA 11.7 wheels of PyTorch can be installed with
#   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
print("Pytorch CUDA Version is ", torch.version.cuda)

cuda:0
Pytorch CUDA Version is  11.7


In [3]:
# Requires the `transformers` and `datasets` packages:
# !pip install transformers datasets
# Load the SST-2 configuration of the GLUE dataset (train / validation / test splits).
raw_datasets = load_dataset('glue', 'sst2')

Found cached dataset glue (C:/Users/Sealion/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Display the DatasetDict to see which splits and columns were loaded.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
# Look at the training split on its own.
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [6]:
# List every attribute and method name exposed by the training split object.
# NOTE(review): exploratory only; the listing is very long.
dir(raw_datasets['train'])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

In [7]:
# Check the concrete class of the training split.
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [8]:
# Show the table object backing the training split; its repr prints the
# column schema followed by a preview of the values.
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [9]:
# Index 0 of the table is its first column ('sentence'), shown as a pyarrow ChunkedArray.
raw_datasets['train'].data[0]

<pyarrow.lib.ChunkedArray object at 0x000002320DF2A040>
[
  [
    "hide new secretions from the parental units ",
    "contains no wit , only labored gags ",
    "that loves its characters and communicates something rather beautiful about human nature ",
    "remains utterly satisfied to remain the same throughout ",
    "on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",
    ...
    "you wish you were at home watching that movie instead of in the theater watching this one ",
    "'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ",
    "underdeveloped ",
    "the jokes are flat ",
    "a heartening tale of small victories "
  ],
  [
    "suspense , intriguing characters and bizarre bank robberies , ",
    "a gritty police thriller with all the dysfunctional family dynamics one could wish for ",
    "with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to li

In [10]:
# Slicing rows returns a dict that maps each column name to a list of values.
raw_datasets['train'][1000:1003]

{'sentence': ['suspense , intriguing characters and bizarre bank robberies , ',
  'a gritty police thriller with all the dysfunctional family dynamics one could wish for ',
  'with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life '],
 'label': [1, 1, 1],
 'idx': [1000, 1001, 1002]}

In [11]:
# Column schema of the training split; 'label' is a ClassLabel whose names
# are ['negative', 'positive'], i.e. 0 = negative and 1 = positive.
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

#### Tokenizer

In [12]:
## Load the tokenizer that belongs to the pretrained 'distilbert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [13]:
## Test samples: the first three sentences of the training split
raw_datasets['train'][0:3]['sentence']

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ']

In [14]:
# Tokenize the three sample sentences; the result holds one 'input_ids' list
# and one 'attention_mask' list per sentence.
tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'])
print(tokenized_sentences)

{'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], [101, 2008, 7459, 2049, 3494, 1998, 10639, 2015, 2242, 2738, 3376, 2055, 2529, 3267, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [15]:
## Batch tokenization helper
def tokenize_fn(batch):
    """Tokenize the 'sentence' column of a batch with truncation enabled.

    Args:
        batch: mapping that provides the sentences under the 'sentence' key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded
# Run tokenize_fn over every split in batches; the tokenizer outputs are
# added as new columns next to the original ones.
tokenized_datasets = raw_datasets.map(function=tokenize_fn, batched=True)
tokenized_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [16]:
# Train for a single epoch, evaluating and saving a checkpoint at the end of it.
# Checkpoints are written under the 'my_trainer' directory.
training_args = TrainingArguments(
    output_dir='my_trainer',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1,
)

Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.scriptrun = azureml.core.script_run:ScriptRun._from_run_dto with exception (pywin32 228 (c:\users\sealion\anaconda3\lib\site-packages), Requirement.parse('pywin32==227; sys_platform == "win32"'), {'docker'}).


#### Model

In [17]:
# Load DistilBERT with a sequence-classification head sized for two labels.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# Display the concrete model class the Auto* factory resolved to.
type(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [18]:
# Move the model to the first GPU when CUDA is available, otherwise keep it on the CPU.
# Both branches now yield a torch.device; previously the CPU branch left a
# bare "cpu" string in `device`, so its type depended on the hardware.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [19]:
# Requires the `torchinfo` package:
# !pip install torchinfo
# Alternative call with an explicit input size:
# summary(model, input_size=(16,512), dtypes=['torch.IntTensor'], device="cpu")   ## or "cuda:0"
# Print the layer tree with per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [26]:
## Snapshot of the model parameters before fine-tuning
# Each array must be an independent copy. For a tensor that already lives on
# the CPU, `.cpu()` returns the same tensor and `.numpy()` returns a view that
# shares memory with it. Without `.copy()` the snapshot would follow the
# in-place optimizer updates, and the before/after comparison would print
# zeros when training runs on the CPU.
params_before = []
for name, p in model.named_parameters():
    params_before.append(p.detach().cpu().numpy().copy())

In [36]:
# Load the GLUE metric for the SST-2 task.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package; confirm against the installed version.
metric = load_metric('glue', 'sst2')

In [28]:
# Sanity check: two of the three predictions match the references,
# so the reported accuracy should be 2/3.
metric.compute(predictions=[1,0,1], references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [29]:
def compute_metrics(logits_and_labels):
    """Score model outputs against the reference labels.

    Args:
        logits_and_labels: a pair that unpacks into (logits, labels).

    Returns:
        The result of ``metric.compute`` for the predicted class ids
        versus the reference labels.
    """
    raw_scores, references = logits_and_labels
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

#### Training (long time)

In [30]:
# Wire the model, training arguments, tokenized splits and metric callback
# into a Trainer, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 67349
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8419
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.195,0.382774,0.896789


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 8
Saving model checkpoint to my_trainer\checkpoint-8419
Configuration saved in my_trainer\checkpoint-8419\config.json
Model weights saved in my_trainer\checkpoint-8419\pytorch_model.bin
tokenizer config file saved in my_trainer\checkpoint-8419\tokenizer_config.json
Special tokens file saved in my_trainer\checkpoint-8419\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8419, training_loss=0.26894088311948094, metrics={'train_runtime': 454.2661, 'train_samples_per_second': 148.259, 'train_steps_per_second': 18.533, 'total_flos': 518400815624736.0, 'train_loss': 0.26894088311948094, 'epoch': 1.0})

In [43]:
# Write the fine-tuned model to the 'my_saved_model' directory so it can be reloaded later.
trainer.save_model('my_saved_model')

Saving model checkpoint to my_saved_model
Configuration saved in my_saved_model\config.json
Model weights saved in my_saved_model\pytorch_model.bin
tokenizer config file saved in my_saved_model\tokenizer_config.json
Special tokens file saved in my_saved_model\special_tokens_map.json


In [48]:
## Reload the saved model through a text-classification pipeline (raw text in, label out).
# `pipeline` is already imported in the imports cell at the top of the notebook.
# device=0 selects the first CUDA GPU and -1 selects the CPU. Falling back to
# -1 keeps this cell runnable on machines without CUDA, where the previous
# hard-coded device=0 would fail.
newmodel = pipeline(
    'text-classification',
    model='my_saved_model',
    device=0 if torch.cuda.is_available() else -1,
)

loading configuration file my_saved_model\config.json
Model config DistilBertConfig {
  "_name_or_path": "my_saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading configuration file my_saved_model\config.json
Model config DistilBertConfig {
  "_name_or_path": "my_saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializ

In [55]:
# LABEL_1 is class index 1, which is 'positive' in the dataset's label names.
newmodel('This method works great')   # positive

[{'label': 'LABEL_1', 'score': 0.9996652603149414}]

In [56]:
# LABEL_0 is class index 0, which is 'negative' in the dataset's label names.
newmodel('This method works bad')     # negative 

[{'label': 'LABEL_0', 'score': 0.9978487491607666}]

In [60]:
## Collect the parameters after training and compare them with the earlier snapshot.
params_after = [p.detach().cpu().numpy() for name, p in model.named_parameters()]

# One line per parameter tensor: the summed absolute change caused by fine-tuning.
for before, after in zip(params_before, params_after):
    print(np.sum(np.abs(before - after)))

13485.047
91.91156
1.7997546
1.1370147
1305.2952
1.6901932
1294.8547
0.0026736008
1189.7288
1.0605416
1128.602
0.84680164
1.7160456
0.85775936
4919.0845
5.7258983
4515.952
0.69795597
1.6095532
0.67171335
1297.0327
1.6288805
1276.0375
0.002484723
1091.8843
0.8281398
1042.3605
0.73171365
1.533285
0.7256043
4867.947
5.3627076
4407.6807
0.692232
1.4428861
0.6802748
1273.0339
1.5718892
1277.5612
0.0022093891
1103.8116
0.7235434
1093.535
0.7237096
1.5213573
0.7750572
4882.576
5.5375686
4294.3545
0.6945188
1.4406718
0.6699518
1275.6019
1.3967298
1287.1619
0.0028095916
1131.5803
0.70965075
1088.8489
0.68552005
1.459613
0.70057225
4798.997
5.4915953
4087.5652
0.6691375
1.3874807
0.6912411
1206.1769
1.4691503
1212.032
0.0016011212
990.1749
0.71090114
1004.90375
0.8583193
1.3801863
0.8401797
4566.8
5.2452526
3504.8257
0.76673514
1.2926623
0.7090116
1161.9203
1.4425095
1135.8972
0.0009350345
936.16693
0.70254314
942.5768
0.85767925
1.3357252
1.1096609
3655.7778
4.494397
3223.699
0.96542
1.3582172


#### Acknowledgements:

1. Dataset: https://huggingface.co/datasets
2. Courses by Lazy Programmer

All of the above is for practice purposes only.