# Step 1: INstall And Import Python Libraries

In [None]:
# Install libraries
#https://grabngoinfo.com/transfer-learning-for-text-classification-using-hugging-face-transformers-trainer/

: 

In [2]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

ModuleNotFoundError: No module named 'pandas'

# Step 2: Download And Read Data

In [14]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')


In [15]:
# imdb_review = pd.read_csv('/content/drive/MyDrive/data.csv')
imdb_review = pd.read_csv('./data.csv')

imdb_review = imdb_review.rename(columns={'sentiment': 'label'})

imdb_review['label']= imdb_review['label'].map({'positive': 1, 'negative': 0})

imdb_review.head()


Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [16]:
imdb_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [17]:
# Check the label distribution

imdb_review['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

#Step 3:  Train Test  Split

In [18]:
# Training dataset
train_data = imdb_review.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = imdb_review.drop(train_data.index)

# Check the number of records in training and testing dataset.
print(f'The training dataset has {len(train_data)} records.')
print(f'The testing dataset has {len(test_data)} records.')

The training dataset has 40000 records.
The testing dataset has 10000 records.


# Step 4: Convert Pandas Dataframe To Hugging Face Dataset


Hugging Face Dataset objects are memory mapped on drive so they are not limited by RAM memory which is very helpful for processing large datasets

In [19]:
# Convert pyhton dataframe to Hugging Face arrow dataset
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

In [20]:
# Length of the Dataset
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
hg_train_data[0]

The length of hg_train_data is 40000.



{'review': "I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozuna 

# Step 5: Tokenize Text

A tokenizer converts text into numbers to use as the input of the NLP (Natural Language Processing) models.

In [21]:
# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Take a look at the tokenizer
tokenizer

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 9.62kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 304kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.09MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 4.60MB/s]


BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [22]:
# Mapping between special tokens and their IDs.
print(f'The unknown token is {tokenizer.unk_token} and the ID for the unkown token is {tokenizer.unk_token_id}.')
print(f'The seperator token is {tokenizer.sep_token} and the ID for the seperator token is {tokenizer.sep_token_id}.')
print(f'The pad token is {tokenizer.pad_token} and the ID for the pad token is {tokenizer.pad_token_id}.')
print(f'The sentence level classification token is {tokenizer.cls_token} and the ID for the classification token is {tokenizer.cls_token_id}.')
print(f'The mask token is {tokenizer.mask_token} and the ID for the mask token is {tokenizer.mask_token_id}.')

The unknown token is [UNK] and the ID for the unkown token is 100.
The seperator token is [SEP] and the ID for the seperator token is 102.
The pad token is [PAD] and the ID for the pad token is 0.
The sentence level classification token is [CLS] and the ID for the classification token is 101.
The mask token is [MASK] and the ID for the mask token is 103.


In [23]:
# Function to tokenize data

def tokenize_dataset(data):
    return tokenizer(data['review'], max_length=32, truncation=True, padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

                                                                   

In [24]:
# Take a look at the data
print(dataset_train)
print(dataset_test)
print(dataset_train[0])

Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40000
})
Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})
{'review': "I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes 

# Step 6: Load Pretrained Model




- AutoModelForSequenceClassification loads the BERT model without the sequence classification head.
- The method from_pretrained() loads the weights from the pretrained model into the new model, so the weights in the new model are not randomly initialized. Note that the new weights for the new sequence classification head are going to be randomly initialized.
- bert-base-cased is the name of the pretrained model. We can change it to a different model based on the nature of the project.
- num_labels indicates the number of classes. Our dataset has two classes, positive and negative, so num_labels=2.


In [25]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading pytorch_model.bin: 100%|██████████| 436M/436M [00:16<00:00, 26.8MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

# Step 7 Set Training Argument

Hugging Face has 96 parameters for TrainingArguments, which provides a lot of flexibility in fine-tuning the transfer learning model.

In [26]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",          
    logging_dir='./sentiment_transfer_learning_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=2,              
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

Step 8: Set Evaluation Metrics

In step 8, we will set the evaluation metric because Hugging Face Trainer does not evaluate the model performance automatically during the training process.

In [27]:
# Number of evaluation modules
print(f'There are {len(evaluate.list_evaluation_modules())} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluate.list_evaluation_modules()

There are 161 evaluation models in Hugging Face.



['lvwerra/test',
 'precision',
 'code_eval',
 'roc_auc',
 'cuad',
 'xnli',
 'rouge',
 'pearsonr',
 'mse',
 'super_glue',
 'comet',
 'cer',
 'sacrebleu',
 'mahalanobis',
 'wer',
 'competition_math',
 'f1',
 'recall',
 'coval',
 'mauve',
 'xtreme_s',
 'bleurt',
 'ter',
 'accuracy',
 'exact_match',
 'indic_glue',
 'spearmanr',
 'mae',
 'squad',
 'chrf',
 'glue',
 'perplexity',
 'mean_iou',
 'squad_v2',
 'meteor',
 'bleu',
 'wiki_split',
 'sari',
 'frugalscore',
 'google_bleu',
 'bertscore',
 'matthews_correlation',
 'seqeval',
 'trec_eval',
 'rl_reliability',
 'jordyvl/ece',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'lvwerra/bary_score',
 'kaggle/amex',
 'kaggle/ai4code',
 'hack/test_metric',
 'yzha/ctc_eval',
 'codeparrot/apps_metric',
 'mfumanelli/geometric_mean',
 'daiyizheng/valid',
 'poseval',
 'erntkn/dice_coefficient',
 'mgfrantz/roc_auc_macro',
 'Vlasta/pr_auc',
 'gorkaartola/metric_for_tp_fp_samples',
 'idsedykh/metric',
 'idsedykh/codebleu2',
 'idsed

In [28]:
# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

# Step9: Train Model Using Transformer Trainer

In [29]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

In [None]:
trainer.train()

: 

# Step 11: Evaluate Model Performance

In [None]:
# Trainer evaluate
trainer.evaluate(dataset_test)

{'eval_loss': 0.5471986532211304,
 'eval_accuracy': 0.76,
 'eval_runtime': 39.6333,
 'eval_samples_per_second': 252.313,
 'eval_steps_per_second': 63.078,
 'epoch': 2.0}

# Step 12: Save and Load The Model

In [None]:
# Save tokenizer
tokenizer.save_pretrained('./sentiment_transfer_learning_transformer/')

# Save model
trainer.save_model('./sentiment_transfer_learning_transformer/')

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./sentiment_transfer_learning_transformer/")

# Load model
loaded_model = AutoModelForSequenceClassification.from_pretrained('./sentiment_transfer_learning_transformer/')

In [None]:
# !zip -r sentiment_transfer_learning_transformer.zip sentiment_transfer_learning_transformer/ 

  adding: sentiment_transfer_learning_transformer/ (stored 0%)
  adding: sentiment_transfer_learning_transformer/special_tokens_map.json (deflated 42%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/ (stored 0%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/rng_state.pth (deflated 28%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/config.json (deflated 49%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/optimizer.pt (deflated 17%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/training_args.bin (deflated 49%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/pytorch_model.bin (deflated 7%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/trainer_state.json (deflated 63%)
  adding: sentiment_transfer_learning_transformer/checkpoint-20000/scheduler.pt (deflated 49%)
  adding: sentiment_transfer_learning_transformer/config.json (deflated 49%)
  adding: s

# Step 13: Analysis with SHAP