In [1]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

In [2]:
# Read in the tab-separated reviews file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
path = "data/amazon_cells_labelled.txt"
amz_review = pd.read_csv(path, sep='\t', names=['review', 'label'])

# Take a look at the data
amz_review.head()

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [3]:
# Get the dataset information: column dtypes, non-null counts and memory usage
amz_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
# Check the label distribution to see whether the two classes are balanced
amz_review['label'].value_counts()

label
0    500
1    500
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Keep 60% of the reviews for training and hold out the remaining 40%.
# Stratifying on the label keeps the class balance identical in both parts.
train_data, holdout_data = train_test_split(
    amz_review, test_size=0.4, stratify=amz_review['label'], random_state=42
)

# Split the held-out 40% evenly into validation and test sets (20% of the data each).
# The hold-out gets its own name so `test_data` only ever refers to the final test split.
val_data, test_data = train_test_split(
    holdout_data, test_size=0.5, stratify=holdout_data['label'], random_state=42
)

# Sanity check: the three splits must partition the original data
assert len(train_data) + len(val_data) + len(test_data) == len(amz_review)

In [6]:
# Turn each pandas split into a Hugging Face (Arrow-backed) Dataset.
# The three conversions are identical, so do them in a single unpacking.
hg_train_data, hg_val_data, hg_test_data = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_data, val_data, test_data)
)

In [7]:
# Report how many examples ended up in the training split
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review: indexing a Dataset with an integer returns that record as a dict
hg_train_data[0]

The length of hg_train_data is 600.



{'review': "I bought this battery with a coupon from Amazon and I'm very happy with my purchase.",
 'label': 1,
 '__index_level_0__': 793}

In [8]:
# Validate the record against the original pandas dataframe.
# '__index_level_0__' carries the row's original index *label*, so it is read
# from the record (instead of being hard-coded) and looked up with the
# label-based .loc rather than the position-based .iloc.
original_index = hg_train_data[0]['__index_level_0__']
amz_review.loc[[original_index]]

Unnamed: 0,review,label
793,I bought this battery with a coupon from Amazo...,1


In [9]:
# Load the tokenizer that belongs to the pretrained "bert-base-cased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Take a look at the tokenizer (vocabulary size, max length, special tokens)
tokenizer



BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
# Mapping between special tokens and their IDs.
print(f'The unknown token is {tokenizer.unk_token} and the ID for the unknown token is {tokenizer.unk_token_id}.')
print(f'The separator token is {tokenizer.sep_token} and the ID for the separator token is {tokenizer.sep_token_id}.')
print(f'The pad token is {tokenizer.pad_token} and the ID for the pad token is {tokenizer.pad_token_id}.')
print(f'The sentence level classification token is {tokenizer.cls_token} and the ID for the classification token is {tokenizer.cls_token_id}.')
print(f'The mask token is {tokenizer.mask_token} and the ID for the mask token is {tokenizer.mask_token_id}.')

The unknown token is [UNK] and the ID for the unkown token is 100.
The seperator token is [SEP] and the ID for the seperator token is 102.
The pad token is [PAD] and the ID for the pad token is 0.
The sentence level classification token is [CLS] and the ID for the classification token is 101.
The mask token is [MASK] and the ID for the mask token is 103.


In [11]:
# Function to tokenize data
def tokenize_dataset(data, max_length=32):
    """Tokenize the ``review`` field of ``data`` with the notebook-level ``tokenizer``.

    Args:
        data: A record (or batch of records) exposing a ``"review"`` entry, as
            passed in by ``Dataset.map``.
        max_length: Number of tokens every encoded review is truncated or
            padded to. Defaults to 32, the length used throughout this notebook.

    Returns:
        The tokenizer's encoding for the review text; ``Dataset.map`` adds its
        entries to the dataset as new columns.
    """
    return tokenizer(
        data["review"],
        max_length=max_length,
        # Cut reviews longer than max_length ...
        truncation=True,
        # ... and pad shorter ones, so every example has the same length
        padding="max_length",
    )

# Tokenize the dataset.
# batched=True hands the tokenizer a batch of reviews per call instead of one
# review at a time. Every review is padded/truncated to the same fixed length,
# so the resulting columns are the same as with per-example mapping.
dataset_train = hg_train_data.map(tokenize_dataset, batched=True)
dataset_val = hg_val_data.map(tokenize_dataset, batched=True)
dataset_test = hg_test_data.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [12]:
# Summarize the three tokenized splits (feature names and row counts), one per line
print(dataset_train, dataset_val, dataset_test, sep="\n")

Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 600
})
Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})
Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})


In [13]:
# Load the pretrained "bert-base-cased" weights with a 2-label sequence-classification head.
# The classifier weights are not part of the checkpoint, so they are newly initialized
# and only become useful after the fine-tuning below.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Set up training arguments.
# Logging, checkpointing and evaluation all happen once per epoch. No
# logging_steps / save_steps / eval_steps are given because those values only
# apply when the corresponding strategy is 'steps'.
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",
    logging_dir='./sentiment_transfer_learning_transformer/logs',
    logging_strategy='epoch',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    eval_strategy='epoch',
    # Reload the best checkpoint once training finishes
    load_best_model_at_end=True
)

In [19]:
# Fetch the list of available evaluation modules once and reuse it below,
# instead of requesting the same listing twice
evaluation_modules = evaluate.list_evaluation_modules()

# Number of evaluation modules
print(f'There are {len(evaluation_modules)} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluation_modules

There are 190 evaluation models in Hugging Face.



['Remeris/rouge_ru',
 'lvwerra/test',
 'jordyvl/ece',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'lvwerra/bary_score',
 'hack/test_metric',
 'yzha/ctc_eval',
 'codeparrot/apps_metric',
 'mfumanelli/geometric_mean',
 'daiyizheng/valid',
 'erntkn/dice_coefficient',
 'mgfrantz/roc_auc_macro',
 'Vlasta/pr_auc',
 'gorkaartola/metric_for_tp_fp_samples',
 'idsedykh/metric',
 'idsedykh/codebleu2',
 'idsedykh/codebleu',
 'idsedykh/megaglue',
 'christopher/ndcg',
 'Vertaix/vendiscore',
 'GMFTBY/dailydialogevaluate',
 'GMFTBY/dailydialog_evaluate',
 'jzm-mailchimp/joshs_second_test_metric',
 'ola13/precision_at_k',
 'yulong-me/yl_metric',
 'abidlabs/mean_iou',
 'abidlabs/mean_iou2',
 'KevinSpaghetti/accuracyk',
 'NimaBoscarino/weat',
 'ronaldahmed/nwentfaithfulness',
 'Viona/infolm',
 'kyokote/my_metric2',
 'kashif/mape',
 'Ochiroo/rouge_mn',
 'giulio98/code_eval_outputs',
 'leslyarun/fbeta_score',
 'giulio98/codebleu',
 'anz2/iliauniiccocrevaluation',
 'zbeloki/m2',
 

In [20]:
# Accuracy metric, loaded on the first evaluation and reused afterwards so that
# the metric is not re-loaded every time the Trainer evaluates
_accuracy_metric = None


# Function to compute the metric
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as supplied by the Trainer.

    Returns:
        The result of the accuracy metric's ``compute`` call for the
        predicted versus reference labels.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred
    # The largest logit also has the largest softmax probability, so the
    # predicted class can be taken directly from the logits.
    predictions = np.argmax(logits, axis=1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [22]:
# Early stopping with a patience of one evaluation
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# Wire the model, arguments, data splits and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Fine-tune the model; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2571,0.244681,0.895
2,0.1055,0.275707,0.9


TrainOutput(global_step=300, training_loss=0.1812740421295166, metrics={'train_runtime': 106.3527, 'train_samples_per_second': 11.283, 'train_steps_per_second': 2.821, 'total_flos': 19733329152000.0, 'train_loss': 0.1812740421295166, 'epoch': 2.0})

In [24]:
# Run the fine-tuned model on the held-out test split
y_test_predict = trainer.predict(dataset_test)

# Take a look at the predictions (a PredictionOutput holding the raw model outputs)
y_test_predict

PredictionOutput(predictions=array([[-1.5654663 ,  2.3990934 ],
       [ 1.7581693 , -0.6715308 ],
       [ 0.09052113,  1.02621   ],
       [-2.4650466 ,  2.794814  ],
       [ 0.9079242 ,  0.28788352],
       [-1.5788181 ,  2.4674635 ],
       [ 1.2817857 , -0.357408  ],
       [ 1.5299478 , -0.43254912],
       [-2.3842678 ,  2.5726216 ],
       [-2.107672  ,  2.7823431 ],
       [ 1.7868149 , -0.69429886],
       [ 1.7423849 , -0.5829625 ],
       [ 1.71223   , -0.56120944],
       [ 1.6318128 , -0.6738697 ],
       [-0.4429009 ,  1.7435055 ],
       [-2.567539  ,  2.6445045 ],
       [ 1.8519938 , -0.71804506],
       [ 0.29203394,  0.57320195],
       [ 1.8237321 , -0.7084937 ],
       [ 1.6794803 , -0.41436952],
       [-0.8883786 ,  1.870488  ],
       [-0.76079154,  1.7905223 ],
       [ 1.7798735 , -0.5307578 ],
       [-2.4459465 ,  2.6110325 ],
       [-0.6942134 ,  1.7669176 ],
       [ 1.76245   , -0.66160893],
       [ 1.4274702 , -0.37966722],
       [ 1.6194515 , -0.50

In [26]:
# Predicted logits (raw, unnormalized scores: one column per class)
y_test_logits = y_test_predict.predictions

# First 5 predicted logits
y_test_logits[:5]

array([[-1.5654663 ,  2.3990934 ],
       [ 1.7581693 , -0.6715308 ],
       [ 0.09052113,  1.02621   ],
       [-2.4650466 ,  2.794814  ],
       [ 0.9079242 ,  0.28788352]], dtype=float32)

In [35]:
# Predicted probabilities: softmax turns each row of logits into class probabilities.
# NOTE(review): TensorFlow is used here only for softmax, so the result is a tf.Tensor
# rather than a NumPy array.
y_test_probabilities = tf.nn.softmax(y_test_logits)

# First 5 predicted probabilities
y_test_probabilities[:5]

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[0.01862299, 0.981377  ],
       [0.9190642 , 0.08093577],
       [0.28177202, 0.71822804],
       [0.00516917, 0.99483085],
       [0.6502278 , 0.34977219]], dtype=float32)>

In [36]:
# Predicted labels: the class with the highest probability in each row
y_test_pred_labels = np.argmax(y_test_probabilities, axis=1)

# First 5 predicted labels
y_test_pred_labels[:5]

array([1, 0, 1, 1, 0])

In [37]:
# Actual labels, as returned alongside the predictions
y_test_actual_labels = y_test_predict.label_ids

# First 5 actual labels
y_test_actual_labels[:5]

array([1, 0, 0, 1, 0])

In [38]:
# Trainer evaluate: loss and the compute_metrics accuracy on the test split
trainer.evaluate(dataset_test)

{'eval_loss': 0.2873287498950958,
 'eval_accuracy': 0.91,
 'eval_runtime': 4.9994,
 'eval_samples_per_second': 40.005,
 'eval_steps_per_second': 10.001,
 'epoch': 2.0}

In [42]:
# Load f1 metric
metric_f1 = evaluate.load("f1")

# Compute f1 metric from the predicted and actual test labels
metric_f1.compute(predictions=y_test_pred_labels, references=y_test_actual_labels)

{'f1': 0.9072164948453608}

In [44]:
# Load recall metric
metric_recall = evaluate.load("recall")

# Compute recall metric from the predicted and actual test labels
metric_recall.compute(predictions=y_test_pred_labels, references=y_test_actual_labels)

{'recall': 0.88}

In [45]:
# Save tokenizer next to the model so both can be reloaded from one directory
tokenizer.save_pretrained('./sentiment_transfer_learning_transformer/')

# Save model (same directory that was used as the training output_dir)
trainer.save_model('./sentiment_transfer_learning_transformer/')

In [46]:
# Load tokenizer from the saved directory.
# Note: this rebinds the notebook-level `tokenizer` name to the reloaded tokenizer.
tokenizer = AutoTokenizer.from_pretrained("./sentiment_transfer_learning_transformer/")

# Load the fine-tuned model from the same directory
loaded_model = AutoModelForSequenceClassification.from_pretrained('./sentiment_transfer_learning_transformer/')