In [54]:
#classifying customer reviews
from transformers import DistilBertModel,  DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import os
from datetime import datetime
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertConfig
from sklearn.utils.class_weight import compute_class_weight
from transformers import EarlyStoppingCallback, IntervalStrategy


In [55]:
#variables
seed_val = 1
train_subset = 100
test_subset = 50
num_epochs = 4
batch_size = 32
lr = 0.0001
dp = 0.2
wd = 0.01
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_list = np.array([0, 1])
sv_limit = 2
test_size = 0.3
esp = 3
evs = 2

In [56]:
#load data
dataset = pd.read_csv("bert_customer/sentiment-analysis.csv", sep=r'\s*,\s*', engine='python')

#print(dataset.head)
print(len(dataset))
print(dataset.columns)
dataset.columns = dataset.columns.str.replace('"', '')
dataset.columns = dataset.columns.str.replace("'", '')
dataset.columns = dataset.columns.str.strip()
print(dataset.columns)


cleaned_data = dataset.dropna(subset=['Sentiment']).copy()

print(len(cleaned_data))
print(cleaned_data["Sentiment"].unique())

cleaned_data.loc[:, "Sentiment"] = cleaned_data["Sentiment"].map({"Positive": 1, "Negative": 0})
cleaned_data["Sentiment"] = cleaned_data["Sentiment"].astype(int)

data_sentiment = cleaned_data["Sentiment"]

print(cleaned_data["Sentiment"].isnull().sum())
print(cleaned_data["Sentiment"].unique())

98
Index(['"Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location',
       'Confidence Score"'],
      dtype='object')
Index(['Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location',
       'Confidence Score'],
      dtype='object')
96
['Positive' 'Negative']
0
[1 0]


In [57]:
#Tokenize data
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_func(data):
    return tokenizer(data, padding="max_length", truncation=True, max_length=128)

tokenized_data = cleaned_data["Text"].apply(tokenize_func)

dataset_hf = Dataset.from_pandas(cleaned_data)


dataset_hf = dataset_hf.add_column("input_ids", [x["input_ids"] for x in tokenized_data])
dataset_hf = dataset_hf.add_column("attention_mask", [x["attention_mask"] for x in tokenized_data])



dataset_hf = dataset_hf.add_column("labels", cleaned_data["Sentiment"].values.astype(int))

dataset_hf.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
dataset_hf = dataset_hf.train_test_split(test_size)
dataset_hf = dataset_hf.map(lambda x: {"labels": int(x["labels"])})

#print(dataset_hf["train"][0])
print(cleaned_data["Sentiment"].unique())



Map: 100%|██████████| 67/67 [00:00<00:00, 729.96 examples/s]
Map: 100%|██████████| 29/29 [00:00<00:00, 697.80 examples/s]

[1 0]





In [None]:
#build model
trained_dataset = dataset_hf["train"].shuffle(seed=seed_val).select(range(len(dataset_hf["train"]))) 
test_dataset = dataset_hf["test"].shuffle(seed=seed_val).select(range(len(dataset_hf["test"])))

bert_config=DistilBertConfig.from_pretrained("distilbert-base-uncased", hidden_dropout_prob=dp, num_labels=2)
#attention mechanism is SDPA by default
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=bert_config)

print(model)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [59]:
#training model

#output_dir="bert_imdb/bert_results", takes up too much space on laptop
def comp_scores(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

training_args = TrainingArguments(
    output_dir="bert_customer/bert_results",
    eval_strategy="steps",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=wd,
    logging_dir="bert_customer/bert_logs",
    logging_steps=1,
    save_steps=10,
    #fp16=True,
    save_total_limit=sv_limit,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    eval_steps=evs
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trained_dataset,
    eval_dataset=test_dataset,
    compute_metrics=comp_scores,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=esp)]
)

try:
    trainer.train()
except Exception as e:
    print(f"Error with training model through Trainer: {e}")

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
2,0.6928,0.660869,0.551724,0.551724,1.0,0.711111
4,0.6468,0.615556,0.551724,0.551724,1.0,0.711111
6,0.469,0.525046,0.551724,0.551724,1.0,0.711111
8,0.4293,0.399583,0.965517,0.941176,1.0,0.969697
10,0.3724,0.332392,0.965517,0.941176,1.0,0.969697
12,0.2502,0.299353,0.965517,0.941176,1.0,0.969697


Could not locate the best model at bert_customer/bert_results\checkpoint-8\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


In [60]:
#save model
curr_date = datetime.now().strftime("%Y%m%d_%H%M_%f")

try:
    model_path = f"models/customer_model_{curr_date}"
    tokenizer_path = f"tokenizers/customer_tokenizer_{curr_date}"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)
except Exception as e:
    print(f"Error with saving model and tokenizer: {e}")

In [69]:
#predictions
model.to(device)
model.eval()

pred_list = []
true_label = []




count = 0
print("Beginning prediction")


predictions = trainer.predict(test_dataset)

pred_prob = predictions.predictions
true_labels = predictions.label_ids
pred_classes = np.argmax(pred_prob, axis=1)

pred_list.extend(pred_classes)
true_label.extend(true_labels)

np_pred = np.array(pred_list)
np_label = np.array(true_label)

print("Ending prediction. Beginning evaluation")

if pred_list and true_label:
     try:
          

          accuracy = accuracy_score(np_label, np_pred)
          precision = precision_score(np_label, np_pred, zero_division=0)
          recall = recall_score(np_label, np_pred, zero_division=0)
          f1 = f1_score(np_label, np_pred, zero_division=0)
     except Exception as e:
          print(f"Error with calculating scores: {e}")
     print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1: {f1}")
     cm = confusion_matrix(np_label, np_pred)
     print(f"Confusion matrix:\n{cm}")
     print("Ending evaluation")
     #for i in range(10):
          #print(f"Predicted sentiment: {pred_list[i]}")
     
else:
     print("Skipping evaluation")


Beginning prediction


Ending prediction. Beginning evaluation
Accuracy: 0.9655172413793104
Precision: 0.9411764705882353
Recall: 1.0
F1: 0.9696969696969697
Confusion matrix:
[[12  1]
 [ 0 16]]
Ending evaluation
