In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [9]:
# Read data
data = pd.read_csv("./datasets/sentiments.csv")
data["Sentiment"] = data["Sentiment"].apply(lambda x: 1 if x=="positive" else 0)
# Define pretrained tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# ----- 1. Preprocess data -----#
# Preprocess data
X = list(data["Sentence"])
y = list(data["Sentiment"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [11]:


# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()





  0%|          | 0/1455 [00:00<?, ?it/s]

{'loss': 0.3922, 'learning_rate': 3.2817869415807564e-05, 'epoch': 1.03}


  0%|          | 0/122 [00:00<?, ?it/s]

{'eval_loss': 0.3355537950992584, 'eval_accuracy': 0.8927835051546392, 'eval_precision': 0.8122605363984674, 'eval_recall': 0.7940074906367042, 'eval_f1': 0.8030303030303032, 'eval_runtime': 3.2367, 'eval_samples_per_second': 299.686, 'eval_steps_per_second': 37.692, 'epoch': 1.03}
{'loss': 0.2349, 'learning_rate': 1.5635738831615122e-05, 'epoch': 2.06}


  0%|          | 0/122 [00:00<?, ?it/s]

{'eval_loss': 0.45490390062332153, 'eval_accuracy': 0.9, 'eval_precision': 0.84, 'eval_recall': 0.7865168539325843, 'eval_f1': 0.8123791102514506, 'eval_runtime': 3.4345, 'eval_samples_per_second': 282.426, 'eval_steps_per_second': 35.522, 'epoch': 2.06}
{'train_runtime': 321.9401, 'train_samples_per_second': 36.119, 'train_steps_per_second': 4.519, 'train_loss': 0.25119592463437634, 'epoch': 3.0}


TrainOutput(global_step=1455, training_loss=0.25119592463437634, metrics={'train_runtime': 321.9401, 'train_samples_per_second': 36.119, 'train_steps_per_second': 4.519, 'train_loss': 0.25119592463437634, 'epoch': 3.0})

In [16]:
# ----- 3. Predict -----#
# Load test data
test_data = pd.read_csv("./datasets/sentiments.csv")
X_test = list(test_data["Sentence"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Load trained model
model_path = "./output/checkpoint-500"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Define test trainer
test_trainer = Trainer(model)

# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

  0%|          | 0/606 [00:00<?, ?it/s]

In [30]:
print(sum(y_pred))
a=test_data.Sentiment.values
b = a[a == "positive"]
len(b)

1376


1363