In [None]:
# Mount Google Drive so the dataset stored there is accessible from this runtime.

from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder so that the relative paths
# used later in the notebook (e.g. "dataset/..." and "./models/...") resolve there.

import os
os.chdir('/content/drive/MyDrive/TD-bert/')

In [None]:
#Install the Hugging Face Transformers library:

!pip install transformers

In [None]:
# Import the necessary modules:

import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
# Load the pretrained fast BERT tokenizer of the multilingual cased checkpoint:

tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Load the pre-trained BERT model and prepare it for sequence classification:

num_labels = 3  # one output class each for positive, negative, and neutral
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_labels)

In [None]:
# Register the [TAR] marker as an extra token in the tokenizer's vocabulary, then
# resize the model's token embeddings to the new vocabulary size (the tokenizer
# must be extended first, because the new size is taken from len(tokenizer)):

tokenizer.add_tokens(["[TAR]"])
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Defining a custom dataset class for sentiment analysis:

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sentiment classification.

    Args:
        texts: Iterable of input strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class ids, aligned position-by-position with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., truncation=..., padding=...)`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` entries.
        max_length: Passed to the tokenizer as the truncation/padding length.

    Each item is a dict with ``"input_ids"``, ``"attention_mask"`` and ``"label"``
    entries, all ``torch.long`` tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize both inputs as plain lists so that ``__getitem__`` always
        # indexes by POSITION. A pandas Series is indexed by label instead, which
        # raises KeyError (or returns the wrong row) as soon as the Series does
        # not carry the default 0..n-1 index, e.g. after filtering or shuffling.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it together with its label."""
        text = self.texts[idx]
        label = self.labels[idx]

        # NOTE(review): add_special_tokens=False means the tokenizer is asked not
        # to add its special tokens itself - confirm that this is intended for
        # the classification model being trained.
        encoding = self.tokenizer(
            text,
            add_special_tokens=False,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )

        return {
            "input_ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoding["attention_mask"], dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Read the train, validation and test splits (semicolon-separated, UTF-8 CSV files):
train_data, validation_data, test_data = [
    pd.read_csv(csv_path, sep=';', encoding='utf-8')
    for csv_path in (
        "dataset/tar_train_dataset.csv",
        "dataset/tar_validation_dataset.csv",
        "dataset/tar_test_dataset.csv",
    )
]

# Preview the first rows of the training split.
train_data.head()

In [None]:
# The T-BERT model is trained, validated and tested against the targeted
# sentiments: every split contributes its "Text" column as the input and its
# "Targeted Sentiment" column as the label.

train_texts, train_labels = train_data["Text"], train_data["Targeted Sentiment"]
val_texts, val_labels = validation_data["Text"], validation_data["Targeted Sentiment"]
test_texts, test_labels = test_data["Text"], test_data["Targeted Sentiment"]

In [None]:
# Translate the string labels into integer class ids (an unknown label raises KeyError)
label_mapping = {"negativ": 0, "neutral": 1, "positiv": 2}
train_labels = list(map(label_mapping.__getitem__, train_labels))
val_labels = list(map(label_mapping.__getitem__, val_labels))
test_labels = list(map(label_mapping.__getitem__, test_labels))

# Wrap each split in a SentimentDataset that shares the same tokenizer
train_dataset, val_dataset, test_dataset = [
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
]

# Batch the datasets; only the training split is shuffled
batch_size = 24
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Set the device and move the model to the device:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

model = model.to(device)

In [None]:
# Count how many training examples carry each class id (0, 1 and 2):

class_count_0, class_count_1, class_count_2 = [
    train_labels.count(class_id) for class_id in (0, 1, 2)
]

def getKey(dict, value):
  """Return the first key of ``dict`` whose mapped value equals ``value``.

  Keys are examined in the mapping's iteration order. An IndexError is raised
  when no key maps to ``value``.
  """
  matching_keys = [key for key in dict if dict[key] == value]
  return matching_keys[0]

# Report the number of training examples per class, one "<label name>: <count>" line each.
print("Class counts:")
print(
    "\n".join(
        f"{getKey(label_mapping, class_id)}: {class_total}"
        for class_id, class_total in enumerate(
            (class_count_0, class_count_1, class_count_2)
        )
    )
)

In [None]:
# Set the class weights and define the loss function:

from torch import nn
import torch.nn.functional as F

# Weight every class by the inverse of its training-set frequency so that
# rarer classes contribute more to the cross-entropy loss.
class_weights = torch.tensor(
    [1 / class_total for class_total in (class_count_0, class_count_1, class_count_2)]
).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
# Prepare the optimizer and learning rate scheduler:

epochs = 2
# The training helper steps the scheduler once per batch, so the schedule spans
# (number of epochs) x (batches per epoch) steps in total.
num_training_steps = epochs * len(train_loader)
lr = 1e-5
weight_decay = 0.1
# NOTE(review): if epochs * len(train_loader) is below 300 the warm-up phase
# never completes within training - confirm against the dataset size.
warmup_steps = 300

optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)


In [None]:
# Define helper functions for training and evaluation:

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels`` is passed).
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` used as the training
            objective. When ``None``, the loss computed by the model itself
            (``outputs.loss``) is used instead.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.

    Returns:
        float: Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model = model.train()
    total_train_loss = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Start every step from clean gradients, even if something accumulated
        # gradients on the parameters before this function was called.
        optimizer.zero_grad()

        if loss_fn is None:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            # Optimize the loss the caller supplied (e.g. a class-weighted
            # cross-entropy) instead of the model's built-in loss; previously
            # ``loss_fn`` was accepted but silently ignored.
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

        total_train_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return total_train_loss / len(data_loader)

def eval_epoch(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` and return the mean batch loss.

    Runs in evaluation mode with gradient tracking disabled; no parameters are
    updated.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels`` is passed).
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` used to score each batch.
            When ``None``, the loss computed by the model itself
            (``outputs.loss``) is used instead.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model = model.eval()
    total_eval_loss = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            if loss_fn is None:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            else:
                # Score with the caller's loss (e.g. class-weighted cross-entropy)
                # so the value is comparable with the training loss; previously
                # ``loss_fn`` was accepted but silently ignored.
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)

            total_eval_loss += loss.item()

    return total_eval_loss / len(data_loader)


In [None]:
# Calculate performance metrics:

from sklearn.metrics import precision_recall_fscore_support

def get_predictions(model, data_loader, device):
    """Collect the predicted and the true class ids for every example in ``data_loader``.

    The model is switched to evaluation mode and run without gradient tracking.
    For each batch the predicted class is the index of the largest logit.

    Returns:
        tuple[list, list]: ``(predictions, true_labels)`` as flat Python lists,
        in the order the loader yields the examples.
    """
    model = model.eval()
    predicted_ids, gold_ids = [], []

    with torch.no_grad():
        for batch in data_loader:
            gold = batch["label"].to(device)
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits

            # torch.max over dim=1 yields (values, indices); only the indices matter.
            best_class = torch.max(logits, dim=1)[1]

            predicted_ids += best_class.cpu().tolist()
            gold_ids += gold.cpu().tolist()

    return predicted_ids, gold_ids

def get_f1_scores(model, device):
    """Print and return the weighted F1 score on the train and test loaders.

    Uses the notebook-level ``train_loader`` and ``test_loader`` together with
    ``get_predictions``. Both scores use ``average='weighted'`` and
    ``zero_division=0``.

    Args:
        model: Model passed on to ``get_predictions``.
        device: Device passed on to ``get_predictions``.

    Returns:
        tuple: ``(train_f1, test_f1)``.
    """
    print("F1 Scores:")
    train_preds, train_true = get_predictions(model, train_loader, device)
    # scikit-learn expects (y_true, y_pred). The arguments used to be swapped for
    # the train split, which changes the support weights of the weighted average
    # and made the train score inconsistent with the test score.
    train_f1 = precision_recall_fscore_support(train_true, train_preds, average='weighted', zero_division=0)[2]
    print(f"\tTrain dataset against targeted sentiment labels: {train_f1:.4f}")

    test_preds, test_true = get_predictions(model, test_loader, device)
    test_f1 = precision_recall_fscore_support(test_true, test_preds, average='weighted', zero_division=0)[2]
    print(f"\tTest dataset against targeted sentiment labels: {test_f1:.4f}")

    return train_f1, test_f1

In [None]:
import csv

# Train and evaluate the model, logging one row of metrics per epoch to a CSV file.

# Make sure the output folder exists before the results file is opened.
os.makedirs("./models/t-bert_marked", exist_ok=True)

# The context manager closes the file even if an epoch raises; newline="" is
# what the csv module requires for files handed to csv.writer.
with open("./models/t-bert_marked/results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Epoch","Train loss", "Val. loss", "Test loss","Train data F1", "Test data F1"])

    for epoch in range(epochs):
        print("-" * 10)
        print(f"Epoch {epoch+1}/{epochs}")

        train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler)
        print(f"Train Loss: {train_loss:.4f}")

        val_loss = eval_epoch(model, val_loader, loss_fn, device)
        print(f"Validation Loss: {val_loss:.4f}")

        test_loss = eval_epoch(model, test_loader, loss_fn, device)
        print(f"Test Loss: {test_loss:.4f}")

        # Checkpoint the model and tokenizer of this epoch in their own folders.
        model.save_pretrained(f"./models/t-bert_marked/epoch{epoch+1}/model")
        tokenizer.save_pretrained(f"./models/t-bert_marked/epoch{epoch+1}/tokenizer")
        print("Saved.")

        train_f1, test_f1 = get_f1_scores(model, device)

        writer.writerow([epoch+1,train_loss,val_loss,test_loss,train_f1,test_f1])
        # Push the row to disk right away so finished epochs survive a runtime
        # disconnect or a crash in a later epoch.
        f.flush()
