In [18]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [19]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU

from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [39]:
from pymongo import MongoClient
import pandas as pd
from dotenv import load_dotenv
import os

# 1. Open the MongoDB connection; the URI is read from the environment (.env file)
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    # Fail fast: MongoClient(None) would silently fall back to a local default
    # server instead of the intended database.
    raise RuntimeError("MONGO_URI is not set; define it in the environment or in a .env file")

client = MongoClient(MONGO_URI)
db = client["ukraineBiasDB"]
collection = db["tweets_balanced"]

# 2. Aggregation pipeline (copied from MongoDB): expose the tweet text as
#    `sentiment` and add an empty-string `target` placeholder column
pipeline = [
    {
        '$project': {
            '_id': 1,
            'sentiment': '$text',
            'target': {
                '$literal': ''
            }
        }
    }
]

# 3. Run the aggregation and materialise the result as a DataFrame
# NOTE: a later cell defines a function that is also called `train`; once that
# cell has run, the name `train` no longer refers to this DataFrame.
cursor = collection.aggregate(pipeline)
train = pd.DataFrame(list(cursor))

# An empty collection yields a frame without columns, which would only fail
# further down with an obscure KeyError - check the expected schema here.
assert {'sentiment', 'target'} <= set(train.columns), "aggregation returned no usable rows"

# 4. (Optional) drop the `_id` column if it is not needed
# train = train.drop(columns=['_id'])

# Preview
train.head()

                        _id  \
0  67e02b4955c5d9c79f5dc4f5   
1  67e02b4955c5d9c79f5dc4f6   
2  67e02b4955c5d9c79f5dc4f7   
3  67e02b4955c5d9c79f5dc4f8   
4  67e02b4955c5d9c79f5dc4f9   

                                           sentiment target  
0  BREAKING: Trump responds to the bombshell New ...         
1  🔴 L'Occident a armé l'Ukraine et craint mainte...         
2  🚨BREAKING: Elon Musk says that American politi...         
3  What a twist! China may take part in peacekeep...         
4  Nothing to see here, just actors in Ukraine ge...         


In [21]:
# DataFrame shape as (rows, columns)
train.shape



(70, 3)

In [22]:
# List the distinct values of the `sentiment` column (it holds the projected tweet text)
train['sentiment'].unique()

array(['BREAKING: Trump responds to the bombshell New York Times report about Elon receiving top-secret briefings from the Pentagon about war with China by calling the NYT the enemy of the people. NYT wouldn’t have ran this if it wasn’t well-sourced.',
       '🔴 L\'Occident a armé l\'Ukraine et craint maintenant les conséquences – The Times  📍L\'Occident a sans cesse fourni des armes à l\'Ukraine, et maintenant, selon la publication britannique The Times, il fait face à des conséquences inquiétantes. Les experts avertissent que l\'Ukraine pourrait devenir un "bazaar d\'armes" en Europe.  📍Cela est source de commerce illégal d\'armes, qui pourrait se répandre à travers l\'Europe, l\'Afrique et le Moyen-Orient.  📍La publication cite l\'exemple de l\'ancienne Yougoslavie, où, après la fin du conflit en 2001, le pays est devenu le principal arsenal pour les terroristes et les criminels.  📍À Kyiv seule, au début du conflit, plus de 25 000 mitraillettes et 10 millions de cartouches ont été d

In [23]:
# Describe the dataset (count / unique / top / freq per column)
train.describe()

Unnamed: 0,_id,sentiment,target
count,70,70,70.0
unique,70,70,1.0
top,67e02b4955c5d9c79f5dc4f5,BREAKING: Trump responds to the bombshell New ...,
freq,1,1,70.0


In [24]:
# Creating a new dataframe with the two columns the Dataset class reads:
# `sentiment` (the text) and `target` (the label placeholder)
new_df = train[['sentiment', 'target']]
new_df.head()

Unnamed: 0,sentiment,target
0,BREAKING: Trump responds to the bombshell New ...,
1,🔴 L'Occident a armé l'Ukraine et craint mainte...,
2,🚨BREAKING: Elon Musk says that American politi...,
3,What a twist! China may take part in peacekeep...,
4,"Nothing to see here, just actors in Ukraine ge...",



## Preparing the Dataset and Dataloader
I will start by defining a few key variables that will be used later during the training/fine-tuning stage, followed by the creation of the Dataset class, which defines how the text is pre-processed before it is sent to the neural network. I will also define the Dataloader that feeds the data in batches to the neural network for training and processing. Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to the neural network. For further reading on Dataset and Dataloader, see the PyTorch documentation.

## SentimentData Dataset Class
This class is defined to accept the Dataframe as input and generate tokenized output that is used by the Roberta model for training.
I am using the Roberta tokenizer to tokenize the data in the `sentiment` column of the dataframe (this column holds the tweet text).
The tokenizer uses the encode_plus method to perform tokenization and generate the necessary outputs, namely: ids, attention_mask
To read further into the tokenizer, refer to this document
`target` is the encoded category (label) of the text.
The SentimentData class is used to create 2 datasets, for training and for validation.
Training Dataset is used to fine tune the model: 80% of the original data
Validation Dataset is used to evaluate the performance of the model. The model has not seen this data during training.

## Dataloader
Dataloader is used to create the training and validation dataloaders that load data into the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded into memory at once, hence the amount of data loaded into memory and then passed to the neural network needs to be controlled.
This control is achieved using the parameters such as batch_size and max_len.
Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [25]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256  # max_length handed to the tokenizer by the SentimentData dataset
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the testing DataLoader
# EPOCHS = 1
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer
# NOTE(review): `truncation` and `do_lower_case` are passed to from_pretrained here;
# presumably they have no effect on RobertaTokenizer (truncation is normally
# requested per encode call) - confirm and drop them if so.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [26]:
class SentimentData(Dataset):
    """Torch ``Dataset`` that tokenizes one DataFrame row per item.

    Args:
        dataframe: frame with a ``sentiment`` column (the text to tokenize)
            and a ``target`` column (the label, converted to a float tensor).
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.sentiment
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (texts) in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Positional access: samplers hand out positions 0..len-1, so label-based
        # lookup would break on any frame whose index was not reset.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (incl. newlines/tabs) into one space
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated; request fixed-length padding and
            # truncation explicitly so every item has exactly `max_len` entries
            # and batches can be stacked.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [27]:
# Reproducible 80/20 split: sample the training rows with a fixed seed,
# every row that was not sampled becomes the test set.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df[~new_df.index.isin(train_data.index)].reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap both splits in the tokenizing Dataset
training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (70, 2)
TRAIN Dataset: (56, 2)
TEST Dataset: (14, 2)


In [28]:
# Batching options for the two loaders
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

# One loader per split
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Creating the Neural Network for Fine Tuning

### Neural Network
-We will be creating a neural network with the RobertaClass.
-This network will have the Roberta Language model followed by a dropout and finally a Linear layer to obtain the final outputs.
-The data will be fed to the Roberta Language model as defined in the dataset.
-Final layer outputs is what will be compared to the Sentiment category to determine the accuracy of models prediction.
-We will initiate an instance of the network called model. This instance will be used for training and then to save the final trained model for future inference.

### Loss Function and Optimizer
-The Loss Function and Optimizer are defined in a later cell, after the model has been created.
-The Loss Function is used to calculate the difference between the output created by the model and the actual output.
-Optimizer is used to update the weights of the neural network to improve its performance.

In [29]:
class RobertaClass(torch.nn.Module):
    """Pretrained Roberta encoder followed by a small classification head.

    Args:
        num_classes: number of output logits (defaults to the previous
            hard-coded value of 5).
        dropout: dropout probability applied before the final linear layer.
        model_name: checkpoint name handed to ``RobertaModel.from_pretrained``.

    The head is: Linear(hidden, hidden) -> ReLU -> Dropout -> Linear(hidden,
    num_classes), where ``hidden`` is read from the loaded encoder's config
    (768 for ``roberta-base``) instead of being hard-coded.
    """

    def __init__(self, num_classes=5, dropout=0.3, model_name="roberta-base"):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained(model_name)
        # Take the width from the checkpoint so other model sizes work too
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw class logits for a batch of encoded texts."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # First element of the encoder output; presumably the per-token hidden states
        hidden_state = output_1[0]
        # Keep only position 0 of every sequence as the pooled representation
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: avoids constructing a new module on every forward pass
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [30]:
# Build the classifier and move it to the selected device;
# the trailing expression displays the module structure.
model = RobertaClass()
model = model.to(device)
model

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

## Fine Tuning the Model

After all the effort of loading and preparing the data and datasets, creating the model, and defining its loss and optimizer, this is probably the easiest step in the process.

Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network.

Following events happen in this function to fine tune the neural network:

The dataloader passes data to the model based on the batch size.
Subsequent output from the model and the actual category are compared to calculate the loss.
Loss value is used to optimize the weights of the neurons in the network.
After every 5000 steps the loss value is printed in the console.
As you can see just in 1 epoch by the final step the model was working with a loss of 0.8141926634122427.

In [31]:
# Loss function and optimizer used for fine-tuning
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [32]:
def calcuate_accuracy(preds, targets):
    """Return how many entries of ``preds`` equal the matching entry of ``targets``.

    Despite the name, this is a count of correct predictions, not a ratio.
    """
    hits = preds == targets
    total = hits.sum()
    return total.item()

In [33]:
# Defining the training function on the 80% of the dataset for tuning the Roberta model
# NOTE: this function reuses the name `train`, which an earlier cell binds to the
# tweets DataFrame; after this cell runs, `train` refers to the function.

def train(epoch, log_every=5000):
    """Run one training pass over ``training_loader`` and print loss/accuracy.

    Uses the notebook-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: epoch number, only used in the printed summary.
        log_every: print running loss/accuracy every ``log_every`` steps
            (step 0 included).

    Returns:
        None. Results are reported via ``print``.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can read its
    # length and show a real progress bar with a total.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # CrossEntropyLoss expects integer class indices
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest logit; detach() keeps this
        # bookkeeping out of the autograd graph
        big_val, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if step % log_every == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per {log_every} steps: {loss_step}")
            print(f"Training Accuracy per {log_every} steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_examples == 0:
        # An empty loader would otherwise end in a ZeroDivisionError below
        print(f"Epoch {epoch}: the training loader produced no batches; nothing was trained.")
        return

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [34]:
# Training is currently disabled: the loop below is wrapped in a string literal,
# so running this cell only echoes the text and never calls train().
# Remove the surrounding triple quotes to actually train.
"""EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch)"""

'EPOCHS = 1\nfor epoch in range(EPOCHS):\n    train(epoch)'

## Validating the Model

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data.

This unseen data is the 20% of the dataset that was separated during the Dataset creation stage. During the validation stage the weights of the model are not updated; only the final output is compared to the actual value. This comparison is then used to calculate the accuracy of the model.

As you can see the model is predicting the correct category of a given sample to a 69.47% accuracy which can further be improved by training more.

In [35]:
def valid(model, testing_loader, log_every=5000):
    """Evaluate ``model`` on ``testing_loader`` without updating weights.

    Uses the notebook-level ``loss_function`` and ``device``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that
            returns one row of class logits per example.
        testing_loader: iterable of batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.
        log_every: print running loss/accuracy every ``log_every`` steps
            (step 0 included).

    Returns:
        Accuracy over all examples in percent; ``0.0`` when the loader
        yields no batches.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            # No squeeze(): on a batch of size 1 it would drop the batch
            # dimension and break both the loss and the max over dim=1.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if step % log_every == 0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                # The message now reports the interval that is actually used
                print(f"Validation Loss per {log_every} steps: {loss_step}")
                print(f"Validation Accuracy per {log_every} steps: {accu_step}")

    if nb_tr_examples == 0:
        # An empty loader would otherwise end in a ZeroDivisionError below
        print("Validation skipped: the loader produced no batches.")
        return 0.0

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [37]:
# Evaluation is currently disabled: the call below is wrapped in a string literal,
# so running this cell only echoes the text and never calls valid().
# Remove the surrounding triple quotes to actually evaluate.
"""acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)"""

'acc = valid(model, testing_loader)\nprint("Accuracy on test data = %0.2f%%" % acc)'

## Saving the Trained Model Artifacts for inference

This is the final step in the process of fine tuning the model.

The model and its vocabulary are saved locally. These files are then used in the future to make inference on new inputs of news headlines.

In [None]:
# Saving is currently disabled: everything below is a string literal, so running
# this cell writes no files. Remove the surrounding triple quotes to enable it.
# NOTE(review): the disabled code passes the whole model object to torch.save;
# saving model.state_dict() is usually the more portable choice - confirm before enabling.
"""output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')"""