# Yves Leconte

In [1]:
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import plotly.express as px
from torch.utils.data import DataLoader
from torch import tensor
from datasets import Dataset
from transformers import BertTokenizer, DistilBertModel, DistilBertConfig
from tqdm.notebook import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
dataset = pd.read_csv('../data/sentiment_analysis_for_financial_news_[from_kaggle].csv', sep='^([^,]+),', engine='python',encoding='latin-1')
dataset.rename(columns={' Sentence;;;': 'Sentence'}, inplace=True)
dataset.drop(columns=['Unnamed: 0'], inplace=True)

We have an unbalanced dataset, let's derive some inverse-frequency based weights to be used later in the cross entropy loss

In [3]:
dataset['Sentiment'] = dataset['Sentiment'].replace({"\"neutral": 'neutral', "\"positive": 'positive', "\"negative": 'negative'})
dataset['Sentiment'].value_counts()

Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [4]:
n_samples = 4846

In [5]:
freq_neutral = dataset['Sentiment'].value_counts()['neutral']/n_samples
freq_positive = dataset['Sentiment'].value_counts()['positive']/n_samples
freq_negative = dataset['Sentiment'].value_counts()['negative']/n_samples

class_weights = [1/freq_negative, 1/freq_positive, 1/freq_neutral]

Let's make the data readable for HuggingFace's Bert based tokenizer

In [6]:
dataset = Dataset.from_pandas(dataset)

In [7]:
dataset

Dataset({
    features: ['Sentiment', 'Sentence'],
    num_rows: 4846
})

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

dataset = dataset.shuffle(seed=2024)

def preprocessing_fn(examples):
    tokenized_batch = tokenizer(
        examples["Sentence"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=False
    )

    # Using one-hot encoding is very convenient for multi label classification
    labels = torch.tensor([0 if sentiment == 'negative' else 1 if sentiment == 'positive' else 2 for sentiment in examples['Sentiment']])
    one_hot_labels = F.one_hot(labels, num_classes=3).float()
    
    tokenized_batch = {k: tensor(v) for k, v in tokenized_batch.items()}
    tokenized_batch['labels'] = one_hot_labels
    return tokenized_batch


split_dataset = dataset.select(range(n_samples))

tokenized_dataset = split_dataset.map(preprocessing_fn, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])

train_valid_split = tokenized_dataset.train_test_split(test_size=0.2)

train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [9]:
train_dataset

Dataset({
    features: ['Sentiment', 'Sentence', 'input_ids', 'token_type_ids', 'labels'],
    num_rows: 3876
})

Using a datacollator to pad the inputs sentences is particularly necessary when using Bert

In [10]:
class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        features = self.tokenizer.pad(
            batch, padding="longest", max_length=256, return_tensors="pt"
        )
        return features

In [11]:
data_collator = DataCollator(tokenizer)

In [12]:
batch_size = 32

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, collate_fn=data_collator
)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, collate_fn=data_collator
)

Let's build our classifier

In [13]:
class DistilBertClassifier(nn.Module):
    
    def __init__(self, num_classes=3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Linear(768, num_classes)
        
    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids, attention_mask)
        out = bert_out["last_hidden_state"][:, 0]
        logits = self.classifier(out)
        probabilities = F.softmax(logits, dim=1)
        return probabilities


In [14]:
model = DistilBertClassifier(3)
model.to(device)

DistilBertClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

We check that everything is fine

In [15]:
batch = next(iter(train_dataloader))

input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
output = model.forward(input_ids, attention_mask)

print(output.shape)
print(type(output))
print(output)

torch.Size([32, 3])
<class 'torch.Tensor'>
tensor([[0.3542, 0.3730, 0.2727],
        [0.3587, 0.3736, 0.2677],
        [0.3560, 0.3724, 0.2716],
        [0.3611, 0.3777, 0.2612],
        [0.3577, 0.3789, 0.2634],
        [0.3612, 0.3771, 0.2617],
        [0.3592, 0.3769, 0.2639],
        [0.3485, 0.3803, 0.2712],
        [0.3594, 0.3747, 0.2659],
        [0.3526, 0.3726, 0.2748],
        [0.3543, 0.3741, 0.2716],
        [0.3479, 0.3786, 0.2735],
        [0.3510, 0.3784, 0.2706],
        [0.3606, 0.3792, 0.2602],
        [0.3620, 0.3738, 0.2641],
        [0.3557, 0.3846, 0.2597],
        [0.3593, 0.3792, 0.2615],
        [0.3579, 0.3785, 0.2635],
        [0.3566, 0.3760, 0.2674],
        [0.3581, 0.3802, 0.2617],
        [0.3596, 0.3775, 0.2629],
        [0.3547, 0.3744, 0.2708],
        [0.3615, 0.3792, 0.2592],
        [0.3563, 0.3796, 0.2641],
        [0.3629, 0.3759, 0.2612],
        [0.3556, 0.3824, 0.2619],
        [0.3598, 0.3807, 0.2595],
        [0.3663, 0.3703, 0.2634],
     

In [16]:
def validation(model, valid_dataloader, num_class=3):
    total_size = 0
    acc_total = 0
    loss_total = 0
    class_correct = [0 for _ in range(num_class)]
    class_total = [0 for _ in range(num_class)]
    criterion = nn.CrossEntropyLoss(weight=tensor(class_weights).to(device))
    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            attention_mask = batch["attention_mask"]
            preds_probs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(preds_probs, labels)
            
            # Convert from one-hot to indices
            labels = torch.argmax(labels, dim=1)
            predicted_labels = torch.argmax(preds_probs, dim=1)

            acc = (predicted_labels == labels)
            total_size += acc.shape[0]
            acc_total += acc.sum().item()
            loss_total += loss.item()
            
            for i in range(num_class):
                class_total[i]+=torch.sum(labels==i).item()
                class_correct[i]+=torch.sum((labels==i) & (predicted_labels==i)).item()
            
    model.train()
    class_acc = [100*class_correct[i]/class_total[i] if class_total[i]!=0 else 0 for i in range(num_class)]
    return loss_total / len(valid_dataloader), acc_total / total_size, class_acc

In [17]:
def training(model, n_epochs, train_dataloader, valid_dataloader, lr=1e-6, num_class=3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-8)
    list_val_acc = []
    list_val_acc_class = []
    list_train_acc = []
    list_train_acc_class = []
    list_train_loss = []
    list_val_loss = []
    criterion = nn.CrossEntropyLoss(weight=tensor(class_weights).to(device))
    for e in range(n_epochs):
        # ========== Training ==========

        # Set model to training mode
        model.train()
        model.to(device)

        # Tracking variables
        train_loss = 0
        epoch_train_acc = 0
        class_correct = [0 for _ in range(num_class)]
        class_total = [0 for _ in range(num_class)]
        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            attention_mask = batch["attention_mask"]
            optimizer.zero_grad()
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Backward pass
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().cpu().item()
            
            labels = torch.argmax(labels, dim=1)
            predicted_labels = torch.argmax(outputs, dim=1)
            acc = (predicted_labels == labels)
            epoch_train_acc += acc.float().mean().item()
            
            for i in range(num_class):
                class_total[i]+=torch.sum((labels==i)).item()
                class_correct[i]+=((labels==i) & (predicted_labels==i)).float().mean().item()
            
        list_train_acc.append(100 * epoch_train_acc / len(train_dataloader))
        list_train_acc_class.append([100*class_correct[i]/class_total[i] if class_total[i]!=0 else 0 for i in range(num_class)])
        list_train_loss.append(train_loss / len(train_dataloader))

        # ========== Validation ==========

        validation_loss, validation_accuracy, validation_accuracy_class = validation(model, valid_dataloader)
        list_val_loss.append(validation_loss)
        list_val_acc.append(validation_accuracy * 100)
        list_val_acc_class.append(validation_accuracy_class)
        print(f"Epoch {e}, Train loss: {list_train_loss[-1]}, Train acc: {list_train_acc[-1]}, Val loss: {validation_loss}, Val acc: {validation_accuracy * 100}, Val class acc: {validation_accuracy_class}")
    return list_train_loss, list_train_acc, list_train_acc_class, list_val_loss, list_val_acc, list_val_acc_class

In [18]:
list_train_loss, list_train_acc, list_train_acc_class, list_val_loss, list_val_acc, list_val_acc_class = training(model, 25, train_dataloader, valid_dataloader)

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 0, Train loss: 3.2858822267243437, Train acc: 25.28176229508197, Val loss: 3.3005127359659285, Val acc: 26.082474226804127, Val class acc: [0.0, 100.0, 0.0]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 1, Train loss: 3.2622953588295585, Train acc: 42.36680327868852, Val loss: 3.2438563843409214, Val acc: 54.63917525773196, Val class acc: [36.434108527131784, 26.48221343873518, 70.74829931972789]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 2, Train loss: 3.184593304342801, Train acc: 55.942622950819676, Val loss: 3.153928966841502, Val acc: 60.824742268041234, Val class acc: [53.48837209302326, 9.090909090909092, 84.6938775510204]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 3, Train loss: 3.1025315649255454, Train acc: 58.657786885245905, Val loss: 3.0860805934748874, Val acc: 61.649484536082475, Val class acc: [51.16279069767442, 10.671936758893281, 85.8843537414966]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 4, Train loss: 3.0174087376073997, Train acc: 61.73155737704918, Val loss: 3.0030003777824517, Val acc: 62.16494845360825, Val class acc: [58.13953488372093, 14.624505928853756, 83.50340136054422]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 5, Train loss: 2.931941137115159, Train acc: 63.60143442622951, Val loss: 2.9132695447818007, Val acc: 62.577319587628864, Val class acc: [65.89147286821705, 29.64426877470356, 76.0204081632653]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 6, Train loss: 2.839875476369752, Train acc: 65.18954918032787, Val loss: 2.8297732963403273, Val acc: 65.4639175257732, Val class acc: [61.24031007751938, 43.47826086956522, 75.85034013605443]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 7, Train loss: 2.747566292453342, Train acc: 66.67520491803279, Val loss: 2.6948872755676803, Val acc: 67.93814432989691, Val class acc: [73.64341085271317, 47.03557312252964, 75.68027210884354]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 8, Train loss: 2.609760975443705, Train acc: 68.72438524590164, Val loss: 2.5596703694384906, Val acc: 68.14432989690722, Val class acc: [83.72093023255815, 62.84584980237154, 67.00680272108843]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 9, Train loss: 2.481234450295254, Train acc: 73.4375, Val loss: 2.456101664116356, Val acc: 69.79381443298969, Val class acc: [86.04651162790698, 70.35573122529644, 65.98639455782313]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 10, Train loss: 2.378065218279175, Train acc: 75.28176229508196, Val loss: 2.384402940291272, Val acc: 71.34020618556701, Val class acc: [85.27131782945736, 78.65612648221344, 65.1360544217687]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 11, Train loss: 2.299176311135019, Train acc: 76.46004098360656, Val loss: 2.3290953426636833, Val acc: 74.63917525773196, Val class acc: [86.04651162790698, 83.00395256916995, 68.5374149659864]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 12, Train loss: 2.2408353609380316, Train acc: 78.71413934426229, Val loss: 2.2885344819571043, Val acc: 75.36082474226804, Val class acc: [86.82170542635659, 81.81818181818181, 70.06802721088435]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 13, Train loss: 2.1982811652721943, Train acc: 79.96926229508196, Val loss: 2.2561781466217607, Val acc: 75.05154639175258, Val class acc: [89.92248062015504, 83.39920948616601, 68.19727891156462]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 14, Train loss: 2.15006563222051, Train acc: 81.89036885245902, Val loss: 2.2320787791495538, Val acc: 76.18556701030927, Val class acc: [88.37209302325581, 80.23715415019763, 71.7687074829932]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 15, Train loss: 2.149429217519191, Train acc: 80.71209016393442, Val loss: 2.2246245686482227, Val acc: 75.6701030927835, Val class acc: [91.47286821705427, 81.81818181818181, 69.5578231292517]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 16, Train loss: 2.1077721398633185, Train acc: 82.78688524590164, Val loss: 2.219621541001719, Val acc: 75.56701030927834, Val class acc: [91.47286821705427, 85.37549407114625, 67.85714285714286]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 17, Train loss: 2.088709427534683, Train acc: 82.94057377049181, Val loss: 2.1945955805341133, Val acc: 76.90721649484537, Val class acc: [90.69767441860465, 81.42292490118577, 71.93877551020408]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 18, Train loss: 2.075163542893424, Train acc: 82.96618852459017, Val loss: 2.204712618631643, Val acc: 77.31958762886599, Val class acc: [87.59689922480621, 84.58498023715416, 71.93877551020408]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 19, Train loss: 2.0632720530180775, Train acc: 83.99077868852459, Val loss: 2.189249537560074, Val acc: 77.42268041237114, Val class acc: [91.47286821705427, 83.39920948616601, 71.7687074829932]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 20, Train loss: 2.0536773092745024, Train acc: 84.32377049180327, Val loss: 2.1692427766579683, Val acc: 79.79381443298969, Val class acc: [90.69767441860465, 80.63241106719367, 77.04081632653062]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 21, Train loss: 2.0344283341260967, Train acc: 84.75922131147541, Val loss: 2.177579491032753, Val acc: 79.58762886597938, Val class acc: [89.92248062015504, 83.00395256916995, 75.85034013605443]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 22, Train loss: 2.022597679201006, Train acc: 85.57889344262296, Val loss: 2.1636906205962876, Val acc: 79.58762886597938, Val class acc: [91.47286821705427, 76.6798418972332, 78.2312925170068]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 23, Train loss: 2.014159594784705, Train acc: 85.63012295081967, Val loss: 2.159267711775616, Val acc: 79.69072164948454, Val class acc: [89.92248062015504, 81.42292490118577, 76.70068027210884]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 24, Train loss: 1.9997805155125017, Train acc: 85.91188524590164, Val loss: 2.1616611137803905, Val acc: 81.23711340206185, Val class acc: [87.59689922480621, 79.05138339920948, 80.78231292517007]


In [27]:
train_losses = pd.Series(name='Train loss', data=list_train_loss)
validation_losses = pd.Series(name='Validation loss', data=list_val_loss)
train_valid_losses = pd.concat([train_losses, validation_losses], axis=1)

validation_accuracy_class = pd.DataFrame(columns=['negative','positive','neutral'], data=list_val_acc_class)

In [30]:
fig = px.line(train_valid_losses, labels={'value': 'loss', 'index': 'epoch'}, title='Train vs Validation loss')
fig

In [31]:
fig = px.line(validation_accuracy_class, labels={'value': 'accuracy', 'index': 'epoch'}, title='Accuracy per class')
fig