# Yves Leconte

In [120]:
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import plotly.express as px
from torch.utils.data import DataLoader
from torch import tensor
from datasets import Dataset
from transformers import BertTokenizer, DistilBertModel, DistilBertConfig
from tqdm.notebook import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [121]:
dataset = pd.read_csv('../data/sentiment_analysis_for_financial_news_[from_kaggle].csv', sep='^([^,]+),', engine='python',encoding='latin-1')
dataset.rename(columns={' Sentence;;;': 'Sentence'}, inplace=True)
dataset.drop(columns=['Unnamed: 0'], inplace=True)

We have an unbalanced dataset, let's derive some inverse-frequency based weights to be used later in the cross entropy loss

In [122]:
dataset['Sentiment'] = dataset['Sentiment'].replace({"\"neutral": 'neutral', "\"positive": 'positive', "\"negative": 'negative'})
dataset['Sentiment'].value_counts()

Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [123]:
n_samples = 4846

In [124]:
freq_neutral = dataset['Sentiment'].value_counts()['neutral']/n_samples
freq_positive = dataset['Sentiment'].value_counts()['positive']/n_samples
freq_negative = dataset['Sentiment'].value_counts()['negative']/n_samples

class_weights = [1/freq_negative, 1/freq_positive, 1/freq_neutral]

Let's make the data readable for HuggingFace's Bert based tokenizer

In [125]:
dataset = Dataset.from_pandas(dataset)

In [126]:
dataset

Dataset({
    features: ['Sentiment', 'Sentence'],
    num_rows: 4846
})

In [127]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

dataset = dataset.shuffle(seed=2024)

def preprocessing_fn(examples):
    tokenized_batch = tokenizer(
        examples["Sentence"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=False
    )

    # Using one-hot encoding is very convenient for multi label classification
    labels = torch.tensor([0 if sentiment == 'negative' else 1 if sentiment == 'positive' else 2 for sentiment in examples['Sentiment']])
    one_hot_labels = F.one_hot(labels, num_classes=3).float()
    
    tokenized_batch = {k: tensor(v) for k, v in tokenized_batch.items()}
    tokenized_batch['labels'] = one_hot_labels
    return tokenized_batch


split_dataset = dataset.select(range(n_samples))

tokenized_dataset = split_dataset.map(preprocessing_fn, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])

train_valid_split = tokenized_dataset.train_test_split(test_size=0.2)

train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [128]:
train_dataset

Dataset({
    features: ['Sentiment', 'Sentence', 'input_ids', 'token_type_ids', 'labels'],
    num_rows: 3876
})

Using a datacollator to pad the inputs sentences is particularly necessary when using Bert

In [129]:
class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        features = self.tokenizer.pad(
            batch, padding="longest", max_length=256, return_tensors="pt"
        )
        return features

In [130]:
data_collator = DataCollator(tokenizer)

In [131]:
batch_size = 32

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, collate_fn=data_collator
)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, collate_fn=data_collator
)

Let's build our classifier

In [132]:
class DistilBertClassifier(nn.Module):
    
    def __init__(self, num_classes=3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Linear(768, num_classes)
        
    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids, attention_mask)
        out = bert_out["last_hidden_state"][:, 0]
        logits = self.classifier(out)
        probabilities = F.softmax(logits, dim=1)
        return probabilities


In [133]:
model = DistilBertClassifier(3)
model.to(device)

DistilBertClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

We check that everything is fine

In [134]:
batch = next(iter(train_dataloader))

input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
output = model.forward(input_ids, attention_mask)

print(output.shape)
print(type(output))
print(output)

torch.Size([32, 3])
<class 'torch.Tensor'>
tensor([[0.3219, 0.2650, 0.4130],
        [0.3153, 0.2695, 0.4153],
        [0.3271, 0.2582, 0.4146],
        [0.3211, 0.2603, 0.4186],
        [0.3267, 0.2621, 0.4111],
        [0.3203, 0.2637, 0.4160],
        [0.3212, 0.2650, 0.4138],
        [0.3221, 0.2617, 0.4162],
        [0.3237, 0.2657, 0.4107],
        [0.3171, 0.2659, 0.4170],
        [0.3200, 0.2648, 0.4153],
        [0.3210, 0.2625, 0.4165],
        [0.3266, 0.2667, 0.4067],
        [0.3187, 0.2642, 0.4171],
        [0.3283, 0.2642, 0.4074],
        [0.3155, 0.2688, 0.4157],
        [0.3219, 0.2606, 0.4175],
        [0.3275, 0.2683, 0.4042],
        [0.3269, 0.2603, 0.4128],
        [0.3334, 0.2607, 0.4059],
        [0.3247, 0.2634, 0.4120],
        [0.3219, 0.2630, 0.4151],
        [0.3207, 0.2655, 0.4138],
        [0.3310, 0.2645, 0.4045],
        [0.3217, 0.2630, 0.4153],
        [0.3212, 0.2649, 0.4139],
        [0.3206, 0.2691, 0.4103],
        [0.3263, 0.2620, 0.4117],
     

In [135]:
def validation(model, valid_dataloader, num_class=3):
    total_size = 0
    acc_total = 0
    loss_total = 0
    class_correct = [0 for _ in range(num_class)]
    class_total = [0 for _ in range(num_class)]
    criterion = nn.CrossEntropyLoss(weight=tensor(class_weights).to(device))
    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            attention_mask = batch["attention_mask"]
            preds_probs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(preds_probs, labels)
            
            # Convert from one-hot to indices
            labels = torch.argmax(labels, dim=1)
            predicted_labels = torch.argmax(preds_probs, dim=1)

            acc = (predicted_labels == labels)
            total_size += acc.shape[0]
            acc_total += acc.sum().item()
            loss_total += loss.item()
            
            for i in range(num_class):
                class_total[i]+=torch.sum(labels==i).item()
                class_correct[i]+=torch.sum((labels==i) & (predicted_labels==i)).item()
            
    model.train()
    class_acc = [100*class_correct[i]/class_total[i] if class_total[i]!=0 else 0 for i in range(num_class)]
    return loss_total / len(valid_dataloader), acc_total / total_size, class_acc

In [136]:
def training(model, n_epochs, train_dataloader, valid_dataloader, lr=1e-6, num_class=3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-8)
    list_val_acc = []
    list_val_acc_class = []
    list_train_acc = []
    list_train_acc_class = []
    list_train_loss = []
    list_val_loss = []
    criterion = nn.CrossEntropyLoss(weight=tensor(class_weights).to(device))
    for e in range(n_epochs):
        # ========== Training ==========

        # Set model to training mode
        model.train()
        model.to(device)

        # Tracking variables
        train_loss = 0
        epoch_train_acc = 0
        class_correct = [0 for _ in range(num_class)]
        class_total = [0 for _ in range(num_class)]
        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            attention_mask = batch["attention_mask"]
            optimizer.zero_grad()
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Backward pass
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().cpu().item()
            
            labels = torch.argmax(labels, dim=1)
            predicted_labels = torch.argmax(outputs, dim=1)
            acc = (predicted_labels == labels)
            epoch_train_acc += acc.float().mean().item()
            
            for i in range(num_class):
                class_total[i]+=torch.sum((labels==i)).item()
                class_correct[i]+=((labels==i) & (predicted_labels==i)).float().mean().item()
            
        list_train_acc.append(100 * epoch_train_acc / len(train_dataloader))
        list_train_acc_class.append([100*class_correct[i]/class_total[i] if class_total[i]!=0 else 0 for i in range(num_class)])
        list_train_loss.append(train_loss / len(train_dataloader))

        # ========== Validation ==========

        validation_loss, validation_accuracy, validation_accuracy_class = validation(model, valid_dataloader)
        list_val_loss.append(validation_loss)
        list_val_acc.append(validation_accuracy * 100)
        list_val_acc_class.append(validation_accuracy_class)
        print(f"Epoch {e}, Train loss: {list_train_loss[-1]}, Train acc: {list_train_acc[-1]}, Val loss: {validation_loss}, Val acc: {validation_accuracy * 100}, Val class acc: {validation_accuracy_class}")
    return list_train_loss, list_train_acc, list_train_acc_class, list_val_loss, list_val_acc, list_val_acc_class

In [137]:
list_train_loss, list_train_acc, list_train_acc_class, list_val_loss, list_val_acc, list_val_acc_class = training(model, 25, train_dataloader, valid_dataloader)

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 0, Train loss: 3.295780714069327, Train acc: 47.84836065573771, Val loss: 3.3170356725430783, Val acc: 51.85567010309279, Val class acc: [44.62809917355372, 0.6993006993006993, 79.39609236234459]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 1, Train loss: 3.2554850278780427, Train acc: 50.99897540983606, Val loss: 3.2504756511427773, Val acc: 55.77319587628866, Val class acc: [52.892561983471076, 1.048951048951049, 84.19182948490231]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 2, Train loss: 3.1808123858472817, Train acc: 58.19672131147541, Val loss: 3.191284988729264, Val acc: 55.77319587628866, Val class acc: [59.50413223140496, 0.6993006993006993, 82.94849023090586]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 3, Train loss: 3.1273061157890725, Train acc: 55.814549180327866, Val loss: 3.145864463836644, Val acc: 56.49484536082474, Val class acc: [65.2892561983471, 0.0, 83.30373001776199]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 4, Train loss: 3.030311929668566, Train acc: 58.17110655737705, Val loss: 3.0806306832883723, Val acc: 55.670103092783506, Val class acc: [76.03305785123968, 1.048951048951049, 79.04085257548846]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 5, Train loss: 2.9376082850622836, Train acc: 60.60450819672131, Val loss: 3.004434209598819, Val acc: 56.70103092783505, Val class acc: [83.47107438016529, 1.3986013986013985, 79.04085257548846]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 6, Train loss: 2.8610824930139254, Train acc: 62.14139344262295, Val loss: 2.9377131015757816, Val acc: 58.55670103092784, Val class acc: [88.4297520661157, 3.4965034965034967, 80.10657193605684]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 7, Train loss: 2.8043652195615985, Train acc: 64.39549180327869, Val loss: 2.883886110834135, Val acc: 59.381443298969074, Val class acc: [90.9090909090909, 7.3426573426573425, 79.04085257548846]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 8, Train loss: 2.72675122590794, Train acc: 66.41905737704919, Val loss: 2.8155564408195644, Val acc: 63.50515463917525, Val class acc: [86.77685950413223, 18.53146853146853, 81.34991119005329]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 9, Train loss: 2.6764227535503937, Train acc: 67.13627049180327, Val loss: 2.78635262089696, Val acc: 61.855670103092784, Val class acc: [90.08264462809917, 24.475524475524477, 74.77797513321492]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 10, Train loss: 2.6110651136636074, Train acc: 69.8514344262295, Val loss: 2.7179074807298154, Val acc: 65.05154639175258, Val class acc: [86.77685950413223, 33.21678321678322, 76.55417406749557]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 11, Train loss: 2.561909729929879, Train acc: 70.03073770491804, Val loss: 2.6676975365262563, Val acc: 65.15463917525773, Val class acc: [85.12396694214875, 41.60839160839161, 72.82415630550622]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 12, Train loss: 2.498971192180733, Train acc: 71.82377049180327, Val loss: 2.6376620682322987, Val acc: 66.59793814432989, Val class acc: [81.81818181818181, 57.34265734265734, 68.0284191829485]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 13, Train loss: 2.4595240158057687, Train acc: 72.89959016393442, Val loss: 2.5893365725551156, Val acc: 68.8659793814433, Val class acc: [78.51239669421487, 64.68531468531468, 68.9165186500888]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 14, Train loss: 2.388233451072698, Train acc: 73.6936475409836, Val loss: 2.4951849142382616, Val acc: 67.31958762886599, Val class acc: [91.73553719008264, 61.18881118881119, 65.18650088809947]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 15, Train loss: 2.316578728672905, Train acc: 76.20389344262296, Val loss: 2.4595236470435853, Val acc: 69.38144329896907, Val class acc: [86.77685950413223, 64.68531468531468, 68.0284191829485]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 16, Train loss: 2.28887138061582, Train acc: 77.51024590163935, Val loss: 2.4608972679178547, Val acc: 68.96907216494846, Val class acc: [83.47107438016529, 79.37062937062937, 60.56838365896981]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 17, Train loss: 2.252443008871467, Train acc: 78.02254098360656, Val loss: 2.4146206082602766, Val acc: 70.61855670103093, Val class acc: [86.77685950413223, 74.12587412587412, 65.36412078152753]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 18, Train loss: 2.2044666366306678, Train acc: 79.35450819672131, Val loss: 2.399328263445174, Val acc: 72.37113402061856, Val class acc: [85.9504132231405, 69.23076923076923, 71.04795737122558]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 19, Train loss: 2.195349333040341, Train acc: 79.71311475409836, Val loss: 2.3847976100864607, Val acc: 72.98969072164948, Val class acc: [83.47107438016529, 73.77622377622377, 70.33747779751332]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 20, Train loss: 2.1732859659016337, Train acc: 80.73770491803279, Val loss: 2.3791466353784556, Val acc: 70.51546391752578, Val class acc: [87.60330578512396, 79.72027972027972, 62.16696269982238]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 21, Train loss: 2.157305978443865, Train acc: 80.5327868852459, Val loss: 2.328354236651688, Val acc: 72.57731958762886, Val class acc: [92.56198347107438, 77.27272727272727, 65.89698046181172]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 22, Train loss: 2.135025038467289, Train acc: 81.7110655737705, Val loss: 2.3299850613399817, Val acc: 76.18556701030927, Val class acc: [88.4297520661157, 71.32867132867133, 76.02131438721136]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 23, Train loss: 2.1322994821592443, Train acc: 81.9672131147541, Val loss: 2.3057914451338006, Val acc: 74.63917525773196, Val class acc: [88.4297520661157, 77.62237762237763, 70.15985790408526]


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Epoch 24, Train loss: 2.096142611820231, Train acc: 82.99180327868852, Val loss: 2.2950587141212035, Val acc: 76.5979381443299, Val class acc: [89.25619834710744, 73.42657342657343, 75.48845470692717]


In [138]:
train_losses = pd.Series(name='Train loss', data=list_train_loss)
validation_losses = pd.Series(name='Validation loss', data=list_val_loss)
train_valid_losses = pd.concat([train_losses, validation_losses], axis=1)

validation_accuracy_class = pd.DataFrame(columns=['negative','positive','neutral'], data=list_val_acc_class)


In [139]:
fig = px.line(train_valid_losses, labels={'value': 'loss', 'index': 'epoch'}, title='Train vs Validation loss')
fig

In [141]:
fig = px.line(validation_accuracy_class, labels={'value': 'accuracy', 'index': 'epoch'}, title='Accuracy per class')
fig