# CLT Project - Stage III





- **Author:**             Arian Contessotto, Tim Giger, Levin Reichmuth
- **Submission Date:**    1 June 2023

## 1. Prerequisites and Load

If necessary, install the required packages.

In [None]:
# Required package installation
!pip install transformers
!pip install torch

### 1.1 Import Packages and Make Downloads

In [None]:
# Imports
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score,  mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
warnings.filterwarnings("ignore")

### 1.2 Load Annotated Data

The final dataframe from stage one is loaded. These data are the basis for stage two.

In [47]:
# Define file name
esg_file = '../stage2/annotated/full_llm_annotated.csv'

# Define function to load and merge data
def load_data(file):

    # Load the data
    df = pd.read_csv(file, delimiter = '|')

    # Apply eval function
    df['esg_topics'] = df['esg_topics'].apply(eval)
    df['sentence_tokens'] = df['sentence_tokens'].apply(eval)
    df['sentiment_llm_continuous'] = df['sentiment_llm_continuous'].apply(eval)
    df['sentiment_llm_categorial'] = df['sentiment_llm_categorial'].apply(eval)

    return df

df = load_data(esg_file)

# Print shape and diyplay header
print(df.shape)
df.head()

(11071, 17)


Unnamed: 0,company,datatype,title,date,domain,esg_topics,internal,symbol,sentence_tokens,market_cap_in_usd_b,sector,industry,year_month,year,month,sentiment_llm_continuous,sentiment_llm_categorial
0,Beiersdorf,sustainability_report,BeiersdorfAG Sustainability Report 2021,2021-03-31,,"[CleanWater, GHGEmission, ProductLiability, Va...",1,BEI,[brands strategy sustainability agenda care be...,25.99,Consumer Staples,Household & Personal Products,2021-03,2021,3,"[0.4510161280632019, 0.6138720512390137, 0.226...","[0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 1.0, ..."
1,Deutsche Telekom,sustainability_report,DeutscheTelekomAG Sustainability Report 2021,2021-03-31,,"[DataSecurity, Iso50001, GlobalWarming, Produc...",1,DTE,"[management facts, deutsche telekom cr report,...",101.78,Communication Services,Telecom Services,2021-03,2021,3,"[0.35756340622901917, 0.29088783264160156, 0.3...","[0.5, 0.0, 0.5, 0.5, 0.0, 0.5, 0.5, 0.5, 0.5, ..."
2,Vonovia,sustainability_report,VonoviaSE Sustainability Report 2021,2021-03-31,,"[Whistleblowing, DataSecurity, Vaccine, GHGEmi...",1,VNA,"[sustainable future, sustainability report dea...",20.35,Real Estate,Real Estate Services,2021-03,2021,3,"[0.4570336639881134, 0.45287153124809265, 0.26...","[0.5, 0.5, 0.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.0, ..."
3,Merck,sustainability_report,MerckKGaA Sustainability Report 2021,2021-03-31,,"[DataSecurity, DataMisuse, DrugResistance, Iso...",1,MRK,[management employees profile attractive emplo...,87.64,Healthcare,Drug Manufacturers—Specialty & Generic,2021-03,2021,3,"[0.36378589272499084, 0.6118267178535461, 0.48...","[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ..."
4,MTU,sustainability_report,MTUAeroEngines Sustainability Report 2020,2020-03-31,,"[WorkLifeBalance, Corruption, AirQuality, Data...",1,MTX,[sustainability goes far beyond climate action...,12.24,Industrials,Aerospace & Defense,2020-03,2020,3,"[0.46082836389541626, 0.46208637952804565, 0.4...","[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5, ..."


### 1.3 Create different Dataframes (Sentences & full Document)

In [48]:
# Function to create Subset to test code
def create_subset(df):
    
    # Select 10 rows with internal value of 1
    subset_1 = df[df['internal'] == 1].head(10)

    # Select 100 rows with internal value of 0
    subset_0 = df[df['internal'] == 0].head(100)

    # Concatenate the two subsets and reset the index
    subset = pd.concat([subset_1, subset_0])
    subset = subset.reset_index(drop=True)

    return subset

# Create subset
subset_df = create_subset(df)

# Display header and shape
print(subset_df.shape)
subset_df.head()

(110, 17)


Unnamed: 0,company,datatype,title,date,domain,esg_topics,internal,symbol,sentence_tokens,market_cap_in_usd_b,sector,industry,year_month,year,month,sentiment_llm_continuous,sentiment_llm_categorial
0,Beiersdorf,sustainability_report,BeiersdorfAG Sustainability Report 2021,2021-03-31,,"[CleanWater, GHGEmission, ProductLiability, Va...",1,BEI,[brands strategy sustainability agenda care be...,25.99,Consumer Staples,Household & Personal Products,2021-03,2021,3,"[0.4510161280632019, 0.6138720512390137, 0.226...","[0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 1.0, ..."
1,Deutsche Telekom,sustainability_report,DeutscheTelekomAG Sustainability Report 2021,2021-03-31,,"[DataSecurity, Iso50001, GlobalWarming, Produc...",1,DTE,"[management facts, deutsche telekom cr report,...",101.78,Communication Services,Telecom Services,2021-03,2021,3,"[0.35756340622901917, 0.29088783264160156, 0.3...","[0.5, 0.0, 0.5, 0.5, 0.0, 0.5, 0.5, 0.5, 0.5, ..."
2,Vonovia,sustainability_report,VonoviaSE Sustainability Report 2021,2021-03-31,,"[Whistleblowing, DataSecurity, Vaccine, GHGEmi...",1,VNA,"[sustainable future, sustainability report dea...",20.35,Real Estate,Real Estate Services,2021-03,2021,3,"[0.4570336639881134, 0.45287153124809265, 0.26...","[0.5, 0.5, 0.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.0, ..."
3,Merck,sustainability_report,MerckKGaA Sustainability Report 2021,2021-03-31,,"[DataSecurity, DataMisuse, DrugResistance, Iso...",1,MRK,[management employees profile attractive emplo...,87.64,Healthcare,Drug Manufacturers—Specialty & Generic,2021-03,2021,3,"[0.36378589272499084, 0.6118267178535461, 0.48...","[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ..."
4,MTU,sustainability_report,MTUAeroEngines Sustainability Report 2020,2020-03-31,,"[WorkLifeBalance, Corruption, AirQuality, Data...",1,MTX,[sustainability goes far beyond climate action...,12.24,Industrials,Aerospace & Defense,2020-03,2020,3,"[0.46082836389541626, 0.46208637952804565, 0.4...","[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5, ..."


In [49]:
# Function to create sentence dataset
def create_sentence_df(data):
  
  # Get sentences and sentiments from data
  sentence_lists = data['sentence_tokens'].tolist()
  sentences = [sentence for sublist in sentence_lists for sentence in sublist]
  sentiment_list = data['sentiment_llm_categorial'].tolist()
  sentiment = [sentiment for sublist in sentiment_list for sentiment in sublist]

  # Create pandas dataframe
  df = {'sentence': sentences, 'sentiment': sentiment}
  sentence_df = pd.DataFrame(df)

  return sentence_df

# Create sentence data
sentence_df = create_sentence_df(df)

# Display header and shape
print(sentence_df.shape)
sentence_df.head()

(678529, 2)


Unnamed: 0,sentence,sentiment
0,brands strategy sustainability agenda care bey...,0.5
1,successfully reduced carbon footprint absolute...,0.5
2,end consumer business returned levels reduced ...,0.0
3,decoupling human economic activity natural res...,0.0
4,inspired beiersdorf ambitious sustainability a...,0.5


In [57]:
# Function to create document data
def create_document_df(data):

    # Ensure sentence_tokens and sentiment_llm_continuous are in the correct list format
    data['document'] = data['sentence_tokens']

    # Compute the mean of the computed sentiment and discretize it
    def discretize_sentiment(value):
        if value <= 0.33:
            return 0.0
        elif value <= 0.66:
            return 0.5
        else:
            return 1.0

    data['sentiment'] = data['sentiment_llm_continuous'].apply(np.mean).apply(discretize_sentiment)
    
    # Return only the "sentence_tokens" and discretized mean of the sentiment
    return data[['document', 'sentiment']]

# Create sentence data
document_df = create_document_df(df)

# Display header and shape
print(document_df.shape)
document_df.head()

(11071, 2)


Unnamed: 0,document,sentiment
0,brands strategy sustainability agenda care bey...,0.5
1,management facts deutsche telekom cr report th...,0.5
2,sustainable future sustainability report dear ...,0.5
3,management employees profile attractive employ...,0.5
4,sustainability goes far beyond climate action ...,0.5


As a result, the models can be trained and tested with 2 approaches:  
- A dataframe containing the full document and a discretized mean sentiment of all included sentences
- A dataframe containing each sentence with the corresponding discretized sentiment  

"Discretized" corresponds to the labels 0.0 (negative), 0.5 (neutral) and 1.0 (positive).

## Model Training

As a first test, we use the lightweight "distilbert-base-uncased" model and fine-tune it on the full documents and the sentences.  
Since BERT only accepts 512 input word tokens, the full documents are heavyily truncated.

In [58]:
# Define pretrained tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1) # 1 label to get a continuous score between 0 and 1

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.2.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.l

### Approch 1: Train on the truncated (full) documents

In [76]:
# Split the data with a 70%, 15% and 15% ratio (train, valid, test)
X = list(document_df["document"])
y = list(document_df["sentiment"])
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3) # Split 70% train data
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5) # Split the other 30% in 50% each to get the correct ratio

# Tokenize the datasets
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

In [77]:
# Create the torch dataset to use dataset in PyTorch and override necessary methods
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

# Create the train, validation and test dataset as PyTorch datasets
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset = Dataset(X_test_tokenized, y_test)

In [102]:
# Functio to compute the comparison metrics
def compute_metrics(p):
    pred, labels = p
    
    # Use the appropriate metrics, since we don't have discrete classes but a continous score 
    mse = mean_squared_error(y_true=labels, y_pred=pred)
    mae = mean_absolute_error(y_true=labels, y_pred=pred)
    r2 = r2_score(y_true=labels, y_pred=pred)

    return {"MSE": mse, "MAE": mae, "R2": r2}

In [119]:
# Define training arguments
args = TrainingArguments(
    output_dir="./full_documents",
    evaluation_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    seed=0,
    optim="adamw_torch", # Use newer PyTorch optimizer
    learning_rate=2e-5,
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    report_to='tensorboard'
)

# Define Huggingface Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [97]:
# Load Tensorboard
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [121]:
# Kill potential Tensorboard process, so it don't block the port
!pkill -f "tensorboard"

In [123]:
# Start Tensorboard to monitor training process
%tensorboard --logdir ./ --port 6010

In [120]:
# Delete cache and train pre-trained model
torch.cuda.empty_cache()
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 10.00 GiB total capacity; 9.20 GiB already allocated; 0 bytes free; 9.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Approch 2: Train on single sentences

In [None]:
# Split the data with a 70%, 15% and 15% ratio (train, valid, test)
X = list(sentence_df["sentence"])
y = list(sentence_df["sentiment"])
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3) # Split 70% train data
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5) # Split the other 30% in 50% each to get the correct ratio

# Tokenize the datasets
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

In [23]:
# Again, create the Torch datasets
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [35]:
# Define training arguments
args = TrainingArguments(
    output_dir="./single_sentences", # To store logs seperately
    evaluation_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    seed=0,
    optim="adamw_torch", # Use newer PyTorch optimizer
    learning_rate=2e-5,
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    report_to='tensorboard'
)

# Define Huggingface Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Kill potential Tensorboard process, so it don't block the port
!pkill -f "tensorboard"

In [None]:
# Monitor training
%tensorboard --logdir ./ --port 6010

In [None]:
# Delete cache and train pre-trained model
torch.cuda.empty_cache()
trainer.train()

### Compare the models

In [None]:
# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Load trained model
model_path ="TBD"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=1)

# Define test trainer
test_trainer = Trainer(model)

# Make predictions
predictions = test_trainer.predict(test_dataset)

## Train, Dev, Test Split

In [None]:
# Split the DataFrame into train, dev, and test sets (70%, 15% and 15%)
train_df, temp_df = train_test_split(sentence_df, test_size=0.3, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

## Tokenize Sentences

In [None]:
# Define the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the sentences and convert them to input features
def tokenize_sentences(sentences):
    input_ids = []
    attention_masks = []
    
    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=100,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Tokenize the sentences in the train, dev, and test sets
train_sentences = train_df['sentence'].tolist()
dev_sentences = dev_df['sentence'].tolist()
test_sentences = test_df['sentence'].tolist()
train_inputs, train_masks = tokenize_sentences(train_sentences)
dev_inputs, dev_masks = tokenize_sentences(dev_sentences)
test_inputs, test_masks = tokenize_sentences(test_sentences)

## Convert sentiment scores to tensors

In [None]:
# Convert the sentiment scores to tensors
train_labels = torch.tensor(train_df['sentiment'].tolist())
dev_labels = torch.tensor(dev_df['sentiment'].tolist())
test_labels = torch.tensor(test_df['sentiment'].tolist())

## Create DataLoader and Load Data

In [None]:
# Create a DataLoader for each set
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

dev_data = TensorDataset(dev_inputs, dev_masks, dev_labels)
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## Train base model

In [None]:
# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Set the device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(f'\n Selected device to run: {device}')

In [None]:
# Small evaluation function
def evaluate(model, dataloader):
    model.eval()

    total_loss = 0
    total_accuracy = 0

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()

        # Calculate the accuracy for this batch
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        true_labels = labels.cpu().numpy()
        accuracy = accuracy_score(true_labels, predictions)
        total_accuracy += accuracy

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)

    return avg_loss, avg_accuracy

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Set the optimizer and parameter
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2

# Create the SummaryWriter
writer = SummaryWriter()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):        

        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        print(f'Epoch: {epoch+1}, Step: {step+1}, Loss: {loss.item()}')

        # Log the loss and learning rate to TensorBoard
        writer.add_scalar("Loss/train", loss, step)
        for param_group in optimizer.param_groups:
            writer.add_scalar("Learning rate", param_group['lr'], step)

        # Log histograms of all model parameters
        for name, param in model.named_parameters():
            if param.requires_grad:
                writer.add_histogram(name, param.data, step)
       

    avg_train_loss = total_loss / len(train_dataloader)
    writer.add_scalar("Average Loss/train", avg_train_loss, epoch)

    # Evaluate on the validation set and log metrics to TensorBoard
    avg_val_loss, avg_val_accuracy = evaluate(model, dev_dataloader)
    writer.add_scalar("Average Loss/validation", avg_val_loss, epoch)
    writer.add_scalar("Average Accuracy/validation", avg_val_accuracy, epoch)

# After training
trained_model = model
writer.close()

In [None]:
%tensorboard  --logdir=runs --port=6007

In [None]:
print(trained_model)

## Evaluate base model on dev set

In [None]:
# Evaluation on the dev set
trained_model.eval()
dev_predictions = []

with torch.no_grad():
    for batch in dev_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        outputs = trained_model(inputs, attention_mask=masks)
        logits = outputs.logits

        predictions = torch.sigmoid(logits).squeeze().tolist()
        dev_predictions.extend(predictions)

    # Handle the remaining instances
    if len(dev_predictions) < len(dev_labels):
        remaining_instances = len(dev_labels) - len(dev_predictions)
        last_batch_inputs = dev_inputs[-remaining_instances:]
        last_batch_masks = dev_masks[-remaining_instances:]
        last_batch = (last_batch_inputs, last_batch_masks)

        last_batch = tuple(t.to(device) for t in last_batch)

        outputs = trained_model(*last_batch, attention_mask=last_batch[1])
        logits = outputs.logits

        predictions = torch.sigmoid(logits).squeeze().tolist()
        dev_predictions.extend(predictions)

# Calculate evaluation metrics
# Define the thresholds for discretization
thresholds = [1/3, 2/3]

# Discretize the predicted probabilities
discretized_predictions = np.digitize(dev_predictions, thresholds)

# Convert continuous labels to integers
dev_labels_int = np.digitize(dev_labels, thresholds)

# Create the confusion matrix-like representation
num_classes = len(thresholds) + 1  # Number of classes: below threshold, between thresholds, above threshold
cm = np.zeros((num_classes, num_classes))

for true_label, predicted_label in zip(dev_labels_int, discretized_predictions):
    cm[true_label, predicted_label] += 1

# Print the confusion matrix-like representation
print("Confusion Matrix:")
print(cm)

# Get the values from the confusion matrix
TP = cm[1, 1]
FP = cm[0, 1] + cm[2, 1]
FN = cm[1, 0] + cm[1, 2]
TN = cm[0, 0] + cm[0, 2] + cm[2, 0] + cm[2, 2]

# Compute accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)

# Compute precision
precision = TP / (TP + FP)

# Compute recall
recall = TP / (TP + FN)

# Print the metrics
print(f"Accuracy:", accuracy)
print(f"Precision:", precision)
print(f"Recall:", recall)

## Parameter tuning

In [None]:
# Define list of parameters
batch_sizes = [16]
learning_rates = [5e-5, 1e-5]
num_epochs_list = [1, 2]

# Set results dict
results = {
    'batch_size': [],
    'learning_rate': [],
    'num_epochs': [],
    'accuracy': [],
    'precision': [],
    'recall': [],
    'confusion_matrix': []
}

# Iterate over parameter combinations
for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        for num_epochs in num_epochs_list:
            # Train the model with the current hyperparameters
           
            # Load the pre-trained BERT model for sequence classification
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

            # Set the device (GPU if available, else CPU)
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            print(device)
            model = model.to(device)

            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

            for epoch in range(num_epochs):
                model.train()
                for batch in train_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    inputs, masks, labels = batch
                    
                    optimizer.zero_grad()
                    # Forward pass
                    outputs = model(inputs, attention_mask=masks, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits
                    
                    # Backward pass and optimization
                    loss.backward()
                    optimizer.step()

            # Evaluation on dev set
            model.eval()
            dev_predictions = []

            with torch.no_grad():
                for batch in dev_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    inputs, masks, labels = batch

                    outputs = trained_model(inputs, attention_mask=masks)
                    logits = outputs.logits

                    predictions = torch.sigmoid(logits).squeeze().tolist()
                    dev_predictions.extend(predictions)

                # Handle the remaining instances
                if len(dev_predictions) < len(dev_labels):
                    remaining_instances = len(dev_labels) - len(dev_predictions)
                    last_batch_inputs = dev_inputs[-remaining_instances:]
                    last_batch_masks = dev_masks[-remaining_instances:]
                    last_batch = (last_batch_inputs, last_batch_masks)

                    last_batch = tuple(t.to(device) for t in last_batch)

                    outputs = trained_model(*last_batch, attention_mask=last_batch[1])
                    logits = outputs.logits

                    predictions = torch.sigmoid(logits).squeeze().tolist()
                    dev_predictions.extend(predictions)

            # Calculate evaluation metrics
            # Define the thresholds for discretization
            thresholds = [1/3, 2/3]

            # Discretize the predicted probabilities
            discretized_predictions = np.digitize(dev_predictions, thresholds)

            # Convert continuous labels to integers
            dev_labels_int = np.digitize(dev_labels, thresholds)

            # Create the confusion matrix-like representation
            num_classes = len(thresholds) + 1  # Number of classes: below threshold, between thresholds, above threshold
            cm = np.zeros((num_classes, num_classes))

            for true_label, predicted_label in zip(dev_labels_int, discretized_predictions):
                cm[true_label, predicted_label] += 1

            # Print the confusion matrix-like representation
            print("Confusion Matrix:")
            print(cm)

            # Get the values from the confusion matrix
            TP = cm[1, 1]
            FP = cm[0, 1] + cm[2, 1]
            FN = cm[1, 0] + cm[1, 2]
            TN = cm[0, 0] + cm[0, 2] + cm[2, 0] + cm[2, 2]

            # Compute accuracy
            accuracy = (TP + TN) / (TP + TN + FP + FN)

            # Compute precision
            precision = TP / (TP + FP)

            # Compute recall
            recall = TP / (TP + FN)

            # Store the results in the dictionary
            results['batch_size'].append(batch_size)
            results['learning_rate'].append(learning_rate)
            results['num_epochs'].append(num_epochs)
            results['accuracy'].append(accuracy)
            results['precision'].append(precision)
            results['recall'].append(recall)
            results['confusion_matrix'].append(cm)

# Create results df out of results dictionary
results_df = pd.DataFrame(results)

In [None]:
# Display results dataframe
results_df

In [None]:
# Create subplots for each metric
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
metrics = ['accuracy', 'precision', 'recall']
num_epochs = results_df['num_epochs'].unique().tolist()

for i, metric in enumerate(metrics):
    ax = axes[i // 2, i % 2]
    
    # Group the dataframe by batch_size and learning_rate
    grouped_df = results_df.groupby(['batch_size', 'learning_rate'])
    
    # Iterate over the unique combinations
    for (bs, lr), group in grouped_df:
        # Get the metric values for the current combination
        metric_values = group[metric].values
        
        # Plot the metric values as a line
        ax.plot(num_epochs, metric_values, marker='o', label=f"Batch sizes={bs}, LR={lr}")

    ax.set_xticks(num_epochs)
    ax.set_xticklabels(num_epochs)
    ax.set_xlabel("Number of epochs")
    ax.set_ylabel(metric.capitalize())
    ax.set_title(metric.capitalize())
    ax.legend()

plt.tight_layout()
plt.show()


## Train final model

In [None]:
# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Set the device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)

# Set the optimizer and parameter
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 3
batch_size = 16

In [None]:
train_loss_values = []  # List to store training loss values

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0  # Variable to accumulate the loss for each epoch
    
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        
        optimizer.zero_grad()
        # Forward pass
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()  # Accumulate the loss for the current batch
        
    # Calculate the average training loss for the epoch
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    
    train_loss_values.append(avg_epoch_loss)  # Store the training loss value for the epoch
    
    # Print the training loss for the epoch
    print(f"Epoch {epoch+1} - Training Loss: {avg_epoch_loss:.4f}")

# After training
final_model = model


In [None]:
print(final_model)

## Evaluate final model

In [None]:
# Evaluation on the test set
final_model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        outputs = final_model(inputs, attention_mask=masks)
        logits = outputs.logits

        predictions = torch.sigmoid(logits).squeeze().tolist()
        test_predictions.extend(predictions)

    # Handle the remaining instances
    if len(test_predictions) < len(test_labels):
        remaining_instances = len(test_labels) - len(test_predictions)
        last_batch_inputs = test_inputs[-remaining_instances:]
        last_batch_masks = test_masks[-remaining_instances:]
        last_batch = (last_batch_inputs, last_batch_masks)

        last_batch = tuple(t.to(device) for t in last_batch)

        outputs = final_model(*last_batch, attention_mask=last_batch[1])
        logits = outputs.logits

        predictions = torch.sigmoid(logits).squeeze().tolist()
        test_predictions.extend(predictions)

# Calculate evaluation metrics
# Define the thresholds for discretization
thresholds = [1/3, 2/3]

# Discretize the predicted probabilities
discretized_predictions = np.digitize(test_predictions, thresholds)

# Convert continuous labels to integers
test_labels_int = np.digitize(test_labels, thresholds)

# Create the confusion matrix-like representation
num_classes = len(thresholds) + 1  # Number of classes: below threshold, between thresholds, above threshold
cm = np.zeros((num_classes, num_classes))

for true_label, predicted_label in zip(test_labels_int, discretized_predictions):
    cm[true_label, predicted_label] += 1

# Print the confusion matrix-like representation
print("Confusion Matrix:")
print(cm)

# Get the values from the confusion matrix
TP = cm[1, 1]
FP = cm[0, 1] + cm[2, 1]
FN = cm[1, 0] + cm[1, 2]
TN = cm[0, 0] + cm[0, 2] + cm[2, 0] + cm[2, 2]

# Compute accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)

# Compute precision
precision = TP / (TP + FP)

# Compute recall
recall = TP / (TP + FN)

# Print the metrics
print(f"Accuracy:", accuracy)
print(f"Precision:", precision)
print(f"Recall:", recall)

## Annotate sentiments with final prediction model

In [None]:
# Define the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the sentences and convert them to input features
def tokenize_sentences(sentences):
    input_ids = []
    attention_masks = []
    
    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=100,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

def make_predictions(tokenized_sentences):
  inputs, masks = tokenized_sentences

  # Prepare the data for model input
  inputs = inputs.to(device)
  masks = masks.to(device)

  # Evaluate the model on the dataframe
  final_model.eval()
  sentiment_predictions = []

  with torch.no_grad():
      for i in range(len(inputs)):
          input_ids = inputs[i].unsqueeze(0)
          attention_mask = masks[i].unsqueeze(0)
          
          outputs = final_model(input_ids, attention_mask=attention_mask)
          logits = outputs.logits
          
          predictions = torch.sigmoid(logits).squeeze().tolist()
          sentiment_predictions.append(predictions)
  return sentiment_predictions

subset_df['tokenized_sentences'] = subset_df['sentence_tokens'].apply(lambda x: tokenize_sentences(x))
subset_df['sentiments'] = subset_df['tokenized_sentences'].apply(lambda x: make_predictions(x))
subset_df.head()

## Compare internal vs. external

In [None]:
# Compute the average of each list in the 'sentiments' column
subset_df['sentiments_avg'] = subset_df['sentiments'].apply(lambda x: np.mean(x))

# Create two separate dataframes for internal values 0 and 1
subset_internal_0 = subset_df[subset_df['internal'] == 0]
subset_internal_1 = subset_df[subset_df['internal'] == 1]

# Create boxplots for average sentiments grouped by internal values
plt.figure(figsize=(8, 6))
sns.boxplot(x='internal', y='sentiments_avg', data=subset_df)
plt.xlabel('Internal')
plt.ylabel('Average Sentiments')
plt.title('Boxplot of Average Sentiments by Internal')
plt.show()

# Create histograms for average sentiments grouped by internal values
plt.figure(figsize=(8, 6))
sns.histplot(data=subset_df, x='sentiments_avg', hue='internal', element='step', bins=10, alpha=0.5, legend=True)
plt.xlabel('Average Sentiments')
plt.ylabel('Frequency')
plt.title('Histogram of Average Sentiments by Internal')
plt.show()
