In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

# Load the dataset from CSV file
df = pd.read_csv('output (3).csv')

# Extract post_text and annotations columns
X = df['post_text']
y = df.iloc[:, 3:]  # Assuming annotations start from the 4th column

# Initialize label dictionary for encoding
label_dict = {'yes': 1, 'no': 0}

# Initialize lists to store evaluation metrics for each annotation
metrics_per_annotation = []
classified_annotations = {}

# Loop through each annotation column
for annotation in y.columns:
    print(f"\nProcessing annotation: {annotation}")

    # Extract labels for the current annotation
    labels = y[annotation]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Load the pre-trained BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the text data and convert to input IDs
    X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
    X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

    # Create DataLoader for training and testing data
    train_data = TensorDataset(X_train_encoded['input_ids'], X_train_encoded['attention_mask'], torch.tensor(y_train.values))
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

    test_data = TensorDataset(X_test_encoded['input_ids'], X_test_encoded['attention_mask'], torch.tensor(y_test.values))
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

    # Load pre-trained BERT model for sequence classification
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Set optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    # Train the model
    model.train()
    for epoch in range(3):  # Number of epochs
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch"):
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        scheduler.step()
        print(f"Average training loss: {total_loss/len(train_dataloader)}")

    # Evaluate the model
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating", unit="batch"):
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            y_pred.extend(preds.cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    # Print evaluation metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("ROC AUC Score:", roc_auc)

    # Store metrics in the list
    metrics_per_annotation.append({
        'Annotation': annotation,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Classify annotations based on metrics
    # Classify as 'yes' if F1 score is above a threshold, else 'no'
    if f1 > 0.5:  # You can adjust the threshold as needed
        classified_annotations[annotation] = 'yes'
    else:
        classified_annotations[annotation] = 'no'

# Convert the list of dictionaries to a DataFrame
metrics_df = pd.DataFrame(metrics_per_annotation)

# Convert the classified_annotations dictionary to a DataFrame
classified_annotations_df = pd.DataFrame(classified_annotations.items(), columns=['Annotation', 'Classification'])

# Save the classified annotations to a CSV file
classified_annotations_df.to_csv('classified_annotations.csv', index=False)

# Save the metrics to a CSV file
metrics_df.to_csv('annotation_metrics.csv', index=False)



Processing annotation: a1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 201/201 [02:29<00:00,  1.34batch/s]


Average training loss: 0.395648061163464


Epoch 2: 100%|██████████| 201/201 [02:36<00:00,  1.29batch/s]


Average training loss: 0.31079731496579166


Epoch 3: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.29989677454479297


Evaluating: 100%|██████████| 51/51 [00:14<00:00,  3.63batch/s]


Accuracy: 0.8204488778054863
Precision: 0.8204488778054863
Recall: 1.0
F1 Score: 0.9013698630136986
ROC AUC Score: 0.5

Processing annotation: a2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 201/201 [02:37<00:00,  1.28batch/s]


Average training loss: 0.6358251365559611


Epoch 2: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.5616839218198957


Epoch 3: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.5066322729510454


Evaluating: 100%|██████████| 51/51 [00:14<00:00,  3.63batch/s]


Accuracy: 0.71571072319202
Precision: 0.6571428571428571
Recall: 0.46938775510204084
F1 Score: 0.5476190476190477
ROC AUC Score: 0.663827735818737

Processing annotation: Feeling-bad-about-yourself-or-that-you-are-a-failure-or-have-let-yourself-or-your-family-down


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.44639296536866707


Epoch 2: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.39734600050680674


Epoch 3: 100%|██████████| 201/201 [02:36<00:00,  1.29batch/s]


Average training loss: 0.37908252417596416


Evaluating: 100%|██████████| 51/51 [00:13<00:00,  3.65batch/s]


Accuracy: 0.8428927680798005
Precision: 0.8428927680798005
Recall: 1.0
F1 Score: 0.9147496617050067
ROC AUC Score: 0.5

Processing annotation: Little-interest-or-pleasure-in-doing 


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.019095801759339795


Epoch 2: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.0008776006705836574


Epoch 3: 100%|██████████| 201/201 [02:36<00:00,  1.28batch/s]


Average training loss: 0.0007995240380454678


Evaluating: 100%|██████████| 51/51 [00:14<00:00,  3.63batch/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
metrics_per_annotation


[{'Annotation': 'Feeling-bad-about-yourself-or-that-you-are-a-failure-or-have-let-yourself-or-your-family-down',
  'Accuracy': 0.8503740648379052,
  'Precision': 0.8564102564102564,
  'Recall': 0.9881656804733728,
  'F1 Score': 0.9175824175824177,
  'ROC AUC': 0.5496383957922419}]