In [None]:
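# Intended setup: use the Excerpt for training and the [top n paragraphs] for the validation and test data.
# NOTE: as written in this cell, all three splits are built from the top-n paragraphs; the excerpt-based loops are commented out.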
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from datasets import Dataset
from collections import defaultdict
import scipy.special
import optuna
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report

import gc
import torch
# Start from a clean slate when the cell is re-run: collect unreachable Python
# objects, then ask PyTorch to release the GPU memory its allocator is caching.
gc.collect()
torch.cuda.empty_cache()

# NOTE(review): presumably a debugging aid (synchronous CUDA launches so errors
# surface at the failing call). It is set after torch has been imported —
# confirm it still takes effect this late in the process.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Gold standard: each row carries one or more DocIDs (", "-separated), the
# 'IDEA CAREER 3' label and the annotated 'Excerpt'.
df = pd.read_excel('gold_standard.xlsx')

# Highest-scoring paragraphs: one row per (DocID, Paragraph) pair.
paragraph_df = pd.read_csv('highest_scoring_paragraphs.csv')

# DocID -> every paragraph listed for that document, in file order.
docid_to_paragraph = defaultdict(list)
for _, para_row in paragraph_df.iterrows():
    raw_id = str(para_row['DocID'])
    # Six-character ids get a leading zero so they line up with the
    # seven-character ids used as keys elsewhere.
    padded_id = '0' + raw_id if len(raw_id) == 6 else raw_id
    docid_to_paragraph[padded_id].append(para_row['Paragraph'])

# DocID -> label and DocID -> excerpt; a row listing several DocIDs shares its
# label and excerpt with each of them.
docid_to_label = {}
docid_to_excerpt = {}
for _, gold_row in df.iterrows():
    for id_piece in str(gold_row['DocID']).split(', '):
        key = str(id_piece.strip())
        docid_to_label[key] = gold_row['IDEA CAREER 3']
        docid_to_excerpt[key] = gold_row['Excerpt']

# One record per labeled gold-standard article: {'doc_id', 'label', 'text'}.
batch_sentences = []

results_list_temp = []

# Path to the folder containing XML files
folder_path = "gold_standard_files"

# Sorted so the resulting record order does not depend on the filesystem.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".xml"):
        continue

    # The DocID is the part of the filename before the first dot.
    doc_id = str(file_name.split(".")[0])

    # Only labeled documents are kept, so skip the rest before parsing.
    label = docid_to_label.get(doc_id, None)
    if label is None:
        continue

    tree = ET.parse(os.path.join(folder_path, file_name))
    root = tree.getroot()

    # A missing <title> element, or one without text, contributes nothing.
    title_node = root.find(".//title")
    title = title_node.text if title_node is not None else None

    # Title first, then the leading text of every <p> element.
    article_content = [title]
    for paragraph in root.findall(".//p"):
        article_content.append(paragraph.text)

    # Element.text is None for empty elements and for elements that start
    # with a child tag; joining a None would raise TypeError, so drop them.
    combined_text = " ".join(part for part in article_content if part is not None)

    # Labels are stored 1-based in the gold standard; the model uses 0-based ids.
    adjusted_label = label - 1
    batch_sentences.append({'doc_id': doc_id, 'label': adjusted_label, 'text': combined_text})

# Cased BERT tokenizer used for every split.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is padded to, and truncated at, 512 tokens, and the
    tokenizer is asked for PyTorch tensors.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

# Logits of the most recent evaluation, stashed by compute_metrics_with_logits.
global_logits = None


def compute_metrics_with_logits(eval_pred):
    """Compute aggregate and per-class metrics and remember the logits.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the class count is taken from
            the last axis of ``logits``.

    Returns:
        dict with ``f1_score``, ``precision`` and ``recall`` (unweighted mean
        over the classes that occur in the labels or the predictions),
        ``accuracy``, and ``<metric>_class_<i>`` for every class id ``i``.

    Side effects:
        Stores ``logits`` in the module-level ``global_logits`` and adds the
        per-class scores to ``class_metric_sums`` / ``class_counts``.
    """
    global global_logits

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Store the logits for later use
    global_logits = logits

    # Pin the label set to every class id. Without ``labels=`` the per-class
    # arrays only cover the classes that happen to occur, so position i would
    # stop meaning "class i" whenever a class is missing from a split.
    class_ids = list(range(np.shape(logits)[-1]))
    score_kwargs = {"labels": class_ids, "average": None, "zero_division": 0}
    f1_scores = f1_score(labels, predictions, **score_kwargs).tolist()
    precisions = precision_score(labels, predictions, **score_kwargs).tolist()
    recalls = recall_score(labels, predictions, **score_kwargs).tolist()

    # Classes present in the labels or the predictions. Aggregates and running
    # sums only use these, so a class that is absent from a split does not
    # drag the averages down with an artificial zero.
    observed = set(np.union1d(labels, predictions).tolist())
    observed_ids = [class_id for class_id in class_ids if class_id in observed]

    for class_id in observed_ids:
        class_metric_sums[class_id]["f1_score"] += f1_scores[class_id]
        class_metric_sums[class_id]["precision"] += precisions[class_id]
        class_metric_sums[class_id]["recall"] += recalls[class_id]
        class_counts[class_id] += 1

    def _mean_over_observed(per_class_values):
        picked = [per_class_values[class_id] for class_id in observed_ids]
        return sum(picked) / len(picked) if picked else 0.0

    metrics = {
        "f1_score": _mean_over_observed(f1_scores),
        "precision": _mean_over_observed(precisions),
        "recall": _mean_over_observed(recalls),
        "accuracy": accuracy_score(labels, predictions),
    }

    # Per-class entries are keyed by the real class id.
    for class_id in class_ids:
        metrics[f"f1_score_class_{class_id}"] = f1_scores[class_id]
        metrics[f"precision_class_{class_id}"] = precisions[class_id]
        metrics[f"recall_class_{class_id}"] = recalls[class_id]

    return metrics

# Directory holding one CSV file per cross-validation fold.
fold_folder = "doc_id_5_fold"

# Document-level scores, one entry appended per fold.
f1_scores, precisions, recalls, accuracies = [], [], [], []

# Running per-class sums and counts maintained by the metrics callback.
class_metric_sums = defaultdict(lambda: defaultdict(float))
class_counts = defaultdict(int)

# One row per test document, accumulated across all folds.
results_list = []

# Per-class score history for the three classes, one value appended per fold.
class_metrics = {
    class_id: {metric_name: [] for metric_name in ("precision", "recall", "f1-score")}
    for class_id in range(3)
}

# Cross-validation: train and evaluate one model per CSV file in the fold folder.

def _read_fold_ids(fold_path):
    """Split a fold file into (train_ids, val_ids, test_ids).

    The file holds one DocID per line. The first blank line ends the training
    ids, the second blank line ends the validation ids, and everything after
    it belongs to the test split. Blank entries (for example trailing empty
    lines) are dropped so they never reach the label lookups.

    Raises:
        ValueError: if the file has fewer than two blank separator lines.
    """
    with open(fold_path, 'r') as f:
        lines = [line.strip() for line in f]

    separators = [i for i, line in enumerate(lines) if not line]
    if len(separators) < 2:
        raise ValueError(
            f"{fold_path}: expected two blank lines separating train/validation/test ids, "
            f"found {len(separators)}"
        )
    first, second = separators[0], separators[1]
    sections = (lines[:first], lines[first + 1:second], lines[second + 1:])
    return tuple([doc_id for doc_id in section if doc_id] for section in sections)


def _build_rows(doc_ids, use_excerpt=False):
    """Build {'doc_id', 'label', 'text'} rows for the given documents.

    By default every top-scoring paragraph of a document becomes one row.
    With ``use_excerpt=True`` the gold-standard excerpt is used instead, giving
    at most one row per document. Documents without a label are skipped, and
    so are texts that are not strings (e.g. NaN cells read by pandas), because
    the tokenizer cannot handle them.
    """
    rows = []
    for doc_id in doc_ids:
        label = docid_to_label.get(doc_id, None)
        if label is None:
            continue
        if use_excerpt:
            texts = [docid_to_excerpt.get(doc_id, None)]
        else:
            texts = docid_to_paragraph.get(doc_id, [])
        for text in texts:
            if isinstance(text, str):
                # Gold labels are 1-based; the model expects 0-based class ids.
                rows.append({'doc_id': doc_id, 'label': label - 1, 'text': text})
    return rows


def objective(trial):
    """Optuna objective: train on the current fold and return the validation loss.

    Reads the module-level ``model``, ``train_dataset`` and ``val_dataset`` set
    by the fold loop. No hyperparameter is sampled from ``trial`` yet; the
    search space below is still commented out.
    """
    # Define a search space
    #learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    #num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    #per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32])
    #weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True)
    #warmup_steps = trial.suggest_int("warmup_steps", 0, 300)

    training_args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_with_logits
    )

    trainer.train()
    eval_results = trainer.evaluate()

    # Return the metric that you want to optimize
    return eval_results["eval_loss"]


# Sorted so folds are processed in the same order on every run.
for fold_file in sorted(os.listdir(fold_folder)):
    if fold_file.endswith(".csv"):

        train_ids, val_ids, test_ids = _read_fold_ids(os.path.join(fold_folder, fold_file))

        # All three splits use the top-scoring paragraphs; pass
        # use_excerpt=True to build a split from the gold excerpts instead.
        train_data = _build_rows(train_ids)
        val_data = _build_rows(val_ids)
        test_data = _build_rows(test_ids)

        if not (train_data and val_data and test_data):
            print(f"Skipping fold {fold_file}: empty train/validation/test split")
            continue

        # A fresh model per fold so no weights leak between folds.
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)

        # NOTE(review): every fold writes checkpoints to the same output_dir —
        # confirm that reusing it across folds is intended.
        training_args = TrainingArguments(
            output_dir="test_trainer",
            evaluation_strategy="epoch",
            learning_rate=1e-5,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=25,
            #warmup_steps=50,
            #weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=500,
            save_strategy="epoch",
            load_best_model_at_end=True,
            save_total_limit=3,
        )

        # Convert data to datasets and tokenize them
        train_dataset = Dataset.from_pandas(pd.DataFrame(train_data)).map(tokenize_function, batched=True)
        val_dataset = Dataset.from_pandas(pd.DataFrame(val_data)).map(tokenize_function, batched=True)
        test_dataset = Dataset.from_pandas(pd.DataFrame(test_data)).map(tokenize_function, batched=True)

        # Hyperparameter search is currently disabled. To enable it:
        # study = optuna.create_study(direction="minimize")
        # study.optimize(objective, n_trials=10)
        # print(f"Best hyperparameters for fold {fold_file}: {study.best_params}")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics_with_logits,
        )
        # load_best_model_at_end=True leaves the best checkpoint loaded in
        # trainer.model after training, so no separate reload is needed.
        trainer.train()

        # predict() returns the logits directly; the "eval" prefix keeps the
        # metric names identical to those of evaluate().
        test_output = trainer.predict(test_dataset, metric_key_prefix="eval")
        eval_results = test_output.metrics

        logits = np.asarray(test_output.predictions, dtype=float)
        probabilities = scipy.special.softmax(logits, axis=1)
        confidence_scores = np.max(probabilities, axis=1)
        predicted_classes = np.argmax(probabilities, axis=1)

        print("Results for this fold:")
        for key, value in eval_results.items():
            print(f"{key}: {value}")
        print("-" * 50)

        # Document-level prediction = the paragraph prediction with the highest
        # confidence among all paragraphs of that document.
        docid_to_max_conf = {}
        docid_to_pred = {}
        fold_results = {}  # doc_id -> this fold's row in results_list

        for i, (doc_id, value, pred) in enumerate(zip(test_dataset['doc_id'], confidence_scores, predicted_classes)):
            true_label = docid_to_label.get(doc_id, None)
            if true_label is None:
                continue
            true_label -= 1  # Adjusting the label

            if doc_id not in docid_to_max_conf:
                # First paragraph seen for this document in this fold.
                row = {
                    'DocID': doc_id,
                    'True Label': true_label,
                    'Predicted Label': pred,
                    'Confidence Score': value
                }
                results_list.append(row)
                fold_results[doc_id] = row
                docid_to_max_conf[doc_id] = value
                docid_to_pred[doc_id] = pred
                continue

            # NOTE(review): misclassified paragraphs are only reported for the
            # second and later paragraphs of a document — confirm intended.
            if true_label != pred:
                print('Document ID: ', doc_id, ', True Label: ', true_label, ', Predicted Label: ', pred)
                for j, class_prob in enumerate(probabilities[i]):
                    print(f"  Class {j} Confidence Score: {class_prob:.4f}")
                print()

            if value > docid_to_max_conf[doc_id]:
                # A more confident paragraph replaces the document's prediction.
                fold_results[doc_id]['Predicted Label'] = pred
                fold_results[doc_id]['Confidence Score'] = value
                docid_to_max_conf[doc_id] = value
                docid_to_pred[doc_id] = pred

        # Score only documents that actually received a prediction; ids without
        # a label or paragraphs (and the 'Unknown' placeholder) are left out.
        scored_ids = [doc_id for doc_id in test_ids if doc_id != 'Unknown' and doc_id in docid_to_pred]
        skipped_ids = [doc_id for doc_id in test_ids if doc_id != 'Unknown' and doc_id not in docid_to_pred]
        if skipped_ids:
            print(f"Fold {fold_file}: {len(skipped_ids)} test document(s) without a prediction were skipped: {skipped_ids}")
        if not scored_ids:
            print(f"Skipping metrics for fold {fold_file}: no test document received a prediction")
            continue

        final_preds = [docid_to_pred[doc_id] for doc_id in scored_ids]
        final_true_labels = [docid_to_label[doc_id] - 1 for doc_id in scored_ids]

        # Compute the classification report
        report = classification_report(final_true_labels, final_preds, zero_division=0)
        print("Classification Report for Fold:", fold_file)
        print(report)
        report_dict = classification_report(final_true_labels, final_preds, output_dict=True, zero_division=0)

        for class_label, class_scores in report_dict.items():
            if class_label.isdigit():  # Skip the aggregate rows (accuracy, macro avg, ...)
                class_metrics[int(class_label)]["precision"].append(class_scores["precision"])
                class_metrics[int(class_label)]["recall"].append(class_scores["recall"])
                class_metrics[int(class_label)]["f1-score"].append(class_scores["f1-score"])

        # Compute the confusion matrix
        cm = confusion_matrix(final_true_labels, final_preds)

        # Display the confusion matrix using Seaborn's heatmap
        plt.figure(figsize=(10,7))
        sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
        plt.xlabel('Predicted labels')
        plt.ylabel('True labels')
        plt.title('Confusion Matrix for Fold: ' + fold_file)
        plt.show()

        # Document-level, support-weighted scores for this fold.
        results = {
            "f1_score": f1_score(final_true_labels, final_preds, average='weighted', zero_division=0),
            "precision": precision_score(final_true_labels, final_preds, average='weighted', zero_division=0),
            "recall": recall_score(final_true_labels, final_preds, average='weighted', zero_division=0),
            "accuracy": accuracy_score(final_true_labels, final_preds),
        }

        # Append metric values for this fold
        f1_scores.append(results["f1_score"])
        precisions.append(results["precision"])
        recalls.append(results["recall"])
        accuracies.append(results["accuracy"])

        print(results)

# Calculate average metric values
avg_f1_score = np.mean(f1_scores)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_accuracy = np.mean(accuracies)

print("Average F1-score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Accuracy:", avg_accuracy)

# Calculate and print the average metrics for each class
print("Average Metrics Per Class:")
for class_label, metrics in class_metrics.items():
    avg_precision = np.mean(metrics["precision"])
    avg_recall = np.mean(metrics["recall"])
    avg_f1 = np.mean(metrics["f1-score"])
    print(f"Class {class_label} - Precision: {avg_precision}, Recall: {avg_recall}, F1-score: {avg_f1}")
    
print("Final Results:")
for result in results_list:
    print(f"DocID: {result['DocID']}, True Label: {result['True Label']}, Predicted Label: {result['Predicted Label']}, Confidence Score: {result['Confidence Score']}")

2023-10-30 13:29:19.847039: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-30 13:29:20.413825: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-30 13:29:20.413868: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-30 13:29:20.413899: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-30 13:29:20.448064: I tensorflow/core/platform/cpu_feature_g

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1 Score,Precision,Recall,Accuracy,F1 Score Class 0,Precision Class 0,Recall Class 0,F1 Score Class 1,Precision Class 1,Recall Class 1,F1 Score Class 2,Precision Class 2,Recall Class 2
1,No log,1.016426,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,No log,1.028372,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,No log,1.043193,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,No log,1.066606,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,No log,1.120517,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,No log,1.173663,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,No log,1.185951,0.22619,0.175926,0.316667,0.513514,0.678571,0.527778,0.95,0.0,0.0,0.0,0.0,0.0,0.0
8,No log,1.2277,0.233918,0.18018,0.333333,0.540541,0.701754,0.540541,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,No log,1.284613,0.209877,0.166667,0.283333,0.459459,0.62963,0.5,0.85,0.0,0.0,0.0,0.0,0.0,0.0
10,No log,1.302755,0.209877,0.166667,0.283333,0.459459,0.62963,0.5,0.85,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
        # Excerpt-based variant of the split construction: each labeled
        # document contributes a single row whose text is its gold-standard
        # excerpt (the paragraph-based variant yields one row per paragraph).
        train_data, val_data, test_data = [], [], []
        for split_ids, split_rows in (
            (train_ids, train_data),
            (val_ids, val_data),
            (test_ids, test_data),
        ):
            for doc_id in split_ids:
                excerpt = docid_to_excerpt.get(doc_id)
                label = docid_to_label.get(doc_id)
                # Documents missing either a label or an excerpt are left out.
                if label is None or excerpt is None:
                    continue
                adjusted_label = label - 1  # gold labels are 1-based
                split_rows.append({'doc_id': doc_id, 'label': adjusted_label, 'text': excerpt})