In [5]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

# Read the spreadsheet that holds the questions and their disciplines
df = pd.read_excel('/content/dataset_final.xlsx')

# Show which columns the file actually contains before selecting any
print("Columns in the dataset:", df.columns)

# Keep only the text column ('Question') and the label column ('Discipline'),
# discarding every row where either of them is missing.
# Replace these two names with your actual column names if they differ.
df = df[['Question', 'Discipline']].dropna()

# Map each discipline name to an integer id stored in a new 'label' column
label_encoder = LabelEncoder()
df = df.assign(label=label_encoder.fit_transform(df['Discipline']))

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap both splits as HuggingFace Dataset objects
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, test_df)
)

# Define function to load models and tokenizers
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for a checkpoint.

    The architecture is chosen from the prefix of ``model_name``: names
    starting with 'bert' use the BERT classes and names starting with
    'roberta' use the RoBERTa classes. The classification head is sized from
    the number of distinct values in the module-level ``df['label']`` column.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        ValueError: If ``model_name`` starts with neither 'bert' nor 'roberta'.
    """
    if model_name.startswith('bert'):
        tokenizer_cls, model_cls = BertTokenizer, BertForSequenceClassification
    elif model_name.startswith('roberta'):
        tokenizer_cls, model_cls = RobertaTokenizer, RobertaForSequenceClassification
    else:
        # Without this branch the function would fall through to the return
        # statement and fail with an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported model name {model_name!r}: "
            "expected a name starting with 'bert' or 'roberta'"
        )

    tokenizer = tokenizer_cls.from_pretrained(model_name)
    model = model_cls.from_pretrained(model_name, num_labels=df['label'].nunique())
    return tokenizer, model

# Tokenization Function
def tokenize_function(examples, tokenizer):
    """Tokenize the 'Question' entry of ``examples`` with ``tokenizer``.

    Args:
        examples: Mapping that holds the question text under the key 'Question'.
        tokenizer: Callable tokenizer; it is invoked with padding set to
            'max_length' and truncation enabled.

    Returns:
        Whatever the tokenizer call returns.
    """
    questions = examples['Question']
    encoding_options = {'padding': 'max_length', 'truncation': True}
    return tokenizer(questions, **encoding_options)

# Define function for metrics (Accuracy, Precision, Recall, F1)
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference class ids).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    predicted_ids = p.predictions.argmax(axis=-1)
    reference_ids = p.label_ids

    scores = {'accuracy': accuracy_score(reference_ids, predicted_ids)}

    # The fourth element of the result (support) is not reported, so zip()
    # against three names simply drops it.
    weighted = precision_recall_fscore_support(
        reference_ids, predicted_ids, average='weighted'
    )
    scores.update(zip(('precision', 'recall', 'f1'), weighted))
    return scores

# Define function to train and evaluate the model
def train_and_evaluate(model_name, batch_size=8, learning_rate=2e-5, epochs=3):
    """Fine-tune a checkpoint on the training split and evaluate it on the test split.

    Reads the module-level ``train_dataset`` and ``test_dataset``, tokenizes
    them with the tokenizer belonging to ``model_name``, trains with the
    HuggingFace ``Trainer`` and prints the final evaluation metrics.

    Args:
        model_name: Checkpoint identifier passed to ``load_model_and_tokenizer``.
        batch_size: Per-device batch size used for both training and evaluation.
        learning_rate: Learning rate given to ``TrainingArguments``.
        epochs: Number of training epochs.

    Returns:
        The metrics dictionary returned by ``trainer.evaluate()``, so that
        callers can compare runs programmatically instead of reading the
        printed output.
    """
    # Load model and tokenizer
    tokenizer, model = load_model_and_tokenizer(model_name)

    # Tokenize the datasets
    train_tokenized = train_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    test_tokenized = test_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)

    # Rename the integer 'label' column to 'labels' (the column name selected
    # for the PyTorch format below)
    train_tokenized = train_tokenized.rename_column('label', 'labels')
    test_tokenized = test_tokenized.rename_column('label', 'labels')

    # Set format for PyTorch (ensure 'labels' is included)
    target_columns = ['input_ids', 'attention_mask', 'labels']
    train_tokenized.set_format(type='torch', columns=target_columns)
    test_tokenized.set_format(type='torch', columns=target_columns)

    # Define Training Arguments
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the installed
    # version before upgrading.
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        report_to="none"  # Avoid logging to Weights & Biases for now
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print(f"Evaluation Results for {model_name}:")
    print(eval_results)  # Print out the evaluation results for comparison

    # Hand the metrics back to the caller rather than discarding them
    return eval_results

# Experiment with different hyperparameters and models
# Each entry is (banner to print, checkpoint name, batch size, learning rate).
# The first two runs use the baseline settings; the last two are the
# hyperparameter tuning examples (a lower learning rate for BERT, a larger
# batch size for RoBERTa). You can experiment with different learning rates,
# batch sizes, and epochs by adding entries here.
experiment_runs = [
    ("Training BERT model...", 'bert-base-uncased', 8, 2e-5),
    ("\nTraining RoBERTa model...", 'roberta-base', 8, 2e-5),
    ("\nTraining BERT with a different learning rate (1e-5)...", 'bert-base-uncased', 8, 1e-5),
    ("\nTraining RoBERTa with a different batch size (16)...", 'roberta-base', 16, 2e-5),
]

for banner, checkpoint, run_batch_size, run_learning_rate in experiment_runs:
    print(banner)
    train_and_evaluate(
        checkpoint,
        batch_size=run_batch_size,
        learning_rate=run_learning_rate,
        epochs=3,
    )


Columns in the dataset: Index(['Discipline', 'Question', 'Reason you think it is faulty',
       'Which top LLM you tried ', 'Response by a top LLM '],
      dtype='object')
Training BERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3523 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.106109,0.972758,0.973133,0.972758,0.972652
2,0.638100,0.022496,0.996595,0.996635,0.996595,0.996595
3,0.045900,0.008076,0.996595,0.996712,0.996595,0.996595


Evaluation Results for bert-base-uncased:
{'eval_loss': 0.00807610247284174, 'eval_accuracy': 0.996594778660613, 'eval_precision': 0.9967122000861091, 'eval_recall': 0.996594778660613, 'eval_f1': 0.996594778660613, 'eval_runtime': 24.5358, 'eval_samples_per_second': 35.907, 'eval_steps_per_second': 4.524, 'epoch': 3.0}

Training RoBERTa model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3523 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.12377,0.975028,0.975289,0.975028,0.974973
2,0.474500,0.031269,0.994325,0.994495,0.994325,0.994325
3,0.073400,0.026152,0.992054,0.992068,0.992054,0.992042


Evaluation Results for roberta-base:
{'eval_loss': 0.02615249902009964, 'eval_accuracy': 0.9920544835414302, 'eval_precision': 0.9920684967979709, 'eval_recall': 0.9920544835414302, 'eval_f1': 0.9920419204634546, 'eval_runtime': 22.4536, 'eval_samples_per_second': 39.237, 'eval_steps_per_second': 4.944, 'epoch': 3.0}

Training BERT with a different learning rate (1e-5)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3523 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.231522,0.965948,0.968083,0.965948,0.965728
2,1.016400,0.074031,0.985244,0.986005,0.985244,0.985224
3,0.103300,0.039929,0.992054,0.992451,0.992054,0.992051


Evaluation Results for bert-base-uncased:
{'eval_loss': 0.03992909938097, 'eval_accuracy': 0.9920544835414302, 'eval_precision': 0.9924511059372279, 'eval_recall': 0.9920544835414302, 'eval_f1': 0.9920512131694477, 'eval_runtime': 24.4949, 'eval_samples_per_second': 35.967, 'eval_steps_per_second': 4.532, 'epoch': 3.0}

Training RoBERTa with a different batch size (16)...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3523 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.115379,0.969353,0.97082,0.969353,0.969288
2,No log,0.045972,0.990919,0.991284,0.990919,0.990917
3,0.367200,0.015079,0.996595,0.996635,0.996595,0.996595


Evaluation Results for roberta-base:
{'eval_loss': 0.015079336240887642, 'eval_accuracy': 0.996594778660613, 'eval_precision': 0.9966354995355018, 'eval_recall': 0.996594778660613, 'eval_f1': 0.9965951798214873, 'eval_runtime': 22.9924, 'eval_samples_per_second': 38.317, 'eval_steps_per_second': 2.436, 'epoch': 3.0}
