In [1]:
#1. Importing the libraries and modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import gradio as gr
from datasets import Dataset

In [None]:
# 2. Load and Clean the Dataset
data = pd.read_csv('train.csv')  

def clean_text(text):
    return text.lower().strip()

data['cleaned_text'] = data['text'].apply(clean_text)
data['cleaned_reason'] = data['reason'].apply(clean_text)

# 3. Split the Dataset
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# 4. Convert to Hugging Face Dataset Format
train_dataset = Dataset.from_pandas(train_data[['cleaned_text', 'cleaned_reason', 'label']])
val_dataset = Dataset.from_pandas(val_data[['cleaned_text', 'cleaned_reason', 'label']])

# 5. Load Tokenizer and Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    return tokenizer(example['cleaned_text'], example['cleaned_reason'],
                     padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 6. Format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])

# 7. Load Pre-trained Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 8. Define Metrics Function
def compute_metrics(p):
    preds = torch.argmax(torch.tensor(p.predictions), axis=1)
    labels = torch.tensor(p.label_ids)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# 9. Training Arguments
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",    
    save_strategy="epoch",    
    logging_dir="./logs",
    logging_steps=10,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


# 11. Train the Model
trainer.train()

# 12. Evaluate and Show Metrics
results = trainer.evaluate()
print("Evaluation Results:", results)

# Optional: Confusion Matrix
predictions = trainer.predict(val_dataset)
conf_matrix = confusion_matrix(predictions.label_ids, predictions.predictions.argmax(axis=1))
print("Confusion Matrix:\n", conf_matrix)

# 13. Gradio Interface
def predict_feedback(text, reason):
    inputs = tokenizer(text, reason, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Aligned ✅" if prediction == 1 else "Not Aligned ❌"

interface = gr.Interface(fn=predict_feedback,
                         inputs=["text", "text"],
                         outputs="text",
                         title="My Zoom Feedback Validator")

interface.launch(share=True)

Map:   0%|          | 0/1648 [00:00<?, ? examples/s]

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0001,8.7e-05,1.0,1.0,1.0,1.0
2,0.0001,5.5e-05,1.0,1.0,1.0,1.0
3,0.0001,4.8e-05,1.0,1.0,1.0,1.0




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,2.6e-05,1.0,1.0,1.0,1.0


