In [3]:
import json
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score

# Compare two extractive question-answering models on a local QA dataset.
# Every sample must provide the "question", "context" and "answer" keys read
# in the loop below. Scoring is strict exact match after stripping surrounding
# whitespace and lower-casing, so partially overlapping answers count as wrong.

# Load your QA dataset
qa_data_path = "/Users/vivekmandal/Desktop/Ramu/aiproject/datacleaning/qa_data.json"
# JSON text is UTF-8 by specification. Pin the encoding so the file is decoded
# the same way on every platform instead of depending on the locale default.
with open(qa_data_path, "r", encoding="utf-8") as f:
    qa_dataset = json.load(f)

# Use first 100 samples for faster comparison (adjust as needed)
# NOTE(review): slicing assumes the JSON top level is a list of samples.
qa_dataset = qa_dataset[:100]

# Fail early and clearly: with no samples the result table would have no
# columns and the accuracy lookup further down would die with a bare KeyError,
# after both models had already been loaded for nothing.
if not qa_dataset:
    raise ValueError(f"No QA samples found in {qa_data_path}")

# Load two different QA models
model_1 = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
model_2 = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Collect results: one row per sample with both predictions and whether each
# prediction equals the normalised reference answer.
results = []
for sample in qa_dataset:
    question = sample["question"]
    context = sample["context"]
    true_answer = sample["answer"].strip().lower()

    # Predictions are normalised exactly like the reference answer so the
    # equality checks below compare like with like.
    pred_1 = model_1(question=question, context=context)["answer"].strip().lower()
    pred_2 = model_2(question=question, context=context)["answer"].strip().lower()

    results.append({
        "question": question,
        "true_answer": true_answer,
        "model_1_answer": pred_1,
        "model_2_answer": pred_2,
        "model_1_correct": pred_1 == true_answer,
        "model_2_correct": pred_2 == true_answer,
    })

# Convert to DataFrame
df_results = pd.DataFrame(results)

# Compute Accuracy (the mean of a boolean column is the fraction of True rows)
accuracy_1 = df_results["model_1_correct"].mean()
accuracy_2 = df_results["model_2_correct"].mean()

print(f"✅ Model 1 Accuracy (DistilBERT): {accuracy_1:.2%}")
print(f"✅ Model 2 Accuracy (RoBERTa): {accuracy_2:.2%}")

# Optionally save to CSV (written to the current working directory)
df_results.to_csv("qa_model_comparison_results.csv", index=False)




config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

✅ Model 1 Accuracy (DistilBERT): 83.00%
✅ Model 2 Accuracy (RoBERTa): 86.00%
