In [None]:
import json

def check_nli_model_accuracy(file_path):
    """Print every NLI example in a JSON Lines predictions file that was misclassified.

    Each non-blank line of ``file_path`` must be a JSON object with the keys
    ``premise``, ``hypothesis``, ``label`` and ``predicted_label``. For every
    record whose ``label`` differs from its ``predicted_label``, a running
    counter, the premise, the hypothesis and both labels (as English names)
    are printed to stdout.

    Args:
        file_path: Path to the JSON Lines file to inspect.

    Returns:
        None. The report is written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    total = 0
    label_descriptions = {
        0: "Entailment",
        1: "Neutral",
        2: "Contradiction"
    }

    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) are not valid JSON;
            # skip them instead of crashing in json.loads.
            if not line.strip():
                continue

            item = json.loads(line)

            premise = item['premise']
            hypothesis = item['hypothesis']
            actual_label = item['label']
            predicted_label = item['predicted_label']

            # Convert numeric labels to English descriptions; anything that
            # is not 0/1/2 is reported as "Unknown".
            actual_label_desc = label_descriptions.get(actual_label, "Unknown")
            predicted_label_desc = label_descriptions.get(predicted_label, "Unknown")

            if actual_label != predicted_label:
                total += 1
                print(f"Number: {total}")
                print(f"Premise: {premise}")
                print(f"Hypothesis: {hypothesis}")
                print(f"Actual Label: {actual_label_desc}, Predicted Label: {predicted_label_desc}\n")

In [4]:
# Print every NLI example in eval_predictions.jsonl whose predicted_label differs from its label.
check_nli_model_accuracy('eval_predictions.jsonl')

Number: 1
Premise: A woman is doing a cartwheel while wearing a bikini in the sand next to the beach.
Hypothesis: A woman is doing a cartwheel and falls on her head.
Actual Label: Contradiction, Predicted Label: Neutral

Number: 2
Premise: Two men on bicycles competing in a race.
Hypothesis: Men are riding bicycles on the street.
Actual Label: Neutral, Predicted Label: Entailment

Number: 3
Premise: At an outdoor event in an Asian-themed area, a crowd congregates as one person in a yellow Chinese dragon costume confronts the camera.
Hypothesis: A single man is next to a camera
Actual Label: Contradiction, Predicted Label: Entailment

Number: 4
Premise: At an outdoor event in an Asian-themed area, a crowd congregates as one person in a yellow Chinese dragon costume confronts the camera.
Hypothesis: A crowd is dancing
Actual Label: Neutral, Predicted Label: Contradiction

Number: 5
Premise: At an outdoor event in an Asian-themed area, a crowd congregates as one person in a yellow Chinese

In [12]:
import json

def check_qa_model_accuracy(file_path):
    """Print every QA example in a JSON Lines predictions file that was answered wrongly.

    Each non-blank line of ``file_path`` must be a JSON object with the keys
    ``context``, ``question``, ``answers`` (itself holding a ``text`` entry
    with the reference answers) and ``predicted_answer``. A prediction counts
    as wrong when it is not contained in ``answers['text']``; the comparison
    is exact, with no normalisation of case, whitespace or punctuation.

    Args:
        file_path: Path to the JSON Lines file to inspect.

    Returns:
        None. The report is written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    total = 0

    # Decode as UTF-8 explicitly: contexts can contain non-ASCII characters
    # and the platform default encoding is not guaranteed to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) are not valid JSON;
            # skip them instead of crashing in json.loads.
            if not line.strip():
                continue

            item = json.loads(line)

            context = item['context']
            question = item['question']
            correct_answers = item['answers']['text']
            predicted_answer = item['predicted_answer']

            if predicted_answer not in correct_answers:
                total += 1
                print(f"Total: {total}")
                print("Model got it wrong:")
                print(f"Context: {context}")
                print(f"Question: {question}")
                print(f"Correct Answers: {correct_answers}")
                print(f"Predicted Answer: {predicted_answer}\n")

In [13]:
# Print every QA example in eval_predictions_qa.jsonl whose predicted answer is not among the reference answers (exact match).
check_qa_model_accuracy('eval_predictions_qa.jsonl')

Total: 1
Model got it wrong:
Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
Question: Where did Super Bowl 50 take place?
Correct Answers: ['Santa Clara, California', "Levi's Stadium", "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."]
P

In [5]:
import json

path = "eval_predictions.jsonl"

# Load the JSON Lines file into a list of dictionaries, one per record.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads rejects them.
with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

In [7]:
import json
import numpy as np
import json

path = "eval_predictions.jsonl"

# Load the JSON Lines file into a list of dictionaries, one per record.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads rejects them.
with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Analysis
total_predictions = len(data)
if total_predictions == 0:
    # Every metric below divides by the number of predictions; fail with a
    # clear message instead of a bare ZeroDivisionError.
    raise ValueError(f"No predictions found in {path!r}; cannot compute metrics.")

correct_predictions = 0
class_distribution = {0: 0, 1: 0, 2: 0}  # count of predictions per predicted label
confidence_sum = 0

for item in data:
    # Check if prediction is correct
    if item['label'] == item['predicted_label']:
        correct_predictions += 1

    # Update class distribution
    class_distribution[item['predicted_label']] += 1

    # Confidence is the largest softmax probability over the predicted scores.
    # Subtracting the maximum score before exponentiating leaves the softmax
    # mathematically unchanged but keeps np.exp from overflowing (inf / inf
    # would otherwise turn the confidence into nan for large scores).
    logits = np.asarray(item['predicted_scores'], dtype=float)
    exp_shifted = np.exp(logits - logits.max())
    softmax_scores = exp_shifted / exp_shifted.sum()
    confidence = softmax_scores.max()
    confidence_sum += confidence

# Calculate metrics
accuracy = (correct_predictions / total_predictions) * 100
average_confidence = confidence_sum / total_predictions

# Print results
print(f"Accuracy: {accuracy}%")
print(f"Average Confidence: {average_confidence}")
print(f"Class Distribution: {class_distribution}")
# NOTE: despite the "Percentage" label, the three values printed here are
# fractions of the total (between 0 and 1), not values scaled to 100.
print(f"Class Distribution (Percentage): {class_distribution[0] / total_predictions}, {class_distribution[1] / total_predictions}, {class_distribution[2] / total_predictions}")


Accuracy: 89.38223938223938%
Average Confidence: 0.9639950123454989
Class Distribution: {0: 3341, 1: 3248, 2: 3253}
Class Distribution (Percentage): 0.33946352367404997, 0.33001422475106684, 0.33052225157488313


In [8]:
from collections import Counter
import re

# Relies on `data`, the list of prediction dictionaries loaded from the
# JSON Lines file by an earlier cell; run that cell first.

# Separate incorrect predictions by label
incorrect_predictions = {0: [], 1: [], 2: []}         # misclassified items, keyed by true label
total_predictions_per_label = {0: 0, 1: 0, 2: 0}      # number of items per true label
word_frequency_per_label = {0: Counter(), 1: Counter(), 2: Counter()}

# Helper function to clean and split text into words
def tokenize(text):
    """Return the lower-cased whitespace-separated words of ``text`` with punctuation removed."""
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower().split()

for item in data:
    total_predictions_per_label[item['label']] += 1

    if item['label'] != item['predicted_label']:
        incorrect_predictions[item['label']].append(item)
        # Only the hypothesis text contributes to the word counts.
        words = tokenize(item['hypothesis'])
        word_frequency_per_label[item['label']].update(words)

# Calculate error percentage for each label. A label with no examples has an
# undefined error rate, so report NaN rather than dividing by zero.
error_percentage = {
    label: (len(incorrect_predictions[label]) / count) * 100 if count else float('nan')
    for label, count in total_predictions_per_label.items()
}

# Print results
print("Error Percentage by Label:")
for label, percentage in error_percentage.items():
    print(f"Label {label}: {percentage:.2f}%")

print("\nWord Frequency in Incorrect Predictions by Label:")
for label, freq_counter in word_frequency_per_label.items():
    print(f"Label {label}:")
    for word, freq in freq_counter.most_common(10):  # Print top 10 words
        print(f"  {word}: {freq}")


Error Percentage by Label:
Label 0: 8.95%
Label 1: 13.66%
Label 2: 9.30%

Word Frequency in Incorrect Predictions by Label:
Label 0:
  a: 252
  the: 185
  is: 115
  in: 82
  are: 64
  man: 61
  on: 42
  to: 35
  of: 33
  woman: 30
Label 1:
  a: 350
  the: 273
  is: 172
  are: 102
  man: 74
  in: 66
  woman: 55
  of: 49
  on: 47
  two: 42
Label 2:
  a: 244
  the: 214
  is: 124
  are: 66
  man: 60
  in: 60
  on: 53
  to: 38
  woman: 35
  people: 32
