### Performance Autograder

Evaluate the performance of the model's result

Metrics:
- Accuracy
- Precision and Recall
- F1 Score

Input:
- Excel file containing the ground truth
- Excel file containing the model's result

### Accuracy calculation approach:
For each timeslot on each day (i.e. each cell of the timetable):
- Calculate the intersection of unavailable students between ground truth and test results to get correctly matched students.
- Calculate extra students in the test results (those not in the ground truth).
- Calculate missing students from the ground truth (those not in the test results).

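Finally, sum these counts over all cells and calculate accuracy as the share of correctly matched students among all students that appear in either table: $\text{accuracy} = \dfrac{\text{correct}}{\text{correct} + \text{extra} + \text{missing}} \times 100\%$.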

In [7]:
import pandas as pd

# Workbooks to compare: the reference timetable and the model's predicted timetable
ground_truth_file, test_result_file = 'ground_truth_updated.xlsx', 'Gemini 1.5 Pro.xlsx'

# Read each workbook into its own DataFrame
ground_truth_df = pd.read_excel(io=ground_truth_file)
test_result_df = pd.read_excel(io=test_result_file)

# Preview the first rows of both frames (shown together as a tuple)
(ground_truth_df.head(), test_result_df.head())

(  Unnamed: 0 Monday Tuesday Wednesday Thursday Friday Saturday Sunday
 0    8AM-9AM      C       C         C        C      C       EJ      E
 1   9AM-10AM    BCH      CH        CH       CH     CH       EJ      E
 2  10AM-11AM    BDH       H         H        H     DH       EJ      E
 3  11AM-12PM      D     NaN       NaN      NaN      D       EJ      E
 4   12PM-1PM    NaN     NaN       NaN      NaN    NaN       EJ      E,
    Time Slot Monday Tuesday Wednesday Thursday Friday  Saturday  Sunday
 0    8AM-9AM      C      CH        CH       CH    CDH       NaN     NaN
 1   9AM-10AM    BCH      CH        CH       CH    CDH       NaN     NaN
 2  10AM-11AM   BCDH      CH        CH       CH    CDH       NaN     NaN
 3  11AM-12PM   BCDH      CH        CH       CH    CDH       NaN     NaN
 4   12PM-1PM    NaN     NaN       NaN      NaN    NaN       NaN     NaN)

### Precision and Recall, F1 Score

In [8]:
# Helper: parse one timetable cell into the set of student identifiers it lists
def _students_in_cell(value):
    """Return the set of student identifiers contained in one timetable cell.

    Each alphanumeric character is treated as one student. Missing cells
    (NaN/None) yield an empty set. Whitespace and punctuation, such as the
    separators in "E, J", are ignored so they are never counted as students.
    Non-string values are converted with ``str`` before parsing.
    """
    if pd.isna(value):
        return set()
    return {char for char in str(value) if char.isalnum()}


# Function to calculate refined accuracy, precision, recall, and F1-score
def refined_accuracy_and_f1(ground_truth_df, test_result_df):
    """Compare two timetables cell by cell and report agreement metrics.

    The first column of each frame (the timeslot labels) is dropped. The
    remaining cells are matched by row index and column label. Both frames
    are aligned on the union of their row and column labels, so a row or
    column present in only one of them is compared against empty cells
    instead of raising ``KeyError`` or being silently skipped.

    Parameters
    ----------
    ground_truth_df : pandas.DataFrame
        Reference timetable; each cell lists students as single characters.
    test_result_df : pandas.DataFrame
        Timetable produced by the model, in the same layout.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1_score)``. ``accuracy`` is
        correct / (correct + extra + missing) expressed as a percentage
        (0-100); the other three are fractions in [0, 1]. Any metric whose
        denominator is zero is reported as 0.

    The four metrics are also printed, formatted to two decimals.
    """
    # Drop the first column (timeslot labels) for comparison
    ground_truth_data = ground_truth_df.iloc[:, 1:]
    test_result_data = test_result_df.iloc[:, 1:]

    # Align both tables on the union of their labels; cells that exist in only
    # one table become NaN in the other and therefore parse to an empty set.
    all_rows = ground_truth_data.index.union(test_result_data.index, sort=False)
    all_cols = ground_truth_data.columns.union(test_result_data.columns, sort=False)
    ground_truth_data = ground_truth_data.reindex(index=all_rows, columns=all_cols)
    test_result_data = test_result_data.reindex(index=all_rows, columns=all_cols)

    total_correct = 0
    total_tested = 0
    total_ground_truth = 0

    # Iterate over each cell to compare ground truth and test results
    for col in all_cols:
        for row in all_rows:
            ground_truth_set = _students_in_cell(ground_truth_data.at[row, col])
            test_result_set = _students_in_cell(test_result_data.at[row, col])

            # Students listed in both tables are the correct predictions
            total_correct += len(ground_truth_set & test_result_set)
            total_ground_truth += len(ground_truth_set)
            total_tested += len(test_result_set)

    # |truth ∪ predicted| summed over cells == correct + extra + missing
    total_union = total_ground_truth + total_tested - total_correct

    # Calculate refined accuracy (as a percentage)
    accuracy = (total_correct / total_union) * 100 if total_union > 0 else 0.0

    # Calculate precision, recall, and F1-score
    precision = total_correct / total_tested if total_tested > 0 else 0.0
    recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    # Print the results
    print("Refined Accuracy: {:.2f}%".format(accuracy))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    print("F1 Score: {:.2f}".format(f1_score))

    return accuracy, precision, recall, f1_score

# Score the model's timetable against the ground truth. The call prints the four
# metrics and returns them: accuracy as a percentage, the rest as fractions.
accuracy_score, precision, recall, f1_score = refined_accuracy_and_f1(ground_truth_df, test_result_df)


Refined Accuracy: 30.00%
Precision: 0.47
Recall: 0.45
F1 Score: 0.46
