In [11]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [12]:
# Relative path to the labelled CSV; it is resolved against the current working directory.
result_dataset_path = "datasets/movies_reordered_sampled_labelled.csv"
# index_col=0 makes the first CSV column the DataFrame index instead of a data column.
movies = pd.read_csv(result_dataset_path, index_col=0)

In [13]:
def percent_match(list1, list2):
    """
    Count the positions at which the label from ``list2`` is contained in the
    corresponding entry of ``list1``.

    Despite its name, this function returns the raw number of matches, not a
    percentage; the two values coincide when the lists hold exactly 100 items.
    A position matches when ``label in prediction`` is true, so for string
    predictions this is a substring test rather than strict equality.

    Missing predictions (``None`` or a float NaN, e.g. an empty CSV cell read
    by pandas) are counted as non-matches instead of raising ``TypeError``.

    Parameters:
    list1 (list): Predictions to search in (containers such as strings).
    list2 (list): Expected labels to look for, aligned with ``list1``.

    Returns:
    int: The number of positions where the label occurs in the prediction.

    Raises:
    ValueError: If the two lists differ in length.
    """
    # Ensure the lists are of the same length
    if len(list1) != len(list2):
        raise ValueError("The two lists must be of the same length")

    def is_missing(value):
        # NaN is the only float that does not compare equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    # Count the positions where the label is contained in the prediction.
    return sum(
        1
        for prediction, label in zip(list1, list2)
        if not is_missing(prediction) and label in prediction
    )

In [14]:
# Display the DataFrame's column labels (the cell output below lists them).
movies.columns

Index(['movieinfo_rottentomatoeslink_movietitle', 'reviewtype', 'topcritic',
       'productioncompany', 'genres', 'reviewcontent', 'manual_labels',
       'llama2-7b-chat-hf', 'llama3-8b', 'gemma-2-2b-it', 'gemma-2-9b-it'],
      dtype='object')

In [15]:
def bootstrap(df, prediction_column: str, N=10000, sample_size=100, random_state=None):
    """
    Bootstrap the agreement between a prediction column and ``manual_labels``.

    On each of ``N`` iterations, ``sample_size`` rows are drawn from ``df``
    with replacement and scored with ``percent_match``. Rows are drawn by
    position rather than by index label, so a DataFrame whose index contains
    duplicate labels still yields exactly ``sample_size`` rows per resample.

    Parameters:
    df (pandas.DataFrame): Data holding ``prediction_column`` and a
        ``manual_labels`` column.
    prediction_column (str): Name of the column with the predictions to score.
    N (int): Number of bootstrap iterations.
    sample_size (int): Number of rows drawn per iteration. The default of 100
        keeps the original behaviour.
    random_state (int | numpy.random.Generator | None): Seed or generator
        passed to ``numpy.random.default_rng``. ``None`` (the default) gives
        a different result on every call; pass an integer for reproducibility.

    Returns:
    list: One ``percent_match`` result per iteration, i.e. the number of
        matching rows out of ``sample_size``.
    """
    rng = np.random.default_rng(random_state)

    # Extract both columns once; only the row positions change per iteration.
    predictions = df[prediction_column].to_numpy()
    labels = df['manual_labels'].to_numpy()
    n_rows = len(df)

    accuracy = []
    for _ in range(N):
        # Draw row positions with replacement and score the same rows from both columns.
        picks = rng.integers(0, n_rows, size=sample_size)
        accuracy.append(percent_match(predictions[picks].tolist(), labels[picks].tolist()))
    return accuracy

In [16]:
def mean_confidence_interval(data):
    """Summarise a sample by its mean and its 50th, 5th and 95th percentiles.

    Returns a 4-tuple ``(mean, median, lower, upper)`` in which ``median`` is
    the 50th percentile and ``lower`` / ``upper`` are the 5th / 95th percentiles.
    """
    values = np.array(data)
    # One vectorised call yields the three percentiles in the requested order.
    median, lower, upper = np.percentile(values, [50, 5, 95])
    return np.mean(values), median, lower, upper
    

In [17]:
# Bootstrap every model's prediction column and print
# (mean, median, 5th percentile, 95th percentile) of the per-resample match counts.
for model in ["llama3-8b", "gemma-2-2b-it", "gemma-2-9b-it", "llama2-7b-chat-hf"]:
    print(f"Model: {model}")
    results = bootstrap(movies, model)
    print(mean_confidence_interval(results))

Model: llama3-8b
(97.0034, 97.0, 94.0, 99.0)
Model: gemma-2-2b-it
(90.0495, 90.0, 85.0, 95.0)
Model: gemma-2-9b-it
(90.98, 91.0, 86.0, 95.0)
Model: llama2-7b-chat-hf
(16.9818, 17.0, 11.0, 23.0)
