# **Metrics Calculation for Llama and GPT Models**

This notebook calculates the evaluation metrics (accuracy, F1, precision, and recall) for runs of both Llama and GPT models. After you set the paths to the respective result directories, the notebook writes a `stats.csv` file into each of them containing the metrics for each dataset and task.

In [14]:
import pandas as pd
import os
import helper as analytics
from helper import calculate_scores, get_epoch_from_checkpoint
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from utils import parse_response

In [26]:
def calculate_metrics(y_true, y_pred):
    """Compute accuracy, F1, precision and recall for one set of predictions.

    The scikit-learn defaults are used for averaging, so the labels are
    expected to be binary with 1 as the positive class.

    Parameters:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
    tuple: ``(accuracy, f1, precision, recall)``, in that order.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 makes the undefined case explicit (e.g. a group with no
    # positive predictions): the score is reported as 0, which is the value
    # scikit-learn falls back to by default, but without emitting an
    # UndefinedMetricWarning for every such group.
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    return accuracy, f1, precision, recall

In [2]:
def list_folders(directory):
    """Recursively collect every sub-folder whose name contains 'results'.

    Parameters:
    directory (str): Root directory to walk.

    Returns:
    list[str]: Paths (root joined with folder name) of all matching folders,
    in the order ``os.walk`` visits them.
    """
    return [
        os.path.join(parent, name)
        for parent, subfolders, _ in os.walk(directory)
        for name in subfolders
        if 'results' in name
    ]

# Function to extract the checkpoint number
def get_checkpoint_number(path):
    """Return the integer that follows the last '-' in ``path``.

    If ``path`` contains no '-', the whole string is parsed. Raises
    ``ValueError`` when that trailing piece is not a valid integer.
    """
    _, _, suffix = path.rpartition('-')
    return int(suffix)

# Calculate the results for Llama models

In [10]:
# Results directory of the Llama run to evaluate; stats.csv is written into it.
RESULT_DIR = "../results/meta-llama/Meta-Llama-3.1-8B-Instruct/wdc_no_quantization/2024-09-01-17-59-27/results"

experiment_paths = analytics.get_all_files_in_directory(RESULT_DIR)

stats_dataframes = []

for experiment_path in experiment_paths:
    # The dataset name is the folder that directly contains the result file.
    # os.path is used instead of splitting on "/" so the lookup does not
    # depend on the path separator or on doubled separators.
    dataset_name = os.path.basename(os.path.dirname(experiment_path))
    print(f"Processing {dataset_name}")
    df = pd.read_json(experiment_path)
    # Calculate stats for this result file and tag them with their dataset
    stats_df = analytics.calculate_stats(df)
    stats_df['Dataset'] = dataset_name  # Add dataset name for reference
    stats_dataframes.append(stats_df)

if stats_dataframes:
    result_df = pd.concat(stats_dataframes)
    result_df.to_csv(os.path.join(RESULT_DIR, "stats.csv"), index=False)
else:
    # pd.concat raises ValueError on an empty list; report the empty
    # directory the same way process_results_gpt does instead of crashing.
    print("No data found in the provided directory.")

Processing amazon-google-full
Processing wdc-fullsize
Processing abt-buy-full
Processing walmart-amazon
Processing dblp-scholar
Processing dblp-acm


# Calculate the results of OpenAI batch jobs

In [24]:
# Main processing function
def process_results_gpt(directory):
    """
    Process the GPT batch-job results in a directory and write the metrics to ``stats.csv``.

    Every ``.jsonl`` file returned by ``analytics.get_all_files_in_directory``
    is loaded and concatenated. Each row's ``custom_id`` is split on ";" into
    dataset, task, pair_id and label, and the ``response`` column is passed
    through ``parse_response``. A row counts as a predicted match (1) when its
    ``content`` contains "Yes" and as a non-match (0) otherwise. Accuracy, F1,
    precision and recall are computed per (dataset, task) group and saved to
    ``<directory>/stats.csv``.

    Parameters:
    directory (str): The path to the directory containing the JSONL files.

    Returns:
    None. The metrics are written to disk and a status message is printed.
    """
    # Load and concatenate all .jsonl files from the directory
    jsonl_paths = analytics.get_all_files_in_directory(directory, "jsonl")
    frames = [pd.read_json(path, lines=True) for path in jsonl_paths]
    gpt_result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Nothing to evaluate: report it and stop before touching any columns
    if gpt_result.empty:
        print("No data found in the provided directory.")
        return

    # Split the custom_id into dataset, task, pair_id, and label
    gpt_result[['dataset', 'task', 'pair_id', 'label']] = gpt_result.custom_id.str.split(";", expand=True)
    gpt_result = gpt_result.drop(columns=['custom_id'])

    # Parse the response column and append the parsed fields as new columns
    # (presumably this is where 'content' comes from -- confirm in utils.parse_response)
    parsed_df = gpt_result["response"].apply(parse_response)
    gpt_result = pd.concat([gpt_result, parsed_df], axis=1)

    # A missing content value (None/NaN) would make `"Yes" in x` raise
    # TypeError. Such rows are counted as non-matches, and reported so the
    # gap is not silent.
    missing_content = gpt_result['content'].isna()
    if missing_content.any():
        print(f"Warning: {int(missing_content.sum())} responses have no content and are counted as non-matches.")

    # Transform 'content' to binary (0 or 1 based on "Yes")
    answers = gpt_result['content'].fillna("")
    gpt_result['content'] = answers.apply(lambda x: 1 if "Yes" in x else 0)

    # Convert label from string to integer
    gpt_result['label'] = gpt_result['label'].astype(int)

    # Group by 'dataset' and 'task', then calculate metrics
    results = []
    for (dataset, task), group in gpt_result.groupby(['dataset', 'task']):
        accuracy, f1, precision, recall = calculate_metrics(group['label'], group['content'])
        results.append({
            'dataset': dataset,
            'task': task,
            'accuracy': accuracy,
            'f1_score': f1,
            'precision': precision,
            'recall': recall
        })

    # Save metrics to CSV in the same directory
    metrics_df = pd.DataFrame(results)
    output_path = os.path.join(directory, 'stats.csv')
    metrics_df.to_csv(output_path, index=False)
    print(f"Metrics saved to {output_path}")


In [27]:
# Set this to the directory holding the GPT batch job's .jsonl result files;
# process_results_gpt writes stats.csv into that same directory.
process_results_gpt("../results/gpt-4o-mini/dblp-scholar")

Metrics saved to ../../results/gpt-4o-mini/dblp-scholar/stats.csv
