In [1]:
import pandas as pd
import os
import helper as analytics
import time
from dotenv import load_dotenv
from helper import calculate_scores, get_epoch_from_checkpoint
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
def calculate_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)`` as computed by the
        corresponding scikit-learn scoring functions with default settings.
    """
    # Evaluated in the same order in which the values are returned.
    scorers = (accuracy_score, f1_score, precision_score, recall_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [2]:
def list_folders(directory):
    """Recursively collect every sub-folder whose name contains 'results'.

    Args:
        directory: Root directory to walk.

    Returns:
        List of folder paths (each joined with the directory it was found in),
        in the order ``os.walk`` visits them.
    """
    return [
        os.path.join(parent, name)
        for parent, subdirs, _files in os.walk(directory)
        for name in subdirs
        if 'results' in name
    ]

def get_checkpoint_number(path):
    """Return the integer that follows the last '-' in a checkpoint path.

    Args:
        path: Checkpoint folder path such as ``"run/checkpoint-286"``;
            ``str`` or any object whose ``str()`` is the path.

    Returns:
        The trailing number as ``int`` (286 for the example above).

    Raises:
        ValueError: If the text after the last '-' is not an integer.
    """
    # Strip trailing separators so "run/checkpoint-286/" parses like
    # "run/checkpoint-286" instead of failing on int("286/").
    trimmed = str(path).rstrip('/\\')
    return int(trimmed.rsplit('-', 1)[-1])

In [13]:
# Convert every JSON result file below ../../results/ into a gzip-compressed
# pickle stored next to the original file.
result_folders = list_folders('../../results/')

for result_folder in result_folders:
    # The helper is asked for the 'json' files of this folder
    # (presumably it returns their paths — see helper.py).
    experiment_paths = analytics.get_all_files_in_directory(result_folder, 'json')
    for experiment in experiment_paths:
        # Swap only the file extension. A plain str.replace(".json", ...) would
        # also rewrite ".json" occurring earlier in the path and, if the name
        # had no ".json" at all, would write the pickle over the source file.
        pickle_path = os.path.splitext(experiment)[0] + ".pkl.gz"
        df = pd.read_json(experiment)
        df.to_pickle(pickle_path, compression='gzip')
        print(f"Saved {pickle_path}")

Saved ../../results/lama3.1/8B-Instruct/fine-tuned/explanations/2.8k/results/amazon-google/2024-07-26-13-27-21_lama3.pkl.gz
Saved ../../results/lama3.1/8B-Instruct/fine-tuned/explanations/2.8k/results/wdc/2024-07-26-13-12-41_lama3.pkl.gz
Saved ../../results/lama3.1/8B-Instruct/fine-tuned/explanations/2.8k/results/wdc/2024-07-26-12-49-49_lama3.pkl.gz
Saved ../../results/lama3.1/8B-Instruct/fine-tuned/explanations/2.8k/results/wdc/2024-07-26-12-36-05_lama3.pkl.gz
Saved ../../results/lama3.1/8B-Instruct/fine-tuned/explanations/2.8k/results/abt-buy/2024-07-26-13-20-27_lama3.pkl.gz
Saved ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/amazon-google/no_explanations/2024-08-14-16-50-06/results/amazon-google-full/2024-08-14-19-18-11_lama3.pkl.gz
Saved ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/amazon-google/no_explanations/2024-08-14-16-50-06/results/wdc-fullsize/2024-08-14-18-48-41_lama3.pkl.gz
Saved ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/amazon-google/no_explanation

In [2]:
# Task names of interest.
# NOTE(review): no other visible cell reads this list — confirm it is still needed.
included_tasks = [
    "domain-complex-free (Product)",
    "domain-simple-free (Product)",
    "domain-complex-force (Product)",
    "domain-simple-force (Product)",
]

In [10]:
RESULT_DIR = "../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/wdc_no_quantization/2024-09-01-17-59-27/results"

# Ask only for JSON files, as the JSON-to-pickle cell does: stats.csv (written
# at the end of this cell) and any .pkl.gz copies live in the same tree and
# cannot be parsed by pd.read_json, which would break a re-run.
experiment_paths = analytics.get_all_files_in_directory(RESULT_DIR, 'json')

stats_dataframes = []

for experiment_path in experiment_paths:
    # The dataset name is the folder that directly contains the result file.
    dataset_name = os.path.basename(os.path.dirname(experiment_path))
    print(f"Processing {dataset_name}")
    df = pd.read_json(experiment_path)
    # Per-task statistics for this result file.
    stats_df = analytics.calculate_stats(df)
    stats_df['Dataset'] = dataset_name  # Add dataset name for reference
    stats_dataframes.append(stats_df)

# One table for all datasets, persisted next to the results.
result_df = pd.concat(stats_dataframes)
result_df.to_csv(f"{RESULT_DIR}/stats.csv", index=False)

Processing amazon-google-full
Processing wdc-fullsize
Processing abt-buy-full
Processing walmart-amazon
Processing dblp-scholar
Processing dblp-acm


In [11]:
# Display the concatenated per-dataset statistics table.
result_df

Unnamed: 0,Task,Accuracy,F1 Score,Precision,Recall,Num -1 Responses,Dataset
0,domain-complex-free (Product),0.783689,0.446429,0.302115,0.854701,0,amazon-google-full
1,domain-simple-free (Product),0.810728,0.488208,0.337134,0.884615,0,amazon-google-full
2,domain-complex-force (Product),0.756215,0.423117,0.278912,0.876068,0,amazon-google-full
3,domain-simple-force (Product),0.779765,0.45049,0.30219,0.884615,0,amazon-google-full
0,domain-complex-free (Product),0.910222,0.684867,0.561381,0.878,0,wdc-fullsize
1,domain-simple-free (Product),0.923556,0.709949,0.613703,0.842,0,wdc-fullsize
2,domain-complex-force (Product),0.898444,0.656649,0.525872,0.874,0,wdc-fullsize
3,domain-simple-force (Product),0.915778,0.694108,0.581867,0.86,0,wdc-fullsize
0,domain-complex-free (Product),0.935804,0.760234,0.635179,0.946602,0,abt-buy-full
1,domain-simple-free (Product),0.951983,0.803419,0.717557,0.912621,0,abt-buy-full


## Continued training

In [34]:
def analyse_validation(checkpoint_path, new_model_name):
    """Rank the checkpoints of one training run by validation F1.

    Every checkpoint folder that contains a ``validation_results.json`` file
    is scored with ``calculate_scores``. The resulting table, sorted by F1
    (best first), is written to ``<checkpoint_path>/validation_results.csv``.

    Args:
        checkpoint_path: Directory of the training run that holds the
            checkpoint folders.
        new_model_name: Not used by this function; kept so existing callers
            keep working.

    Returns:
        Path of the checkpoint folder with the highest F1.

    Raises:
        FileNotFoundError: If no checkpoint folder provides a
            ``validation_results.json`` file, so nothing can be ranked.
    """
    result_file = f"{checkpoint_path}/validation_results.csv"
    # NOTE(review): list_checkpoint_folders is not defined in the visible part
    # of this notebook — confirm where it comes from before a fresh-kernel run.
    checkpoint_dirs = sorted(list_checkpoint_folders(checkpoint_path), key=get_checkpoint_number)

    results = []
    # A dedicated loop variable keeps the `checkpoint_path` argument intact.
    for checkpoint_dir in checkpoint_dirs:
        validation_file = f"{checkpoint_dir}/validation_results.json"
        if not os.path.exists(validation_file):
            print(f"No validation results found for {checkpoint_dir}, skipping")
            continue
        validation_df = pd.read_json(validation_file)
        # Rows whose cleaned response is -1 are recoded to 0 before scoring
        # (presumably -1 marks an answer that could not be parsed — confirm).
        validation_df.loc[validation_df['chatbot_response_clean'] == -1, 'chatbot_response_clean'] = 0
        f1, precision, recall = calculate_scores(validation_df)
        checkpoint_number = checkpoint_dir.split("/")[-1].replace("checkpoint-", "")
        # NOTE(review): the whole list of checkpoint folders is passed as the
        # first argument — confirm this matches the helper's signature.
        epoch = get_epoch_from_checkpoint(checkpoint_dirs, int(checkpoint_number))
        results.append({
            "checkpoint_path": checkpoint_dir,
            "checkpoint_number": checkpoint_number,
            "epoch": epoch,
            "f1": f1,
            "precision": precision,
            "recall": recall
        })

    if not results:
        # Without this guard the sort below fails with an opaque KeyError('f1').
        raise FileNotFoundError(
            f"No validation_results.json found in any checkpoint of {checkpoint_path}"
        )

    # Best F1 first.
    df_sorted = pd.DataFrame(results).sort_values(by='f1', ascending=False)

    best_checkpoint_path = df_sorted.iloc[0]['checkpoint_path']
    print(f"Best Checkpoint Path: {best_checkpoint_path}")
    print(f"Best F1: {df_sorted.iloc[0]['f1']}")
    df_sorted.to_csv(result_file, index=False)
    return best_checkpoint_path

In [35]:
# Rank the checkpoints of the five "enhanced" runs. analyse_validation already
# prints the best checkpoint path and its F1, so nothing is printed here
# (printing it again duplicated every "Best Checkpoint Path" line).
for i in range(1, 6):
    checkpoint_path = f"../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/synthetic_textual/2024-08-12-14-59-11_explanation/Meta-Llama-3.1-8B-Instruct-error-small_explanation-enhanced-{i}"
    # NOTE(review): this name says 70B while the run path says 8B; the value is
    # not used by analyse_validation — confirm which one is intended.
    new_model_name = f"Meta-Llama-3.1-70B-Instruct-error-small_explanation-enhanced-{i}"
    best_checkpoint_path = analyse_validation(checkpoint_path, new_model_name)

Best Checkpoint Path: ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/synthetic_textual/2024-08-12-14-59-11_explanation/Meta-Llama-3.1-8B-Instruct-error-small_explanation-enhanced-1/checkpoint-286
Best F1: 0.7471852610030706
Best Checkpoint Path: ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/synthetic_textual/2024-08-12-14-59-11_explanation/Meta-Llama-3.1-8B-Instruct-error-small_explanation-enhanced-1/checkpoint-286
Best Checkpoint Path: ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/synthetic_textual/2024-08-12-14-59-11_explanation/Meta-Llama-3.1-8B-Instruct-error-small_explanation-enhanced-2/checkpoint-143
Best F1: 0.7523105360443623
Best Checkpoint Path: ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/synthetic_textual/2024-08-12-14-59-11_explanation/Meta-Llama-3.1-8B-Instruct-error-small_explanation-enhanced-2/checkpoint-143
Best Checkpoint Path: ../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/synthetic_textual/2024-08-12-14-59-1

## Batch job

In [7]:
def parse_response(response):
    """Flatten one batch-output ``response`` object into a ``pd.Series``.

    Args:
        response: Dict with the keys ``status_code``, ``request_id`` and
            ``body`` (which in turn holds ``id``, ``created``, ``model``,
            ``choices`` and ``usage``). Anything that is not a dict
            (e.g. ``None`` or NaN) is treated as an empty response.

    Returns:
        ``pd.Series`` with the fields ``status_code``, ``request_id``,
        ``completion_id``, ``created``, ``model``, ``content``,
        ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.
        Fields that are missing in the input are ``None``.
    """
    if not isinstance(response, dict):
        response = {}

    # `x.get(key) or default` also covers keys that are present but null,
    # which `x.get(key, default)` would pass through as None.
    body = response.get("body") or {}
    usage = body.get("usage") or {}
    choices = body.get("choices") or [{}]
    # Only the first choice is read.
    first_choice = choices[0] or {}
    message = first_choice.get("message") or {}

    return pd.Series({
        "status_code": response.get("status_code"),
        "request_id": response.get("request_id"),
        "completion_id": body.get("id"),
        "created": body.get("created"),
        "model": body.get("model"),
        "content": message.get("content"),
        "prompt_tokens": usage.get("prompt_tokens"),
        "completion_tokens": usage.get("completion_tokens"),
        "total_tokens": usage.get("total_tokens"),
    })

In [35]:
# Flatten the "response" column of one batch output file into one row per line.
batch_output_path = "../../results/gpt-4o-mini/tobedetermined/batch_s1Z0ng7isxQqHWGDdTQI70bM_output.jsonl"
batch_output = pd.read_json(batch_output_path, lines=True)
df = batch_output["response"].apply(parse_response)

# TODO: the average completion tokens are not calculated yet.

# Show the model name reported by the first response.
df["model"].iloc[0]

'ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim::9z0GiK5t'

In [12]:
gpt_result_1_path = "../../results/gpt-4o-mini/wdc-small-synthetic-filtered-interesting/batch_EHm0OsyYK3mO3l61WDzDwRs6_output.jsonl"
gpt_result_2_path = "../../results/gpt-4o-mini/wdc-small-synthetic-filtered-interesting/batch_y1qVVew2QgFfM1qzXuSOo0zD_output.jsonl"

gpt_result_1 = pd.read_json(gpt_result_1_path, lines=True)
gpt_result_2 = pd.read_json(gpt_result_2_path, lines=True)

# Stack both batch outputs and renumber the rows from 0.
gpt_result = pd.concat([gpt_result_1, gpt_result_2], ignore_index=True)

# custom_id has the form "dataset;task;pair_id;label": split it into four
# columns and drop the combined column afterwards.
custom_id_parts = gpt_result["custom_id"].str.split(";", expand=True)
gpt_result[['dataset', 'task', 'pair_id', 'label']] = custom_id_parts
gpt_result = gpt_result.drop(columns=['custom_id'])

# Flatten each response object into its own set of columns ...
parsed_df = gpt_result["response"].apply(parse_response)

# ... and append those columns to the right of the original ones.
gpt_result = pd.concat([gpt_result, parsed_df], axis=1)

In [13]:
# transform content to 0 or 1
gpt_result['content'] = gpt_result['content'].apply(lambda x: 1 if "Yes" in x else 0)

# change label from string to int
gpt_result['label'] = gpt_result['label'].astype(int)

# Assuming 'label' is the true label and 'response' is the predicted label
results = []

grouped = gpt_result.groupby(['dataset', 'task'])

for (dataset, task), group in grouped:
    y_true = group['label']
    y_pred = group['content']
    
    accuracy, f1, precision, recall = calculate_metrics(y_true, y_pred)
    
    results.append({
        'dataset': dataset,
        'task': task,
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall
    })

# Convert the results into a DataFrame
metrics_df = pd.DataFrame(results)
metrics_df.to_csv(f"{gpt_result_1_path.rsplit('/', 1)[0]}/stats.csv", index=False)

In [14]:
# Display the metrics computed per (dataset, task) group.
metrics_df

Unnamed: 0,dataset,task,accuracy,f1_score,precision,recall
0,abt-buy-full,domain-complex-force (Product),0.9619,0.843011,0.756757,0.951456
1,abt-buy-full,domain-complex-free (Product),0.95929,0.834746,0.740602,0.956311
2,abt-buy-full,domain-simple-force (Product),0.962944,0.844639,0.768924,0.936893
3,abt-buy-full,domain-simple-free (Product),0.962944,0.845316,0.766798,0.941748
4,amazon-google-full,domain-complex-force (Product),0.873528,0.594972,0.441909,0.910256
5,amazon-google-full,domain-complex-free (Product),0.873092,0.596394,0.441478,0.918803
6,amazon-google-full,domain-simple-force (Product),0.873092,0.593007,0.440748,0.905983
7,amazon-google-full,domain-simple-free (Product),0.865678,0.580381,0.426,0.910256
8,dblp-acm,domain-complex-force (Product),0.854832,0.711647,0.553059,0.997748
9,dblp-acm,domain-complex-free (Product),0.85281,0.708333,0.549751,0.995495
