In [61]:
import os
import json
import pandas as pd
from openai import OpenAI
import openai

# OpenAI api key
key = ''
# set the env variable
os.environ['OPENAI_API_KEY'] = key

client = OpenAI()

ModuleNotFoundError: No module named 'openai'

In [None]:
def json_to_df(sys_eval_path):
    with open(sys_eval_path, 'r') as file:
        system_eval = json.load(file)

    # Convert the dictionary to a DataFrame
    df = pd.DataFrame(system_eval).T.reset_index().rename(columns={'index': 'query'})

    # Gather the model_x columns into two columns - model and value
    df_melt = df.melt(id_vars=['query', 'type', 'truth', 'context'], var_name='model', value_name='response')

    return df_melt

def generate_grade(row, model_name='gpt-3.5-turbo-1106', system_prompt=
                   """You are GPT-4, a large language model created by OpenAI. 
                    You are a precise grader and will be provided a question, some background context, and an answer to the question. 
                    Your task is to grade how good the answer to the question is based on the background context, on a 1-10 scale. 
                    Just answer with a grade as a single number, e.g. 1, no further explanation is needed. 
                    Please do a good job as my your work is very important to my career."""):
    """
    Get the grade for a specific qa pair.
    """
    query = row['query']
    context = row['context']
    response = row['response']
    # Create the string
    qa_pair = f'Question: {query}, Context: {context}, Answer: {response}'

    # llm evaluation
    try:
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": qa_pair}
            ]
        )
    except openai.APIError as e:
        print(f'Error: {e}, setting grade to 0')
        grade = 0
        return 0

    # get the answer from the llm and cast to int
    try:
        grade = float(completion.choices[0].message.content)
        return grade
    except ValueError:
        print('Could not cast grade to float. Setting grade to 0.')
        grade = 0
        return grade

def llm_evaluation(df):
    """
    qa_pairs should be the result from the generate_qa_pairs function.
    """
    df['grade'] = df.apply(generate_grade, axis=1)

    return df

In [None]:
def update_pickle(graded_df, pickle_path='eval_dataset/graded_df.pkl'):
    # Read the pickle file
    old_data = pd.read_pickle(pickle_path)

    # Concatenate the new data with the existing DataFrame
    merged_df = pd.concat([graded_df, old_data], ignore_index=True)

    # Drop duplicates
    merged_df.drop_duplicates(inplace=True)

    # Save the updated DataFrame to a pickle file
    merged_df.to_pickle(pickle_path)

def filter_new_rows(df, graded_df):
    # Merge the two dataframes
    # indicator=True adds a '_merge' column that specifies the source
    merged_df = pd.merge(df, graded_df, how='outer', indicator=True)

    # Filter the rows only in 'df' dataframe
    # i.e., first argument in pd.merge
    df_only = merged_df.loc[merged_df['_merge'] == 'left_only']

    # Drop the '_merge' and 'grade' columns
    df_only = df_only.drop(columns=['grade', '_merge'])

    return df_only


In [None]:
# json files
sys_eval_path = 'eval_dataset/system_eval.json'
# create qa pairs
df = json_to_df(sys_eval_path)
# Read the pickled DataFrame
graded_df = pd.read_pickle("eval_dataset/graded_df.pkl")
# Get only newly added tests
new_test_df = filter_new_rows(df, graded_df)


In [None]:
# Example:
# this returns df[10:15]
# i.e., non-overlapping rows of df
filter_new_rows(df[8:15], graded_df[:10])

In [None]:
df[10:15]

In [None]:
# run evaluation
graded_df = llm_evaluation(new_test_df)

In [None]:
# Update pickle file
update_pickle(graded_df)

## Tests on data aggregation

### Evaluate retriever

In [111]:
def check_course_code(row):
    if isinstance(row['response'], list):
        return any(row['truth'] in str(item) for item in row['response'])
    elif pd.isna(row['response']):
        return False
    else:
        return row['truth'] in str(row['response'])

def sort_models(s):
    # Create a temporary DataFrame
    temp_df = pd.DataFrame({
        'model': s.index,
        'numbers': s.index.str.extract('(\d+)', expand=False).astype(int)
    })
    
    # Sort the DataFrame by the model numbers
    temp_df = temp_df.sort_values('numbers')
    
    # Set the index of the Series to the sorted original index
    s = s.reindex(temp_df['model'])
    
    return s

def sort_multi_index_models(s):
    # Convert the Series to a DataFrame
    df = s.reset_index()
    
    # Extract the numerical part of 'model_x'
    df['model_num'] = df['model'].str.extract('(\d+)', expand=False).astype(int)
    
    # Sort the DataFrame by 'model_num' and 'type'
    df = df.sort_values(['model_num', 'type'])
    
    # Drop the 'model_num' column
    df = df.drop(columns=['model_num'])
    
    # Convert the DataFrame back to a Series
    s = pd.Series(df['grade'].values, index=[df['model'], df['type']])
    
    return df

retr_df = json_to_df('eval_dataset/retriever_eval.json')
retr_df['course_code_check'] = retr_df.apply(check_course_code, axis=1)

In [None]:
retriever_results = retr_df[retr_df['type'] == 'specific'].groupby('model')['course_code_check'].mean().reset_index()
retriever_results.index = retriever_results['model']
retriever_results.drop('model', inplace=True, axis=1)

In [None]:
sort_models(retriever_results)

Unnamed: 0_level_0,course_code_check
model,Unnamed: 1_level_1
model_1,1.0
model_2,0.733333
model_3,1.0
model_4,0.733333
model_5,1.0
model_6,0.733333
model_7,1.0
model_8,0.733333
model_9,1.0
model_10,0.733333


In [None]:
retr_df[(retr_df['type'] == 'specific') & (retr_df['response'].isna())]


Unnamed: 0,query,type,truth,context,model,response,course_code_check


### Manual regrading

In [None]:
import textwrap

regraded_df = graded_df.sort_values(['query', 'model']).copy().reset_index(drop=True)

for index, row in regraded_df.iterrows():
    print(f"{index} Query: {row['query']}", flush=True)
    
    # Wrap the response to a maximum width of 160 characters
    response = textwrap.fill(str(row['response']), width=160)
    print(f"{row['model']} response: {response}", flush=True)
    
    print("Grade: ", row['grade'], flush=True)
    user_grade = input("Please enter your grade: ")
    print("------------------------------------------", flush=True)
    regraded_df.loc[index, 'user_grade'] = user_grade

0 Query: Are there any courses for students interested in artificial intelligence?
model_10 response: Yes, there are two courses listed in the passage: 1.  Artificial Intelligence (dit410) and 2.  Introduction to Artificial Intelligence (dit411).
Grade:  100.0
------------------------------------------
1 Query: Are there any courses for students interested in artificial intelligence?
model_2 response:  Yes Хронологија: 2021/09/%20AI%20in%20society:%20language%20knowledge%20and%20ethics Хронологија:
2021/09/%20AI%20in%20society:%20language%20knowledge%20and%20ethics
Хронологија:\</ightarrow<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk> Хронологија Хронологија Хронологија<s> instanceof<s>
instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s> instanceof<s>
instanceof<s> instanceof<s> instanceof<s>bolds Хронологија<s>bolds Хронологија Хронологија Хронологија Хронологија<s

In [89]:
regraded_df['user_grade'] = regraded_df['user_grade'].astype(int)
regraded_df.to_pickle('regraded.pkl')

In [103]:
regraded_df = pd.read_pickle('regraded.pkl')
regraded_df.head()

Unnamed: 0,query,type,truth,context,model,response,grade,user_grade
0,Are there any courses for students interested ...,general,truth,"passage: course code: ais101, course content a...",model_10,"Yes, there are two courses listed in the passa...",100.0,95
1,Are there any courses for students interested ...,general,truth,"passage: course code: ais101, course content a...",model_2,Yes Хронологија: 2021/09/%20AI%20in%20society...,100.0,0
2,Are there any courses for students interested ...,general,truth,"passage: course code: ais101, course content a...",model_4,"Yes, there are several courses for students i...",100.0,0
3,Are there any courses for students interested ...,general,truth,"passage: course code: ais101, course content a...",model_6,"Yes, there are several courses related to Art...",100.0,95
4,Are there any courses for students interested ...,general,truth,"passage: course code: ais101, course content a...",model_8,"Yes, there are several courses at the Univers...",100.0,95


### Evaluate generator

In [136]:
def aggregated_results(pkl1='eval_dataset/regraded.pkl',
                       pkl2='eval_dataset/goldstandard_regraded.pkl',
                       pkl3='eval_dataset/regraded_anton.pkl',
                       pkl4='eval_dataset/regraded_patrick.pkl'):
    """
    """

    df1 = pd.read_pickle(pkl1)
    df2 = pd.read_pickle(pkl2)
    df3 = pd.read_pickle(pkl3)
    df4 = pd.read_pickle(pkl4)

    total_grades = df1['user_grade'].astype(float) + df2['user_grade'].astype(float) + df3['user_grade'].astype(float) + df4['user_grade'].astype(float)

    average_grades = total_grades / 4

    # Creating a new DataFrame with the average grades
    aggregated_df = df1.iloc[:, :-1].copy()
    aggregated_df['user_grade'] = average_grades

    return aggregated_df

In [140]:
regraded_df = aggregated_results()

In [141]:
# Calculate average score per model
average_score_per_model = sort_models(regraded_df.groupby('model')[['grade', 'user_grade']].mean())

# Calculate average score per model for "specific" and "general" questions separately
average_score_per_model_and_type = sort_multi_index_models(regraded_df.groupby(['model', 'type'])[['grade', 'user_grade']].mean())


In [None]:
average_score_per_model

Unnamed: 0_level_0,grade,user_grade
model,Unnamed: 1_level_1,Unnamed: 2_level_1
model_1,19.333333,0.0
model_2,17.5,0.0
model_3,58.0,0.0
model_4,80.75,0.0
model_5,89.0,95.0
model_6,68.5,72.25
model_7,93.666667,94.333333
model_8,84.75,69.25
model_9,86.0,78.0
model_10,83.75,57.5


In [143]:
average_score_per_model_and_type

Unnamed: 0,model,type,grade,user_grade
0,model_1,specific,19.333333,3.333333
3,model_2,general,20.0,0.0
4,model_2,specific,16.666667,1.583333
5,model_3,specific,58.0,17.151667
6,model_4,general,87.0,20.75
7,model_4,specific,78.666667,15.083333
8,model_5,specific,89.0,94.75
9,model_6,general,86.6,90.25
10,model_6,specific,62.466667,70.25
11,model_7,specific,93.666667,91.75


In [147]:
df = average_score_per_model_and_type.copy()
# Create a mapping dictionary
mapping_dict = {
    'model_1': 'Model 1',
    'model_2': 'Model 1',
    'model_3': 'Model 2',
    'model_4': 'Model 2',
    'model_5': 'Model 3',
    'model_6': 'Model 3',
    'model_7': 'Model 4',
    'model_8': 'Model 4',
    'model_9': 'Model 5',
    'model_10': 'Model 5'
}

# Replace the values in the 'model' column
df['model'] = df['model'].replace(mapping_dict)

# Create a mapping dictionary for the 'type' column
type_mapping_dict = {
    'specific': 'Exact search specific',
    'general': 'General'
}

# Replace the values in the 'type' column
df['type'] = df['type'].replace(type_mapping_dict)

# Add a new type for 'Similarity search specific'
df.loc[df['model'].duplicated(keep=False) & (df['type'] == 'Exact search specific'), 'type'] = 'Similarity search specific'

print(df)

      model                        type      grade  user_grade
0   Model 1  Similarity search specific  19.333333    3.333333
3   Model 1                     General  20.000000    0.000000
4   Model 1  Similarity search specific  16.666667    1.583333
5   Model 2  Similarity search specific  58.000000   17.151667
6   Model 2                     General  87.000000   20.750000
7   Model 2  Similarity search specific  78.666667   15.083333
8   Model 3  Similarity search specific  89.000000   94.750000
9   Model 3                     General  86.600000   90.250000
10  Model 3  Similarity search specific  62.466667   70.250000
11  Model 4  Similarity search specific  93.666667   91.750000
12  Model 4                     General  93.000000   93.500000
13  Model 4  Similarity search specific  82.000000   67.916667
14  Model 5  Similarity search specific  86.000000   74.916667
1   Model 5                     General  87.000000   67.750000
2   Model 5  Similarity search specific  82.666667   56

In [148]:
df

Unnamed: 0,model,type,grade,user_grade
0,Model 1,Similarity search specific,19.333333,3.333333
3,Model 1,General,20.0,0.0
4,Model 1,Similarity search specific,16.666667,1.583333
5,Model 2,Similarity search specific,58.0,17.151667
6,Model 2,General,87.0,20.75
7,Model 2,Similarity search specific,78.666667,15.083333
8,Model 3,Similarity search specific,89.0,94.75
9,Model 3,General,86.6,90.25
10,Model 3,Similarity search specific,62.466667,70.25
11,Model 4,Similarity search specific,93.666667,91.75


In [144]:
def top_k_acc(graded_df, grade_col='grade'):
    # Create a DataFrame where each row is a query and each column is a model, 
    # and the values are the grades given by GPT4
    pivot_df = graded_df.pivot(index='query', columns='model', values=grade_col)

    # For each query, find the models that got the highest grade
    winning_models = pivot_df.apply(lambda row: row[row == row.max()].index, axis=1)

    # Create a list of all winning models
    all_winning_models = [model for sublist in winning_models for model in sublist]

    # Calculate the percentage of queries where each model had the highest grade
    top_k_accuracy = sort_models(pd.Series(all_winning_models).value_counts() / pivot_df.shape[0])
    top_k_accuracy.name = grade_col
    return top_k_accuracy


In [145]:
top_k_accuracy = pd.concat([top_k_acc(regraded_df), top_k_acc(regraded_df, 'user_grade')], axis=1)

In [146]:
df_eval_1 = pd.concat([average_score_per_model, top_k_accuracy], axis=1)

df_eval_1.index = pd.MultiIndex.from_tuples([
    ('model 1', 'exact'),
    ('model 1', 'similarity'),
    ('model 2', 'exact'),
    ('model 2', 'similarity'),
    ('model 3', 'exact'),
    ('model 3', 'similarity'),
    ('model 4', 'exact'),
    ('model 4', 'similarity'),
    ('model 5', 'exact'),
    ('model 5', 'similarity')
], names=['Model', 'Type'])

df_eval_1.columns = pd.MultiIndex.from_tuples([
    ('Average', 'GPT3.5 grade'),
    ('Average', 'user grade'),
    ('Top Rate', 'GPT3.5 grade'),
    ('Top Rate', 'user grade')
], names=['Type', 'Measure'])

df_eval_1

Unnamed: 0_level_0,Type,Average,Average,Top Rate,Top Rate
Unnamed: 0_level_1,Measure,GPT3.5 grade,user grade,GPT3.5 grade,user grade
Model,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
model 1,exact,19.333333,3.333333,0.1,
model 1,similarity,17.5,1.1875,0.1,
model 2,exact,58.0,17.151667,0.4,
model 2,similarity,80.75,16.5,0.55,
model 3,exact,89.0,94.75,0.35,0.45
model 3,similarity,68.5,75.25,0.2,
model 4,exact,93.666667,91.75,0.3,0.3
model 4,similarity,84.75,74.3125,0.45,0.25
model 5,exact,86.0,74.916667,0.25,0.15
model 5,similarity,83.75,59.4375,0.3,


### Stuff that was already here

In [None]:
# responses that generated an error
graded_df[graded_df['grade'] == 0]

In [None]:
print(graded_df.groupby('model')['model'].count())

print(graded_df.groupby('model')['grade'].mean())

In [None]:
def process_results(df, model_info):
    """
    """

    