In [1]:
import pandas as pd
import ast
import numpy as np

In [267]:
def recall_at_k(expected_files, actual_files, k):
     return len(set(actual_files[:k]) & set(expected_files)) / len(expected_files)

def precision_at_k(expected_files, actual_files, k):
    return len(set(actual_files[:k]) & set(expected_files)) / k

def f1(expected_files, actual_files, k):
    TP_set = set(expected_files) & set(actual_files)
    FN_set = set(expected_files) - set(actual_files)
    FP_set = set(actual_files) - set(expected_files)
    
    TP = len(TP_set)
    FN = len(FN_set)
    FP = len(FP_set)
    
    # Compute precision, recall, and F1-score
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    return F1

def get_expected_files_indexes(expected_files, actual_files) -> np.ndarray[int]:
    relevant = np.isin(expected_files, actual_files).astype(int)
    return np.where(relevant == 1)[0]

In [268]:
def calc_search_metrics(expected_files, actual_files):
    if len(expected_files) == 1:
        k = 1
    else:
        k = 2
    if len(set(actual_files) & set(expected_files)) == 0:
        return {
            'R@k': 0,
            'P@k': 0,
            'f1': 0,
        }
        
    metrics = {}
    metrics['R@k'] = recall_at_k(expected_files, actual_files, k)
    metrics['P@k'] = precision_at_k(expected_files, actual_files, k)
    metrics['f1'] = f1(expected_files, actual_files, k)
    return metrics

In [342]:
def calc_retrive_metrics(expected_files, actual_files, distances):
    expected_files_indexes = get_expected_files_indexes(expected_files, actual_files)
    if len(expected_files_indexes) == 0:
        return {
            "first_expected_pos": None,
            "last_expected_pos": None,
            "first_expected_distance": None,
            "last_expected_distance": None
        }
    metrics = {
        "first_expected_pos": expected_files_indexes[0] / len(actual_files),
        "last_expected_pos": expected_files_indexes[-1] / len(actual_files),
        "first_expected_index": expected_files_indexes[0],
        "last_expected_index": expected_files_indexes[-1],
        "first_expected_distance": distances[expected_files_indexes[0]],
        "last_expected_distance": distances[expected_files_indexes[-1]],
    }
    return metrics

In [343]:
def get_chat_metrics(results_path) -> pd.DataFrame:
    df = pd.read_csv(results_path)
    df['changed_files'] = df['changed_files'].map(lambda lst: ast.literal_eval(lst))
    df['final_files'] = df['final_files'].map(lambda lst: ast.literal_eval(lst))
    df['all_generated_files'] = df['all_generated_files'].map(lambda lst: ast.literal_eval(lst))

    metrics = []
    for i, row in df.iterrows():
        expected_files = row['changed_files']
        actual_files = row['final_files']
        m = calc_search_metrics(expected_files, actual_files)
        m['time_s'] = row['time_ms'] / 1000
        m['batches_count'] = row['batches_count']
        m['empty_output'] = 1 if len(row['final_files']) == 0 else 0
        m['irrelevant_output'] = 1 if len(set(row['changed_files']) & set(row['final_files'])) == 0 else 0
        m['wrong_output'] = 1 if len(set(row['all_generated_files']) - set(row['final_files'])) > 0 else 0
        metrics.append(m)

    return pd.DataFrame(metrics)

In [359]:
import re

def add_commas_after_second_tick(s):
    backtick_positions = [pos for pos, char in enumerate(s) if char == "'"]
    for i in range(len(backtick_positions) // 2):
        s = s[:backtick_positions[i * 2 + 1] + 1 + i] + ',' + s[backtick_positions[i * 2 + 1] + 1 + i:]
    return s

def get_emb_metrics(results_path) -> pd.DataFrame:
    df = pd.read_csv(results_path)
    try:
        df['final_files'] = df['final_files'].map(lambda lst: ast.literal_eval(lst))
        df['rank_scores'] = df['rank_scores'].map(lambda lst: ast.literal_eval(lst))
        df['changed_files'] = df['changed_files'].map(lambda lst: ast.literal_eval(lst))
    except Exception as e:
        df['final_files'] = df['final_files'].map(lambda lst: ast.literal_eval(add_commas_after_second_tick(lst.replace('\n', '').replace(' ... ', ' '))))
        df['rank_scores'] = df['rank_scores'].map(lambda lst: ast.literal_eval(re.sub(r'\s+', ' ', lst.replace('\n', '')).replace(' ', ', ')))
        df['changed_files'] = df['changed_files'].map(lambda lst: ast.literal_eval(lst))

    metrics = []
    for i, row in df.iterrows():
        expected_files = row['changed_files']
        actual_files = row['final_files']
        m = {}
        search_m = calc_search_metrics(expected_files, actual_files)
        m.update(search_m)
        retrive_m = calc_retrive_metrics(expected_files, actual_files, row['rank_scores'])
        m.update(retrive_m)
        m['time_s'] = row['time_ms'] / 1000
        metrics.append(m)

    return pd.DataFrame(metrics)

# Embedding-based baselines

## tfidf_nltk_cosine

In [360]:
df_emb_metrics['tfidf_nltk_cosine'] = get_emb_metrics('/home/tigina/bug-localization/output/tfidf_nltk_cosine/results.csv')
df_emb_metrics['tfidf_nltk_cosine'].dropna().mean()

R@k                        0.134377
P@k                        0.178030
f1                         0.069601
first_expected_pos         0.003790
last_expected_pos          0.019060
first_expected_index       0.166667
last_expected_index        1.719697
first_expected_distance    0.199603
last_expected_distance     0.177596
time_s                     4.650163
dtype: float64

## tfidf_bpe_cosine

In [362]:
df_emb_metrics['tfidf_bpe_cosine'] = get_emb_metrics('/home/tigina/bug-localization/output/tfidf_bpe_cosine/results.csv')
df_emb_metrics['tfidf_bpe_cosine'].dropna().mean()

AttributeError: 'list' object has no attribute 'replace'

## codet5_cosine

In [363]:
df_emb_metrics['codet5_cosine'] = get_emb_metrics('/home/tigina/bug-localization/output/codet5_emb/results.csv')
df_emb_metrics['codet5_cosine'].dropna().mean()

R@k                         0.172905
P@k                         0.203333
f1                          0.057408
first_expected_pos          0.002279
last_expected_pos           0.015942
first_expected_index        0.220000
last_expected_index         2.013333
first_expected_distance     0.626312
last_expected_distance      0.603506
time_s                     11.754244
dtype: float64

## gte_cosine

In [365]:
df_emb_metrics['gte_cosine'] = get_emb_metrics('/home/tigina/bug-localization/output/gte_cosine/results.csv')
df_emb_metrics['gte_cosine'].dropna().mean()

R@k                         0.229611
P@k                         0.290000
f1                          0.057408
first_expected_pos          0.002279
last_expected_pos           0.015942
first_expected_index        0.220000
last_expected_index         2.013333
first_expected_distance     0.886989
last_expected_distance      0.881204
time_s                     12.047388
dtype: float64

In [366]:
for baseline in ['tfidf_nltk_cosine', 'tfidf_bpe_cosine', 'codet5_cosine', 'gte_cosine']:
    m = df_emb_metrics[baseline].dropna().mean()
    row = '{} & {:.2f} & {:.2f} & {:.2f} \\\\'.format(baseline, m['R@k'], m['R@k'], m['f1'])
    print(row)

tfidf_nltk_cosine & 0.13 & 0.13 & 0.07 \\
tfidf_bpe_cosine & 0.31 & 0.31 & 0.12 \\
codet5_cosine & 0.17 & 0.17 & 0.06 \\
gte_cosine & 0.23 & 0.23 & 0.06 \\


# Chat-based baselines

In [145]:
df_chat_metrics = {}

## openai_chat_gpt-4-1106-preview

In [146]:
df_chat_metrics['openai_chat_gpt-4-1106-preview'] = get_chat_metrics('/home/tigina/bug-localization/output/openai_chat_gpt-4-1106-preview/results.csv')
df_chat_metrics['openai_chat_gpt-4-1106-preview'].mean()

R@k                  0.292573
P@k                  0.383333
f1                   0.391753
time_s               6.786515
batches_count        1.026667
empty_output         0.026667
irrelevant_output    0.166667
wrong_output         0.166667
dtype: float64

## openai_chat_gpt-3.5-turbo-1106

In [147]:
df_chat_metrics['openai_chat_gpt-3.5-turbo-1106'] = get_chat_metrics('/home/tigina/bug-localization/output/openai_chat_gpt-3.5-turbo-1106/results.csv')
df_chat_metrics['openai_chat_gpt-3.5-turbo-1106'].mean()

R@k                  0.248622
P@k                  0.313333
f1                   0.327193
time_s               3.645958
batches_count        1.886667
empty_output         0.040000
irrelevant_output    0.366667
wrong_output         0.260000
dtype: float64