# Predictions Evaluation

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

from utils.smooth_bleu import bleu_fromstr

In [None]:
def analyze_preds(base_file, sample_size=5):
    """Load both models' predictions for a dataset and print a random sample.

    The prediction files are located by replacing the final suffix of
    ``base_file`` with ``.hf_pred.csv`` and ``.finetuned_pred.csv``.

    Args:
        base_file: Path (or string) of the dataset the predictions belong to.
        sample_size: Number of random rows to print. Capped at the number of
            rows available so small datasets do not raise.

    Returns:
        A DataFrame with the columns ``code``, ``hf_pred`` and
        ``fine_tuned_pred``, with missing values replaced by ``''``.
    """
    base_path = Path(base_file)
    hf_preds = pd.read_csv(base_path.with_suffix('.hf_pred.csv'))
    fine_tuned = pd.read_csv(base_path.with_suffix('.finetuned_pred.csv'))

    # Side-by-side view; the code column is taken from the fine-tuned file.
    df = pd.DataFrame({
        'code': fine_tuned['code'],
        'hf_pred': hf_preds['prediction'],
        'fine_tuned_pred': fine_tuned['prediction'],
    })
    # Empty predictions are read back from CSV as NaN. The cleaned frame must
    # be re-bound: the call returns a new object instead of mutating `df`.
    df = df.fillna('')

    # Print a random sample of rows with both predictions.
    sample = df.sample(min(sample_size, len(df)))
    for code, hf_pred, fine_tuned_pred in sample.to_numpy():
        print('-------------------')
        print(code)
        print(f'HF Pred: {hf_pred}')
        print(f'Fine Tuned Pred: {fine_tuned_pred}')
    return df

In [None]:
def calc_bleu(df):
    """Score the ``prediction`` column of ``df`` against its ``target`` column.

    Before scoring, each of the characters ``( _ ) ` .`` is padded with
    spaces so that it becomes a separate whitespace-delimited token, and
    runs of whitespace are collapsed to a single space.

    Args:
        df: Frame with string-valued ``target`` and ``prediction`` columns.

    Returns:
        Whatever ``bleu_fromstr`` returns for the normalised predictions and
        references (called with ``rmstop=False``).
    """
    targets = list(df['target'])
    predictions = list(df['prediction'])

    # Translation table: every special character maps to itself surrounded
    # by spaces.
    padded = {ord(ch): f" {ch} " for ch in "(_)`."}

    def normalise(text):
        # Pad the special characters, then collapse all whitespace runs.
        return " ".join(text.translate(padded).split())

    hypotheses = [normalise(text) for text in predictions]
    references = [normalise(text) for text in targets]
    return bleu_fromstr(hypotheses, references, rmstop=False)

def calc_bleu_score(base_file):
    """Compute and print the BLEU score of both models for one dataset.

    The prediction files are located by replacing the final suffix of
    ``base_file`` with ``.hf_pred.csv`` and ``.finetuned_pred.csv``. NaN
    cells are replaced by empty strings before scoring.

    Args:
        base_file: Path (or string) of the dataset the predictions belong to.

    Returns:
        A ``(hf_bleu, ft_bleu)`` tuple of the values returned by
        ``calc_bleu`` for the HF and the fine-tuned predictions.
    """
    base_path = Path(base_file)
    pred_paths = [
        base_path.with_suffix(ext)
        for ext in ('.hf_pred.csv', '.finetuned_pred.csv')
    ]
    # Load both files first, then clean, then score.
    frames = [pd.read_csv(path) for path in pred_paths]
    for frame in frames:
        frame.replace(np.nan, '', regex=True, inplace=True)
    hf_bleu, ft_bleu = [calc_bleu(frame) for frame in frames]
    print(f'HF BLEU: {hf_bleu}')
    print(f'Fine Tuned BLEU: {ft_bleu}')
    return hf_bleu, ft_bleu

## Qualitative Evaluation
We will now compare the predictions of the HF model and the fine-tuned model on samples of the four datasets.

We will print the code, the prediction of the HF model and the prediction of the fine-tuned model.

In [None]:
# Holds the comparison DataFrame returned by analyze_preds for each dataset,
# keyed by a short dataset name.
df = {}

In [None]:
# Print sampled predictions for the msg-test data and keep the combined frame.
df['msg'] = analyze_preds('../data/msg-test')

In [None]:
# Print sampled predictions for the microsoft_vscode_1000 data and keep the combined frame.
df['vscode'] = analyze_preds('../data/microsoft_vscode_1000.csv')

In [None]:
# Print sampled predictions for the JetBrains_kotlin_1000 data and keep the combined frame.
df['kotlin'] = analyze_preds('../data/JetBrains_kotlin_1000.csv')

In [None]:
# Print sampled predictions for the transloadit_uppy_1000 data and keep the combined frame.
df['uppy'] = analyze_preds('../data/transloadit_uppy_1000.csv')

As the samples above show, the fine-tuned model generally produces better predictions than the HF model: they are more insightful and detailed. The HF model tends to produce generic predictions, while the fine-tuned model's predictions are more specific to the code.

## Quantitative Evaluation
For each dataset, we calculate the [BLEU-4](https://en.wikipedia.org/wiki/BLEU) score for the predictions of the HF model and of the fine-tuned model. BLEU-4 measures the n-gram overlap (up to 4-grams) between each prediction and its target. The higher the score, the closer the predictions are to the targets.

In [None]:
# Print the HF and fine-tuned BLEU scores for the msg-test data.
calc_bleu_score('../data/msg-test')

In [None]:
# Print the HF and fine-tuned BLEU scores for the microsoft_vscode_1000 data.
calc_bleu_score('../data/microsoft_vscode_1000.csv')

In [None]:
# Print the HF and fine-tuned BLEU scores for the JetBrains_kotlin_1000 data.
calc_bleu_score('../data/JetBrains_kotlin_1000.csv')

In [None]:
# Print the HF and fine-tuned BLEU scores for the transloadit_uppy_1000 data.
calc_bleu_score('../data/transloadit_uppy_1000.csv')

As we can see, the fine-tuned model performs better than the HF model on all datasets. The semantic value of the predictions is also better, as we can see in the qualitative evaluation.