# Predictions Evaluation

In [13]:
from pathlib import Path
import numpy as np
import pandas as pd

from utils.smooth_bleu import bleu_fromstr

In [14]:
def analyze_preds(base_file, sample_size=5):
    """Combine both models' predictions for one dataset and print a sample.

    Two CSV files are read, both derived from ``base_file`` with
    ``Path.with_suffix`` (the last extension, if any, is replaced):
    ``<base>.hf_pred.csv`` and ``<base>.finetuned_pred.csv``.

    Args:
        base_file: Path of the dataset the prediction files are named after.
        sample_size: Number of rows to print. Clamped to the number of
            available rows so small datasets do not raise.

    Returns:
        A DataFrame with the columns ``code``, ``hf_pred``,
        ``fine_tuned_pred`` and ``target``; missing values are replaced by
        empty strings.
    """
    # read files
    base_path = Path(base_file)
    hf_preds = pd.read_csv(base_path.with_suffix('.hf_pred.csv'))
    fine_tuned = pd.read_csv(base_path.with_suffix('.finetuned_pred.csv'))
    # put in df
    # NOTE(review): the columns are combined by row index, which assumes both
    # files list the same examples in the same order - confirm.
    df = pd.DataFrame({
        'code': fine_tuned['code'],
        'hf_pred': hf_preds['prediction'],
        'fine_tuned_pred': fine_tuned['prediction'],
        'target': fine_tuned['target'],
    })
    # DataFrame.replace returns a new frame unless inplace=True; the result
    # must be kept, otherwise the NaN values silently survive.
    df = df.replace(np.nan, '', regex=True)
    # print sample with predictions (fixed seed keeps the sample reproducible)
    sample = df.sample(min(sample_size, len(df)), random_state=42)
    for code, hf_pred, fine_tuned_pred, target in sample.to_numpy():
        print('-------------------')
        print(code)
        print(f'## Human Review: {target}')
        print(f'## HF Pred: {hf_pred}')
        print(f'## Fine Tuned Pred: {fine_tuned_pred}')
    return df

In [15]:
def calc_bleu(df):
    """Score the ``prediction`` column of ``df`` against its ``target`` column.

    Before scoring, each of the symbols ``( _ ) ` .`` is surrounded by spaces
    and runs of whitespace are collapsed to single spaces, so the symbols
    become separate whitespace-delimited tokens.

    Args:
        df: DataFrame with ``prediction`` and ``target`` columns. Missing
            values are treated as empty strings and other non-string values
            are converted with ``str``.

    Returns:
        The result of ``bleu_fromstr(preds, refs, rmstop=False)``; the
        notebook output suggests a numeric BLEU score - confirm against
        ``utils.smooth_bleu``.
    """
    # One translation table pads every symbol in a single pass instead of
    # chaining one str.replace (plus a whitespace collapse) per symbol.
    pad_symbols = str.maketrans({c: f" {c} " for c in "(_)`."})

    def normalize(value):
        # Cells read from CSV may be NaN or numeric; only strings have the
        # methods used below.
        text = "" if pd.isna(value) else str(value)
        return " ".join(text.translate(pad_symbols).split())

    preds = [normalize(pred) for pred in df['prediction']]
    refs = [normalize(ref) for ref in df['target']]
    return bleu_fromstr(preds, refs, rmstop=False)

def calc_bleu_score(base_file):
    """Print and return the BLEU results of both models for one dataset.

    The predictions are loaded from ``<base>.hf_pred.csv`` and
    ``<base>.finetuned_pred.csv``, where both names are built from
    ``base_file`` via ``Path.with_suffix``. NaN cells are replaced by empty
    strings before scoring with ``calc_bleu``.

    Args:
        base_file: Path of the dataset the prediction files are named after.

    Returns:
        Tuple ``(hf_bleu, ft_bleu)``.
    """
    base_path = Path(base_file)
    # Load both files first, then clean them, then score them.
    raw_frames = [
        pd.read_csv(base_path.with_suffix(suffix))
        for suffix in ('.hf_pred.csv', '.finetuned_pred.csv')
    ]
    hf_frame, ft_frame = [
        frame.replace(np.nan, '', regex=True) for frame in raw_frames
    ]
    scores = (calc_bleu(hf_frame), calc_bleu(ft_frame))
    for label, score in zip(('HF', 'Fine Tuned'), scores):
        print(f'{label} BLEU: {score}')
    return scores

## Qualitative Evaluation
We will now compare the predictions of the HF model and the fine-tuned model on samples of the four datasets.

For each sample we print the code diff, the human review (the target), the prediction of the HF model and the prediction of the fine-tuned model.

In [16]:
# Holds the combined prediction DataFrame returned by analyze_preds, keyed by dataset name.
df = {}

In [17]:
# 'msg-test' has no file extension, so Path.with_suffix inside analyze_preds appends the prediction suffixes.
df['msg'] = analyze_preds('../data/msg-test')

-------------------
@@ -1,6 +1,7 @@
 # frozen_string_literal: true

 require 'hocon'
+require 'bolt/error'

 class TransportConfig
   attr_accessor :host, :port, :ssl_cert, :ssl_key, :ssl_ca_cert, :ssl_cipher_suites,
## Human Review: Looks like this isn't used in this file?
## HF Pred: Why do we need this?
## Fine Tuned Pred: I am surprised you don't use `hocon` here.
-------------------
@@ -92,7 +92,7 @@ public final class OAuth2AuthorizedClientArgumentResolver implements HandlerMeth
 							clientRegistrationId.flatMap(id -> Mono.error(new IllegalStateException(
 									"Unable to resolve the Authorized Client with registration identifier \""
 											+ id
-											+ "\". An \"authenticated\" or \"unauthenticated\" session is required. To allow for unauthenticated access, ensure ServerHttpSecurity.anonymous() is configured."))))
+											+ "\". An \"authenticated\" or \"anonymous\" request is required. To allow for anonymous access, ensure ServerHttpSecurity.anonymous() is

In [18]:
# Here the '.csv' extension is replaced by '.hf_pred.csv' / '.finetuned_pred.csv' inside analyze_preds.
df['vscode'] = analyze_preds('../data/microsoft_vscode_1000.csv')

-------------------
@@ -340,6 +341,17 @@ export class DebugEditorContribution implements IDebugEditorContribution {
 		return new RunOnceScheduler(() => this.hoverWidget.hide(), HOVER_DELAY);
 	}
 
+	@memoize
+	private get provideNonDebugHoverScheduler(): RunOnceScheduler {
+		return new RunOnceScheduler(() => {
## Human Review: Where is this disposed?
## HF Pred: Why is this memoized?
## Fine Tuned Pred: This is not `Runnable` but `RunOnceScheduler`, right?
-------------------
@@ -53,6 +53,11 @@ export class SassParser extends cssParser.Parser {
 
 	// Sass variables: $font-size: 12px;
 	public _parseVariableDeclaration(panic:scanner.TokenType[]=[]): nodes.VariableDeclaration {
+		var cssVariableDeclaration= super._parseCssVariableDeclaration(panic);
## Human Review: That looks wrong. Not all places where you can declare a sass variable are also suited to declare a css variable.

## HF Pred: Nit: space after `=`
## Fine Tuned Pred: Minor style issue: missing space before the `=`
-----

In [19]:
# Prints the default sample of 5 rows (fixed random_state=42) and stores the full DataFrame.
df['kotlin'] = analyze_preds('../data/JetBrains_kotlin_1000.csv')

-------------------
@@ -24,10 +24,13 @@ abstract class AbstractIrLineNumberTest : AbstractLineNumberTest() {
 
     override fun compareCustom(psiFile: KtFile, wholeFile: File) {
         val fileText = psiFile.text
-        val expectedLineNumbers = normalize(
-            fileText.substring(fileText.indexOf("//") + 2)
-                .trim().split(" ").map { it.trim() }.toMutableList()
-        )
+        val expectedLineNumbers = fileText.split("\n".toRegex()).filter { line ->
## Human Review: How about simplifying this to:

        val expectedLineNumbers = normalize(
            fileText.substring(Regex("// \\d+").find(fileText)!!.range.start + 2)
                .trim().split(" ").map { it.trim() }.toMutableList()
        )

Then we are looking for a line that starts with "// " with a number of digits after it. That should work to exclude the comment lines with text. That looks consistent with what the old backend expects from these tests.
## HF Pred: Why did you change this?
##

In [20]:
# Same analysis for the uppy dataset; the result is stored under the 'uppy' key.
df['uppy'] = analyze_preds('../data/transloadit_uppy_1000.csv')

-------------------
@@ -28,6 +28,7 @@
     "@uppy/thumbnail-generator": "0.29.1",
     "@uppy/utils": "0.29.1",
     "classnames": "^2.2.6",
+    "cuid": "^2.1.1",
     "drag-drop": "2.13.3",
## Human Review: In other places we are using uuid, I think, is there a reason to use different packages? I was actually gonna suggest we switch to https://github.com/ai/nanoid everywhere, it’s smaller and ids are smaller.
## HF Pred: This should be `^2.1.0`, right?
## Fine Tuned Pred: Why are we using a fixed version here?
-------------------
@@ -49,7 +49,8 @@ class Tabs extends Component {
             tabindex="-1"
             type="file"
             name="files[]"
-            multiple="true"
+            multiple={this.props.maxNumberOfFiles !== 1 || !this.props.maxNumberOfFiles}
## Human Review: same here
## HF Pred: `multiple={this.props.maxNumberOfFiles !== 1}`
## Fine Tuned Pred: shouldnt it be only true if `this.props.maxNumberOfFiles` is defined ?
-------------------
@@ -496,6 +494,25

As we can see, both models are fairly mediocre at identifying the actual issue. However, I find that the HF model tends to produce more generic predictions, while the fine-tuned model produces predictions that are more specific to the code and show a better understanding of it.

Still, both models predict something sensible but struggle to pinpoint the problem.

## Quantitative Evaluation
For each dataset, we calculate the [BLEU-4](https://en.wikipedia.org/wiki/BLEU) score for the predictions of the HF model and the fine-tuned model. The BLEU score is a measure of how similar the predictions are to the target. The higher the score, the better the predictions.

In [21]:
# Returns (hf_bleu, ft_bleu); as the last expression of the cell, the tuple is echoed below the printed scores.
calc_bleu_score('../data/msg-test')

Total: 10169
Total: 10169


HF BLEU: 5.14
Fine Tuned BLEU: 5.34


(5.14, 5.34)

In [22]:
# BLEU of both models on the vscode predictions.
calc_bleu_score('../data/microsoft_vscode_1000.csv')

Total: 1000
Total: 1000


HF BLEU: 3.39
Fine Tuned BLEU: 4.02


(3.39, 4.02)

In [23]:
# BLEU of both models on the kotlin predictions.
calc_bleu_score('../data/JetBrains_kotlin_1000.csv')

Total: 1000


HF BLEU: 3.39
Fine Tuned BLEU: 4.32


Total: 1000


(3.39, 4.32)

In [24]:
# BLEU of both models on the uppy predictions.
calc_bleu_score('../data/transloadit_uppy_1000.csv')

Total: 1000


HF BLEU: 4.55
Fine Tuned BLEU: 4.84


Total: 1000


(4.55, 4.84)

As we can see, the fine-tuned model performs slightly better than the HF model on all datasets.

Nevertheless, the scores are still pretty low on every dataset (as the authors of {cite}`li2022codereviewer` put it: "it is a hard task").