# Predictions Evaluation

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

from utils.smooth_bleu import bleu_fromstr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def analyze_preds(base_file, sample_size=5):
    """Load both models' predictions for one dataset and print a random sample.

    The two prediction files are located by replacing the suffix of
    ``base_file`` with ``.hf_pred.csv`` and ``.finetuned_pred.csv``.

    Args:
        base_file: Path (``str`` or ``Path``) of the dataset the prediction
            files were derived from.
        sample_size: Number of rows to print. Clamped to the number of rows
            available so small datasets do not raise.

    Returns:
        A DataFrame with the columns ``code``, ``hf_pred`` and
        ``fine_tuned_pred``. Missing values are replaced by empty strings.
    """
    # read files
    base_path = Path(base_file)
    hf_preds = pd.read_csv(base_path.with_suffix('.hf_pred.csv'))
    fine_tuned = pd.read_csv(base_path.with_suffix('.finetuned_pred.csv'))
    # put in df
    df = pd.DataFrame({
        'code': fine_tuned['code'],
        'hf_pred': hf_preds['prediction'],
        'fine_tuned_pred': fine_tuned['prediction'],
    })
    # Empty predictions are read back as NaN. The filled frame must be
    # re-assigned: without inplace=True the result is a new object, and the
    # previous code silently discarded it.
    df = df.fillna('')
    # print sample with predictions
    sample = df.sample(min(sample_size, len(df)))
    for code, hf_pred, fine_tuned_pred in sample.to_numpy():
        print('-------------------')
        print(code)
        print(f'HF Pred: {hf_pred}')
        print(f'Fine Tuned Pred: {fine_tuned_pred}')
    return df

In [3]:
def calc_bleu(df):
    """Score ``df['prediction']`` against ``df['target']`` with ``bleu_fromstr``.

    Before scoring, both predictions and references are normalized: each of
    the characters ``(``, ``_``, ``)``, `````, ``.`` is surrounded by spaces
    and every run of whitespace is collapsed to a single space.

    Args:
        df: DataFrame with string columns ``target`` and ``prediction``.

    Returns:
        The value returned by ``bleu_fromstr(preds, refs, rmstop=False)``.
    """
    # One translation table pads every special character in a single pass.
    padding = str.maketrans({symbol: f" {symbol} " for symbol in "(_)`."})

    def normalize(text):
        # split()/join collapses whitespace runs and trims both ends.
        return " ".join(text.translate(padding).split())

    hypotheses = [normalize(pred) for pred in df['prediction']]
    references = [normalize(ref) for ref in df['target']]
    return bleu_fromstr(hypotheses, references, rmstop=False)

def calc_bleu_score(base_file):
    """Compute and print the BLEU scores of both models for one dataset.

    The prediction files are located by replacing the suffix of
    ``base_file`` with ``.hf_pred.csv`` and ``.finetuned_pred.csv``. NaN
    cells are replaced by empty strings before scoring with ``calc_bleu``.

    Args:
        base_file: Path (``str`` or ``Path``) of the dataset the prediction
            files were derived from.

    Returns:
        A tuple ``(hf_bleu, ft_bleu)``.
    """
    base_path = Path(base_file)
    csv_paths = [
        base_path.with_suffix(ext)
        for ext in ('.hf_pred.csv', '.finetuned_pred.csv')
    ]
    # Load both files before scoring so a missing file fails early.
    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    for frame in frames:
        frame.replace(np.nan, '', regex=True, inplace=True)
    hf_bleu, ft_bleu = [calc_bleu(frame) for frame in frames]
    print(f'HF BLEU: {hf_bleu}')
    print(f'Fine Tuned BLEU: {ft_bleu}')
    return hf_bleu, ft_bleu

## Qualitative Evaluation
We will now compare the predictions of the HF model and the fine-tuned model on samples of the four datasets.

We will print the code, the prediction of the HF model and the prediction of the fine-tuned model.

In [4]:
# Holds the DataFrame returned by analyze_preds for each dataset, keyed by a short dataset name.
df = {}

In [5]:
# Print a random sample of both models' predictions for ../data/msg-test and keep the combined frame.
df['msg'] = analyze_preds('../data/msg-test')

-------------------
@@ -20,10 +20,6 @@ from pylint import checkers, interfaces
 from pylint.checkers import utils


-def _is_constant_empty_str(node):
-    return isinstance(node, nodes.Const) and node.value == ""
-
-
 class CompareToEmptyStringChecker(checkers.BaseChecker):
     """Checks for comparisons to empty string.
     Most of the times you should use the fact that empty strings are false.
HF Pred: Why is this removed?
Fine Tuned Pred: Removing this helper function because it isn't used anywhere else and `_is_constant_empty_str` is used in other places.
-------------------
@@ -14,7 +14,7 @@ import net.sourceforge.pmd.RuleViolation;
 /**
  * A {@link RuleViolation} implementation that is immutable, and therefore cache friendly
  */
-public final class CachedRuleViolation implements RuleViolation {
+public class CachedRuleViolation implements RuleViolation {

     private final CachedRuleMapper mapper;
 
HF Pred: Why did you remove `final`
Fine Tuned Pred: Why did you remove the 

In [6]:
# Print a random sample of both models' predictions for the microsoft_vscode_1000 data and keep the combined frame.
df['vscode'] = analyze_preds('../data/microsoft_vscode_1000.csv')

-------------------
@@ -1127,4 +1134,12 @@ registry.add('eslint-stylish', {
 	applyTo: ApplyToKind.allDocuments,
 	fileLocation: FileLocationKind.Absolute,
 	pattern: defaultPattern('eslint-stylish')
-});
\ No newline at end of file
+});
+
+registry.add('go', {
+	owner: 'typescript',
+	applyTo: ApplyToKind.allDocuments,
+	fileLocation: FileLocationKind.Relative,
+	filePrefix: '${cwd}',
HF Pred: Why is this needed?
Fine Tuned Pred: I am not sure if this is a good idea. It makes me wonder if we shoul
-------------------
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<component type="desktop">
+	<id>@@NAME@@.desktop</id>
+	<metadata_license>CC-BY-SA-3.0</metadata_license>
+	<project_license>MIT</project_license>
+	<name>@@NAME_LONG@@</name>
+	<url type="homepage">https://code.visualstudio.com</url>
+	<summary>Code editor for developers supporting integration with existing tools</summary>
+	<description>
+		<p>
+			Visual Studio Code is a lightweight but powerful source code edi

In [7]:
# Print a random sample of both models' predictions for the JetBrains_kotlin_1000 data and keep the combined frame.
df['kotlin'] = analyze_preds('../data/JetBrains_kotlin_1000.csv')

-------------------
@@ -178,7 +181,7 @@ abstract class AbstractDebugTest : CodegenTestCase() {
         var inBoxMethod = false
         vmLoop@
         while (true) {
-            val eventSet = virtualMachine.eventQueue().remove(1000)
+            val eventSet = virtualMachine.eventQueue().remove(1000) ?: continue
HF Pred: Why is this change needed?
Fine Tuned Pred: This is the fix for the bug described in th
-------------------
@@ -3,8 +3,8 @@ package kara.internal
 /* Test <TYPO descr="Typo: In word 'somthing'">somthing</TYPO> */
 class <TYPO descr="Typo: In word 'Moree'">Moree</TYPO>Fun {
   fun <TYPO descr="Typo: In word 'wrte'">wrte</TYPO>() {
-    val <TYPO descr="Typo: In word 'childen'">childen</TYPO> = 12
+    val <TYPO descr="Typo: In word 'children'">children</TYPO> = 12
HF Pred: Why did you remove this
-------------------
@@ -0,0 +1,16 @@
+// IGNORE_BACKEND: JVM
+// KOTLIN_CONFIGURATION_FLAGS: ASSERTIONS_MODE=jvm
+
+class Outer {
+    class Inner {
+        fun f() { ass

In [8]:
# Print a random sample of both models' predictions for the transloadit_uppy_1000 data and keep the combined frame.
df['uppy'] = analyze_preds('../data/transloadit_uppy_1000.csv')

-------------------
@@ -1,6 +1,6 @@
 import { expectError, expectType } from 'tsd'
 import DefaultStore from '@uppy/store-default'
-import Uppy, { UIPlugin } from '..'
+import Uppy, { SuccessResponse, UIPlugin, UppyFile } from '..'
 import type { UploadedUppyFile, FailedUppyFile, PluginOptions } from '..'
HF Pred: Why do we need to ex
Fine Tuned Pred: What's the reason for this change?
-------------------
@@ -1076,6 +1076,7 @@ class Uppy {
       const currentProgress = this.getFile(file.id).progress
       this.setFileState(file.id, {
         progress: Object.assign({}, currentProgress, {
+          postprocess: this.postProcessors.length > 0,
HF Pred: postprocess: this.postProcessors.length > 0,
Fine Tuned Pred: Why don't we use `getPostProcessors` here as w
-------------------
@@ -681,6 +681,8 @@ module.exports = class Tus extends Plugin {
     this.uppy.on('reset-progress', this.handleResetProgress)
 
     if (this.opts.autoRetry) {
HF Pred: this.log('[Tus] The `autoRetry` option 

As we can see, the fine-tuned model produces better predictions than the HF model. The predictions are much more insightful and detailed. The HF model tends to produce more generic predictions, while the fine-tuned model produces predictions that are more specific to the code.

One oddity remains: the codereviewer model predictions are sometimes truncated and not shown in full. This is not caused by the length limit, which is fairly high (512 tokens). Determining the cause is left to future work.

## Quantitative Evaluation
For each dataset, we calculate the [BLEU-4](https://en.wikipedia.org/wiki/BLEU) score for the predictions of the HF model and the fine-tuned model. The BLEU score is a measure of how similar the predictions are to the target. The higher the score, the better the predictions.

In [9]:
# Print and return the (HF, fine-tuned) BLEU scores for ../data/msg-test.
calc_bleu_score('../data/msg-test')

Total: 10169
Total: 10169


HF BLEU: 4.25
Fine Tuned BLEU: 4.71


(4.25, 4.71)

In [10]:
# Print and return the (HF, fine-tuned) BLEU scores for the microsoft_vscode_1000 data.
calc_bleu_score('../data/microsoft_vscode_1000.csv')

Total: 1000
Total: 1000


HF BLEU: 2.39
Fine Tuned BLEU: 3.41


(2.39, 3.41)

In [11]:
# Print and return the (HF, fine-tuned) BLEU scores for the JetBrains_kotlin_1000 data.
calc_bleu_score('../data/JetBrains_kotlin_1000.csv')

Total: 1000
Total: 1000


HF BLEU: 2.71
Fine Tuned BLEU: 3.91


(2.71, 3.91)

In [12]:
# Print and return the (HF, fine-tuned) BLEU scores for the transloadit_uppy_1000 data.
calc_bleu_score('../data/transloadit_uppy_1000.csv')

Total: 1000
Total: 1000


HF BLEU: 3.25
Fine Tuned BLEU: 3.84


(3.25, 3.84)

As we can see, the fine-tuned model performs better than the HF model on all datasets. Nevertheless, the scores are still quite low, which suggests that, as the authors put it, "it is a hard task".

The semantic value of the predictions is also better, as we can see in the qualitative evaluation.