In [2]:
import pandas as pd
import numpy as np
import os
import json

In [3]:
PLUGIN_DATA_PATH = '/mnt/fheijden/code4me/code4me-server/data/'

In [4]:
paths = [*os.scandir(PLUGIN_DATA_PATH)]

In [39]:
json_events = []

for path in paths:
  with open(path) as fd:
    if os.stat(fd.fileno()).st_size == 0:
      continue

    json_events.append(json.load(fd))

len(json_events)

133588

In [52]:
def is_valid(x):
  return 'groundTruth' in x and \
    x['groundTruth'].strip() != '' and \
    len(list(filter(lambda p: p != '', x['predictions']))) != 0

def proper_json(inp):
  return {
    'completionTimestamp': inp['completionTimestamp'],
    'triggerPoint': inp['triggerPoint'],
    'language': inp['language'] if inp['language'] != 'ecmascript 6' else 'javascript',
    'ide': inp['ide'],
    'model': inp['model'] if inp['model'] != 'CodeFill' else 'InCoder',
    'predictions': inp['predictions'],
    'inferenceTime': inp['inferenceTime'],
    'leftContextLength': inp['leftContextLength'] if 'leftContextLength' in inp else -1,
    'rightContextLength': inp['rightContextLength'] if 'rightContextLength' in inp else -1,
    'keybind': inp['keybind'] if 'keybind' in inp else None,
    'pluginVersion': inp['pluginVersion'] if 'pluginVersion' in inp else None,
    'chosenPrediction': inp['chosenPrediction'] if 'chosenPrediction' in inp else None,
    'groundTruth': inp['groundTruth']
  }

has_gt = list(filter(is_valid, json_events))

valid_jsons = list(map(proper_json, has_gt))

len(valid_jsons)

62866

In [54]:
df = pd.DataFrame(valid_jsons).astype({
  'completionTimestamp': 'datetime64',
  'triggerPoint': 'object',
  'language': 'object',
  'ide': 'object',
  'model': 'object',
  'predictions': 'object',
  'inferenceTime': 'float64',
  'leftContextLength': 'int64',
  'rightContextLength': 'int64',
  'keybind': 'object',
  'pluginVersion': 'object',
  'chosenPrediction': 'object',
  'groundTruth': 'object'
})

df.dtypes

completionTimestamp    datetime64[ns]
triggerPoint                   object
language                       object
ide                            object
model                          object
predictions                    object
inferenceTime                 float64
leftContextLength               int64
rightContextLength              int64
keybind                        object
pluginVersion                  object
chosenPrediction               object
groundTruth                    object
dtype: object

In [55]:
keys = [
  'keybind',
  'pluginVersion',
  'model',
  'language'
]

for k in keys:
  print(f"{k}: {df[k].unique()}")
  print()


keybind: [None False True]
pluginVersion: [None '1.0.10' '1.0.7']
model: ['UniXCoder' 'InCoder']
language: ['python' 'vue' 'javascript' 'php' 'latex' 'java' 'typescript'
 'typescriptreact' 'text' 'kotlin' 'html' 'mdx' 'dotenv' 'csharp'
 'objectivec' 'groovy' 'ruby' 'scss' 'xml' 'c#' 'dart' 'javascriptreact'
 'robotframework' 'blade' 'go' 'markdown' 'rust' 'elixir' 'json' 'jsonc'
 'shell script' 'css' 'djangotemplate' 'ignore' 'properties' 'plaintext'
 'swift' 'rhtml' 'postcss' 'prisma' 'blazor' 'yaml' 'textmate' 'gotmpl'
 'angular2html' 'julia' 'typescript jsx' 'c' 'scala' 'django-html'
 'shellscript' 'ini' 'dockerfile' 'perl5' 'glsl' 'cython' 'gitignore'
 'razor' 'tex' 'bibtex' 'requirements.txt' 'dockercompose' 'yamlex'
 'git-commit' 'mongodb' 'toml' 'postgresql' 'smartyconfig' 'makefile'
 'proto3' 'scminput' 'pip-requirements']


In [79]:
ctx_missing = df[(df['leftContextLength']==-1) & (df['rightContextLength']==-1)]
ctx_missing
# ctx_missing['leftContextLength', 'rightContextLength']

Unnamed: 0,completionTimestamp,triggerPoint,language,ide,model,predictions,inferenceTime,leftContextLength,rightContextLength,keybind,pluginVersion,chosenPrediction,groundTruth
60,2022-06-08 09:04:56.770077,in,python,jetbrains,UniXCoder,[users:],36.525,-1,-1,,,,users:
88,2022-06-08 05:00:35.960239,.,groovy,jetbrains,UniXCoder,[0.0'],73.053,-1,-1,,,,2.1'
92,2022-06-07 14:41:31.012118,,ruby,jetbrains,InCoder,[in_loss_percent.abs * 100) >= percent_gain.to_d],397.145,-1,-1,,,in_loss_percent.abs * 100) >= percent_gain.to_d,ard.gain_loss_percent < 0
127,2022-06-08 12:30:13.455065,(,java,jetbrains,UniXCoder,[int live) {],86.001,-1,-1,,,,int leve){
186,2022-06-08 13:42:54.435063,&,ruby,jetbrains,UniXCoder,[.currency_code == Payment.currency_code],169.745,-1,-1,,,,.present?
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62718,2022-06-08 13:01:00.588672,[,javascript,jetbrains,UniXCoder,[0];],36.711,-1,-1,,,,'Customer']
62738,2022-06-07 18:56:23.999194,.,ruby,jetbrains,InCoder,[by (0)],164.802,-1,-1,,,,by(0)
62826,2022-06-08 13:05:03.312528,(,markdown,jetbrains,InCoder,[as opposed to implicit ones).],210.768,-1,-1,,,,e.g. `const x = 1` becomes `const x: number = `).
62843,2022-06-08 15:39:19.549505,.,python,jetbrains,InCoder,[_bruh = Bruh._test],151.017,-1,-1,,,,_test = Bruh._test


In [None]:
by_tp = df.groupby('triggerPoint')

model_count = by_tp['model'].count()
chosen_count = by_tp['chosenPrediction'].count()
is_large = model_count > 100

print(f"Selected {is_large.sum()} different triggerpoints")

frac_chosen = (chosen_count / model_count)[is_large]

chooserate_df = pd.concat([frac_chosen.to_frame('fraction'), chosen_count[is_large], model_count[is_large]], axis=1)

chooserate_df.sort_values('fraction')

In [151]:
[name for name in by_tp.groups.keys() if name.startswith('.')]

['.',
 '...d',
 '...e',
 '...u',
 './sc',
 '.a',
 '.add',
 '.addb',
 '.addbea',
 '.addbearer',
 '.an',
 '.b',
 '.bg-whi',
 '.catch((e',
 '.catch((error',
 '.catch(e',
 '.corpus_d',
 '.corpus_datas',
 '.d',
 '.delete("/blog',
 '.e',
 '.el-select.multiselect:hover',
 '.f-m',
 '.f-main',
 '.find',
 '.get(`/blog/section',
 '.get(`/blog/section${p',
 '.get(`/user${s',
 '.get(`/user/?page=${page}limit=${t',
 '.get(`/user?limit=${t',
 '.get(`/user?limit=${this.q',
 '.get(`/user?limit=${this.s',
 '.get(`blog/article?limit=${this.query.limit}&q=${value}&article_section_id=${t',
 '.get(`faq/section?q=${v',
 '.get(z',
 '.getSessionReport(TitleSlugs.v',
 '.getSessionReport(t',
 '.l',
 '.link-row-adv-in',
 '.link-row-adv-insid',
 '.m',
 '.ma',
 '.multiselect--ac',
 '.multiselect--acti',
 '.multiselect_',
 '.multiselect__c',
 '.multiselect__con',
 '.multiselect__content',
 '.multiselect__content-wr',
 '.multiselect__content-wrapper',
 '.multiselect__option--h',
 '.multiselect__option--high',
 '.mult