In [4]:
import pandas as pd
import numpy as np
import os
import json

In [5]:
# Directory holding the raw plugin event files. The CODE4ME_DATA_PATH environment
# variable overrides the default so the notebook can run on another machine.
PLUGIN_DATA_PATH = os.environ.get('CODE4ME_DATA_PATH', '/mnt/fheijden/code4me/code4me-server/data/')

In [6]:
# Materialise the directory listing once so it can be iterated and indexed later.
with os.scandir(PLUGIN_DATA_PATH) as data_dir:
  paths = list(data_dir)

In [7]:
# Peek at the first file name: drop its last five characters (presumably a
# '.json' extension — TODO confirm) and split the remainder on '-'.
first_stem = paths[0].name[:-5]
first_stem.split('-')

['b702de9a8eda47d2a66ef62944fdc163', '78aa9c1c21b84bada297edb2ee7bc569']

In [8]:
# Load every non-empty event file and tag each event with the two tokens taken
# from its file name: the name minus its last five characters (presumably a
# '.json' extension) is split on '-' into (userToken, verifyToken).
json_events = []

for path in paths:
  # Zero-byte files cannot be parsed as JSON, so skip them without opening them.
  if path.stat().st_size == 0:
    continue

  # Decode explicitly as UTF-8 so parsing does not depend on the machine's locale.
  with open(path, encoding='utf-8') as fd:
    json_event = json.load(fd)

  user_token, verify_token = path.name[:-5].split('-')
  json_event['userToken'] = user_token
  json_event['verifyToken'] = verify_token

  json_events.append(json_event)

len(json_events)

143539

In [9]:
# Union of the keys used by any event, to see which fields occur in the data.
all_keys = {key for event in json_events for key in event}

len(all_keys), all_keys

(18,
 {'chosenPrediction',
  'completionTimestamp',
  'groundTruth',
  'ide',
  'inferenceTime',
  'keybind',
  'language',
  'leftContext',
  'leftContextLength',
  'model',
  'modelPredictions',
  'pluginVersion',
  'predictions',
  'rightContext',
  'rightContextLength',
  'triggerPoint',
  'userToken',
  'verifyToken'})

In [10]:
# Despite its name, `either` holds the events that have NEITHER a 'model' nor a
# 'modelPredictions' key; an empty list means every event names its model(s).
either = [
  event for event in json_events
  if 'model' not in event and 'modelPredictions' not in event
]
len(either)

0

In [11]:
def is_valid(x):
  """Return True when the raw event `x` is usable for analysis.

  An event is usable when all of the following hold:
    * 'groundTruth' is present, is a string and is not blank after stripping;
    * at least one of the keys 'model' / 'modelPredictions' is present;
    * 'predictions' contains at least one entry that is not the empty string.

  Events with a missing or null 'predictions' key, or with a non-string
  'groundTruth', are reported as invalid instead of raising.
  """
  ground_truth = x.get('groundTruth')
  if not isinstance(ground_truth, str) or ground_truth.strip() == '':
    return False

  # The model check only guards against crashes on future data: no current
  # event lacks both keys (the preceding cell checks exactly that).
  if 'model' not in x and 'modelPredictions' not in x:
    return False

  # `or ()` treats a missing or null 'predictions' entry as "no predictions".
  return any(p != '' for p in (x.get('predictions') or ()))

def proper_json(inp):
  """Map a raw event dict onto the fixed set of fields used by the analysis.

  The 'model' field is resolved as follows: when the event has a 'model' key its
  value is used, with 'CodeFill' reported as 'InCoder'; otherwise the keys of
  'modelPredictions' are joined with commas. The language 'ecmascript 6' is
  reported as 'javascript'. Optional fields default to None, except the two
  context lengths, which default to -1.

  Raises KeyError when a required field is missing from `inp`.
  """
  if 'model' not in inp:
    model_name = ",".join(inp['modelPredictions'].keys())
  elif inp['model'] == 'CodeFill':
    model_name = 'InCoder'
  else:
    model_name = inp['model']

  normalised = {
    'completionTimestamp': inp['completionTimestamp'],
    'triggerPoint': inp['triggerPoint'],
    'language': 'javascript' if inp['language'] == 'ecmascript 6' else inp['language'],
    'ide': inp['ide'],
    'model': model_name,
    'modelPredictions': inp.get('modelPredictions'),
    'predictions': inp['predictions'],
    'inferenceTime': inp['inferenceTime'],
    # -1 marks events that did not report a context length.
    'leftContextLength': inp.get('leftContextLength', -1),
    'rightContextLength': inp.get('rightContextLength', -1),
    'keybind': inp.get('keybind'),
    'pluginVersion': inp.get('pluginVersion'),
    'chosenPrediction': inp.get('chosenPrediction'),
    'groundTruth': inp['groundTruth'],
    'leftContext': inp.get('leftContext'),
    'rightContext': inp.get('rightContext'),
    'userToken': inp['userToken'],
    'verifyToken': inp['verifyToken']
  }
  return normalised

# Keep only the usable events, then bring each one into the common field layout.
valid_inputs = [event for event in json_events if is_valid(event)]
valid_events = [proper_json(event) for event in valid_inputs]

len(valid_events)

68169

In [12]:
# Build the events table with an explicit dtype for every column.
# The timestamp dtype carries its unit ('datetime64[ns]') because pandas 2.x
# refuses to cast to a unit-less 'datetime64'.
df = pd.DataFrame(valid_events).astype({
  'completionTimestamp': 'datetime64[ns]',
  'triggerPoint': 'object',
  'language': 'object',
  'ide': 'object',
  'model': 'object',
  'modelPredictions': 'object',
  'predictions': 'object',
  'inferenceTime': 'float64',
  'leftContextLength': 'int64',
  'rightContextLength': 'int64',
  'keybind': 'object',
  'pluginVersion': 'object',
  'chosenPrediction': 'object',
  'groundTruth': 'object',
  'leftContext': 'object',
  'rightContext': 'object',
  'userToken': 'object',
  'verifyToken': 'object'
})

df.dtypes

completionTimestamp    datetime64[ns]
triggerPoint                   object
language                       object
ide                            object
model                          object
modelPredictions               object
predictions                    object
inferenceTime                 float64
leftContextLength               int64
rightContextLength              int64
keybind                        object
pluginVersion                  object
chosenPrediction               object
groundTruth                    object
leftContext                    object
rightContext                   object
userToken                      object
verifyToken                    object
dtype: object

In [13]:
# Columns whose number of distinct values is reported below.
keys = ['keybind', 'pluginVersion', 'model', 'language', 'userToken']

# Gather the distinct values per column first, then print one count per column.
unique_values = {column: df[column].unique() for column in keys}

for k, uniq in unique_values.items():
  print(f"{k}: ({len(uniq)})")


keybind: (3)
pluginVersion: (5)
model: (3)
language: (94)
userToken: (291)


In [14]:
# Rows where both context lengths hold -1, the placeholder written when the
# event did not report a length.
no_left_length = df['leftContextLength'] == -1
no_right_length = df['rightContextLength'] == -1

ctx_missing = df[no_left_length & no_right_length]
ctx_missing

Unnamed: 0,completionTimestamp,triggerPoint,language,ide,model,modelPredictions,predictions,inferenceTime,leftContextLength,rightContextLength,keybind,pluginVersion,chosenPrediction,groundTruth,leftContext,rightContext,userToken,verifyToken
65,2022-06-08 09:04:56.770077,in,python,jetbrains,UniXCoder,,[users:],36.525,-1,-1,,,,users:,,,4169f3a049ff4e108a21c9814d6c1bd9,accd83c772ac4c90a536764c17806161
94,2022-06-08 05:00:35.960239,.,groovy,jetbrains,UniXCoder,,[0.0'],73.053,-1,-1,,,,2.1',,,21bde26949274efbb17584c57eeedbd5,56ec382217574b7f8fd3ad84697df9bc
99,2022-06-07 14:41:31.012118,,ruby,jetbrains,InCoder,,[in_loss_percent.abs * 100) >= percent_gain.to_d],397.145,-1,-1,,,in_loss_percent.abs * 100) >= percent_gain.to_d,ard.gain_loss_percent < 0,,,5c04669f26024c10ae30960f6bbc50f2,803db09c01034775a3c8c08a9b735e53
140,2022-06-08 12:30:13.455065,(,java,jetbrains,UniXCoder,,[int live) {],86.001,-1,-1,,,,int leve){,,,21bde26949274efbb17584c57eeedbd5,561dddfe38b34414b7e73e4562634f82
200,2022-06-08 13:42:54.435063,&,ruby,jetbrains,UniXCoder,,[.currency_code == Payment.currency_code],169.745,-1,-1,,,,.present?,,,e3ec12a123894a8b910679bc72a27ad7,8a57f43d3f024b3abc1a84d90c4eed2a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68003,2022-06-08 13:01:00.588672,[,javascript,jetbrains,UniXCoder,,[0];],36.711,-1,-1,,,,'Customer'],,,b8822374a04748bd8984bdf872d36dfb,0f56f43ae46f4e6ca4019fbe7adae71b
68024,2022-06-07 18:56:23.999194,.,ruby,jetbrains,InCoder,,[by (0)],164.802,-1,-1,,,,by(0),,,5c04669f26024c10ae30960f6bbc50f2,1c35c6f4c15e4ba68b5bf43af6666bc7
68118,2022-06-08 13:05:03.312528,(,markdown,jetbrains,InCoder,,[as opposed to implicit ones).],210.768,-1,-1,,,,e.g. `const x = 1` becomes `const x: number = `).,,,6e39eaf7e31a43aba3787d7a507ccab6,5832b67a0adf41948a1b9bd7c59c431d
68139,2022-06-08 15:39:19.549505,.,python,jetbrains,InCoder,,[_bruh = Bruh._test],151.017,-1,-1,,,,_test = Bruh._test,,,81fea46f1ecf49128565efba63607a10,acd4c9dcdf094b06a690924b5b0836ec


In [None]:
# Show the 20 newest events that came from VS Code; this makes it easier to
# spot my own predictions.
from_vsc = df["ide"] == "vsc"
vsc = df[from_vsc]

newest_first = vsc.sort_values('completionTimestamp', ascending=False)
newest_first.head(20)

In [17]:
# Per trigger point: which fraction of the events has a chosen prediction?
by_tp = df.groupby('triggerPoint')

# count() tallies non-null entries only, so chosen_count covers just the events
# where 'chosenPrediction' is set.
chosen_count = by_tp['chosenPrediction'].count()
model_count = by_tp['model'].count()

# Only trigger points with more than 100 counted events are compared.
is_large = model_count > 100
print(f"Selected {is_large.sum()} different triggerpoints")

frac_chosen = (chosen_count / model_count)[is_large]

chooserate_df = pd.DataFrame({
  'fraction': frac_chosen,
  'chosenPrediction': chosen_count[is_large],
  'model': model_count[is_large],
})

chooserate_df.sort_values('fraction')

Selected 28 different triggerpoints


Unnamed: 0_level_0,fraction,chosenPrediction,model
triggerPoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
+,0.02381,20,840
%,0.025157,4,159
*,0.030556,22,720
is,0.04328,19,439
await,0.043478,5,115
[,0.046814,72,1538
if,0.047829,76,1589
/,0.049252,135,2741
&,0.049645,14,282
<,0.050921,105,2062


In [18]:
# List the trigger points whose text ends with a dot.
list(filter(lambda trigger: trigger.endswith('.'), by_tp.groups.keys()))

['.']