In [80]:
import copy
import glob
import json
import os
import random
import re
from collections import Counter
from random import shuffle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [81]:
random.seed(0)

In [82]:
root_dir = "/home/kirill/Documents/1.Projects/" \
           "class-ranking/intellij-community/" \
           "project-processing-results/processing/" \
           "java/classes/processing/0.0.1/"

cnt = 0
individualUsages = {}  # {className -> {className -> count}}
graph = {}  #{className -> [parentNames]}

def good(usage) -> bool:
    # {'TYPE', 'THIS_EXPRESSION', 'EXTENDS_LIST', 'IMPLEMENTS_LIST', 'METHOD_REF_EXPRESSION', 'EXPRESSION_LIST', 'NEW_EXPRESSION', 'CONDITIONAL_EXPRESSION', 'FIELD', 'ANNOTATION', 'THROWS_LIST', 'SUPER_EXPRESSION', 'JAVA_CODE_REFERENCE', 'REFERENCE_EXPRESSION', 'DOC_REFERENCE_HOLDER'}
    if usage["features"]["referenceType"] == "ANNOTATION":
        return False
    if usage["features"]["referenceType"] == "DOC_REFERENCE_HOLDER":
        return False
    return True

def addToUsages(data, individualUsages):
    enclosingName = data["keyInfo"]["name"]
    c = Counter()
    for usage in data['usages']:
        if good(usage):
            c.update([usage['name']])
    individualUsages[enclosingName] = c

def addToGraph(data, graph):
    enclosingName = data["keyInfo"]["name"]
    extends = data["keyInfo"]["additionalInfo"]["enclosingClassExtendsList"]
    impls = data["keyInfo"]["additionalInfo"]["enclosingClassImplementsList"]
    if enclosingName not in graph:
        graph[enclosingName] = set()
    for par in (extends + impls):
        if par not in graph:
            graph[par] = set()
        graph[par].update({enclosingName})


print(f'load files, build a graph, count indevidual usages ')
for filename in glob.glob(root_dir + '**/*.json', recursive=True):
    #print(f'name {filename}')
    with open(os.path.join(os.getcwd(), filename), 'r') as f:
        data = json.load(f)
        addToUsages(data, individualUsages)
        addToGraph(data, graph)
        cnt += 1
        if cnt % 10**4 == 0:
            print(f'{cnt} files loaded, for current file: direct_usages = {len(individualUsages[data["keyInfo"]["name"]])} | direct_children = {len(graph[data["keyInfo"]["name"]])} ')
print(f'{cnt} files processed ')

load files, build a graph, count indevidual usages 
10000 files loaded, for current file: direct_usages = 6 | direct_children = 0 
20000 files loaded, for current file: direct_usages = 10 | direct_children = 0 
30000 files loaded, for current file: direct_usages = 2 | direct_children = 0 
40000 files loaded, for current file: direct_usages = 5 | direct_children = 0 
50000 files loaded, for current file: direct_usages = 2 | direct_children = 0 
53265 files processed 


In [83]:
familyUsages = {}  # className -> usages in all children (className -> number)
family = {}  # className -> self and all children set(className)
parent = {}  # className -> className


def dfs(v):
    cur_usages = copy.deepcopy(individualUsages[v])
    cur_family_set = {v}
    for u in graph[v]:
        if u not in family:
            parent[u] = v
            dfs(u)

        u_usages, u_child_set = familyUsages[u], family[u]
        cur_usages.update(u_usages)
        cur_family_set.update(u_child_set)
    familyUsages[v], family[v] = cur_usages, cur_family_set


v_count = 0
for v in individualUsages:
    if v not in familyUsages:
        dfs(v)
    v_count += 1
    if v_count % 10 ** 4 == 0:
        print(f'{v_count} vertexes evalueted | in {v} used {len(familyUsages[v])} classes, {len(family[v])} children')

10000 vertexes evalueted | in com.intellij.openapi.externalSystem.model.execution.ExternalSystemTaskExecutionSettings used 6 classes, 1 children
20000 vertexes evalueted | in com.intellij.codeInspection.SimplifyCollectorInspection used 10 classes, 1 children
30000 vertexes evalueted | in com.intellij.refactoring.changeSignature.ChangeSignatureParameterUsageInfo used 2 classes, 1 children
40000 vertexes evalueted | in com.intellij.structuralsearch.impl.matcher.MatchResultImpl used 5 classes, 1 children
50000 vertexes evalueted | in com.intellij.execution.actions.ChooseDebugConfigurationPopupAction used 2 classes, 1 children


In [84]:
ROOT_CLASS = "com.intellij.openapi.actionSystem.AnAction"
#print(familyUsages[ROOT_CLASS].most_common())

In [85]:
train_test_classes = family[ROOT_CLASS]
train_list, test_list = train_test_split(list(train_test_classes), train_size=0.7, random_state=0)
test_list, valid_list = train_test_split(test_list, train_size=0.65, random_state=0)
trainClasses = set(train_list)
testClasses = set(test_list)
validClasses = set(valid_list)

In [86]:
train_usages = set()
for c in trainClasses:
    train_usages.update(individualUsages[c].keys())

In [87]:
firstCharToNameList = {}
for k, v in familyUsages[ROOT_CLASS].most_common():
    if k not in train_usages:
        continue
    first_char = k.rpartition('.')[2][0]
    if first_char not in firstCharToNameList:
        firstCharToNameList[first_char] = []
    firstCharToNameList[first_char].append(k)

with open('FirstCharToNameList.txt', 'w') as convert_file:
    convert_file.write(json.dumps(firstCharToNameList))

# for k in firstCharToNameList:
#     print(f'{k} {len(firstCharToNameList[k])}')

In [88]:
firstCharToNameListSampling = {}
for k, v in familyUsages[ROOT_CLASS].most_common():
    if k not in train_usages:
        continue
    first_char = k.rpartition('.')[2][0]
    if first_char not in firstCharToNameListSampling:
        firstCharToNameListSampling[first_char] = []
    firstCharToNameListSampling[first_char] += [k] * v

for k in firstCharToNameListSampling:
    shuffle(firstCharToNameListSampling[k])

with open('FirstCharToNameListSampling.txt', 'w') as convert_file:
    convert_file.write(json.dumps(firstCharToNameListSampling))

In [89]:
def listToStr(df, col):
    tmp = df[col].isnull(), col
    df.loc[tmp] = df.loc[tmp].apply(lambda x: [])
    df[col] = df[col].apply(lambda x: " ".join(x))

In [90]:
def splitCamelCase(name: str):
    res = []
    for rname in name.split(' '):
        rname = rname.strip('.')
        if '.' in rname:
            rname = rname.rpartition('.')[2]
        res += re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', rname)).split()
        #res += [rname]
    return res

In [91]:
fnameToSize = {
    'name': 5,
    'features.enclosingScopeName': 5,
    'features.enclosingClassName': 5,
    'features.variablesTypesInScope': 15
}


def namesToCats(usage):
    usage = copy.deepcopy(usage)
    for k in list(usage.keys()):
        if k not in fnameToSize:
            continue
        v = usage[k]
        if isinstance(v, list):
            v = " ".join(v)
        vl = splitCamelCase(v)
        #vl.sort()
        vl.reverse()
        if len(vl) < fnameToSize[k]:
            vl += [''] * (fnameToSize[k] - len(vl))
        if len(vl) > fnameToSize[k]:
            vl = vl[:fnameToSize[k]]
        for i, w in enumerate(vl):
            usage[f'{k}_{i}'] = w
        del usage[k]
    return usage


In [92]:


cnt = 0
cntu = 0
list_of_dfs_train = []
list_of_dfs_test = []
list_of_dfs_valid = []
group_id = 0
for filename in glob.glob(root_dir + '**/*.json', recursive=True):
    enclosingClassName = filename.rpartition('/')[0].rpartition('/')[2]
    if enclosingClassName not in train_test_classes:
        continue
    with open(os.path.join(os.getcwd(), filename), 'r') as f:
        data = json.load(f)
        data_usages = []
        for usage in data["usages"]:
            if not good(usage):
                continue
            usage = pd.json_normalize(usage).to_dict(orient='records')[0]
            usage['features.enclosingClassName'] = enclosingClassName
            usage['correct'] = 1
            usage['firstChar'] = usage['name'].rpartition('.')[2][0]
            usage['group'] = group_id
            group_id += 1
            if group_id == 1:
                print(namesToCats(usage))
            data_usages.append(namesToCats(usage))

            if enclosingClassName in trainClasses:
                negatives = 8
            else:
                negatives = 25

            for i in range(negatives):
                usage_negative = copy.deepcopy(usage)
                while True:
                    new_name = random.choice(firstCharToNameListSampling[usage['firstChar']])  #?
                    if usage['name'] != new_name:
                        usage_negative['name'] = new_name
                        usage_negative['correct'] = 0
                        break
                data_usages.append(namesToCats(usage_negative))

        df = pd.json_normalize(data_usages)


        df = df.drop(columns=['filePath', 'textOffset', 'features.lineInFile'])
        if enclosingClassName in trainClasses:
            list_of_dfs_train.append(df)
        if enclosingClassName in testClasses:
            list_of_dfs_test.append(df)
        if enclosingClassName in validClasses:
            list_of_dfs_valid.append(df)
        cntu += len(data_usages)
        cnt += 1

df_train_cat = pd.concat(list_of_dfs_train, axis=0, ignore_index=True)
df_test_cat = pd.concat(list_of_dfs_test, axis=0, ignore_index=True)
df_valid_cat = pd.concat(list_of_dfs_valid, axis=0, ignore_index=True)

# listToStr(df_train_raw, 'features.variablesTypesInScope')
# listToStr(df_test_raw, 'features.variablesTypesInScope')
# listToStr(df_valid_raw, 'features.variablesTypesInScope')
df_train_cat.fillna('', inplace=True)
df_test_cat.fillna('', inplace=True)
df_valid_cat.fillna('', inplace=True)

print(f'{cnt} files loaded, {cntu} usages')

print(f'test shape {df_test_cat.shape}')
print(f'train shape {df_train_cat.shape}')
print(f'valid shape {df_valid_cat.shape}')
df_train_cat.dtypes

# will we know referenceType irl ?

{'filePath': 'platform/xdebugger-impl/src/com/intellij/xdebugger/impl/ui/tree/actions/XJumpToTypeSourceAction.java', 'textOffset': 958, 'features.referenceType': 'EXTENDS_LIST', 'features.lineInFile': 23, 'correct': 1, 'firstChar': 'X', 'group': 0, 'name_0': 'Base', 'name_1': 'Action', 'name_2': 'Source', 'name_3': 'To', 'name_4': 'Jump', 'features.enclosingClassName_0': 'Action', 'features.enclosingClassName_1': 'Source', 'features.enclosingClassName_2': 'Type', 'features.enclosingClassName_3': 'To', 'features.enclosingClassName_4': 'Jump'}
2810 files loaded, 611400 usages
test shape (224562, 38)
train shape (267732, 38)
valid shape (119106, 38)


features.referenceType               object
correct                               int64
firstChar                            object
group                                 int64
name_0                               object
name_1                               object
name_2                               object
name_3                               object
name_4                               object
features.enclosingClassName_0        object
features.enclosingClassName_1        object
features.enclosingClassName_2        object
features.enclosingClassName_3        object
features.enclosingClassName_4        object
features.scopeKind                   object
features.enclosingScopeName_0        object
features.enclosingScopeName_1        object
features.enclosingScopeName_2        object
features.enclosingScopeName_3        object
features.enclosingScopeName_4        object
features.variablesTypesInScope_0     object
features.variablesTypesInScope_1     object
features.variablesTypesInScope_2

In [93]:
print(f'train0 ########## ------> :\n {df_train_cat.loc[[10]]}')
print(f'test0  ########## ------> :\n {df_test_cat.loc[[10]]}')
print(f'valid0  ########## ------> :\n {df_valid_cat.loc[[10]]}')

train0 ########## ------> :
    features.referenceType  correct firstChar  group  name_0 name_1 name_2  \
10                   TYPE        0         X      1  Bundle    Dom    Xml   

   name_3 name_4 features.enclosingClassName_0  ...  \
10                                      Action  ...   

   features.variablesTypesInScope_8 features.variablesTypesInScope_9  \
10                                                                     

   features.variablesTypesInScope_10 features.variablesTypesInScope_11  \
10                                                                       

   features.variablesTypesInScope_12 features.variablesTypesInScope_13  \
10                                                                       

   features.variablesTypesInScope_14 features.insideStatement_0  \
10                                                                

   features.insideStatement_1 features.insideStatement_2  
10                                                        

[1 rows 

In [94]:
df_train_cat.to_csv('train_cat.csv')

df_test_cat.to_csv('test_cat.csv')

df_valid_cat.to_csv('valid_cat.csv')

In [95]:
df_train_cat.columns.tolist()

['features.referenceType',
 'correct',
 'firstChar',
 'group',
 'name_0',
 'name_1',
 'name_2',
 'name_3',
 'name_4',
 'features.enclosingClassName_0',
 'features.enclosingClassName_1',
 'features.enclosingClassName_2',
 'features.enclosingClassName_3',
 'features.enclosingClassName_4',
 'features.scopeKind',
 'features.enclosingScopeName_0',
 'features.enclosingScopeName_1',
 'features.enclosingScopeName_2',
 'features.enclosingScopeName_3',
 'features.enclosingScopeName_4',
 'features.variablesTypesInScope_0',
 'features.variablesTypesInScope_1',
 'features.variablesTypesInScope_2',
 'features.variablesTypesInScope_3',
 'features.variablesTypesInScope_4',
 'features.variablesTypesInScope_5',
 'features.variablesTypesInScope_6',
 'features.variablesTypesInScope_7',
 'features.variablesTypesInScope_8',
 'features.variablesTypesInScope_9',
 'features.variablesTypesInScope_10',
 'features.variablesTypesInScope_11',
 'features.variablesTypesInScope_12',
 'features.variablesTypesInScope_13