In [190]:
import copy
import json
import glob
import os
import random

import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
from collections import Counter
import math
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re
from random import shuffle
import random


In [191]:
# Directory that holds the per-class JSON files produced by the processing step.
root_dir = (
    "/home/kirill/Documents/1.Projects/"
    "class-ranking/intellij-community/"
    "project-processing-results/processing/"
    "java/classes/processing/0.0.1/"
)

# Inheritance graph: class name -> set of classes that directly extend or
# implement it (filled by addToGraph).
graph = {}
# Per-class usage counts: enclosing class name -> Counter of used class names
# (filled by addToUsages).
individualUsages = {}
# Number of JSON files processed so far.
cnt = 0

def good(usage) -> bool:
    """Tell whether a usage record should be kept.

    Usages whose ``features.referenceType`` is ``ANNOTATION`` or
    ``DOC_REFERENCE_HOLDER`` are rejected; every other reference type is
    accepted.

    Reference types observed in the data:
    {'TYPE', 'THIS_EXPRESSION', 'EXTENDS_LIST', 'IMPLEMENTS_LIST', 'METHOD_REF_EXPRESSION', 'EXPRESSION_LIST', 'NEW_EXPRESSION', 'CONDITIONAL_EXPRESSION', 'FIELD', 'ANNOTATION', 'THROWS_LIST', 'SUPER_EXPRESSION', 'JAVA_CODE_REFERENCE', 'REFERENCE_EXPRESSION', 'DOC_REFERENCE_HOLDER'}
    """
    rejected_types = ("ANNOTATION", "DOC_REFERENCE_HOLDER")
    return usage["features"]["referenceType"] not in rejected_types

def addToUsages(data, individualUsages):
    """Record how often each class name is used inside one enclosing class.

    :param data: parsed JSON of one class; reads ``data["keyInfo"]["name"]``
        and the ``name`` of every entry of ``data["usages"]``.
    :param individualUsages: dict updated in place; the entry for the
        enclosing class is replaced by a fresh Counter of the usages that
        pass ``good``.
    """
    owner = data["keyInfo"]["name"]
    individualUsages[owner] = Counter(
        usage['name'] for usage in data['usages'] if good(usage)
    )

def addToGraph(data, graph):
    """Register one class and its direct supertypes in the inheritance graph.

    The graph maps a class name to the set of classes that directly extend or
    implement it, so edges point from parent to child.

    :param data: parsed JSON of one class; reads ``keyInfo.name`` and the
        ``enclosingClassExtendsList`` / ``enclosingClassImplementsList``
        entries of ``keyInfo.additionalInfo``.
    :param graph: dict updated in place; the class itself and each of its
        parents get a node, and the class is added to every parent's set.
    """
    key_info = data["keyInfo"]
    child = key_info["name"]
    extra = key_info["additionalInfo"]
    parents = extra["enclosingClassExtendsList"] + extra["enclosingClassImplementsList"]

    # The class gets a node even when nothing inherits from it.
    graph.setdefault(child, set())
    for parent_name in parents:
        graph.setdefault(parent_name, set()).add(child)


# Walk every JSON file under root_dir once, filling individualUsages and graph.
print('load files, build a graph, count indevidual usages ')
for filename in glob.glob(root_dir + '**/*.json', recursive=True):
    # JSON text is UTF-8; relying on the locale default encoding breaks on
    # systems whose default is something else. glob already returns a path
    # that can be opened as is, so no join with the working directory is needed.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The file is closed here; the parsed data is all that is needed below.
    addToUsages(data, individualUsages)
    addToGraph(data, graph)
    cnt += 1
    if cnt % 10**4 == 0:
        # Periodic progress line with the stats of the file just processed.
        print(f'{cnt} files loaded, for current file: direct_usages = {len(individualUsages[data["keyInfo"]["name"]])} | direct_children = {len(graph[data["keyInfo"]["name"]])} ')
print(f'{cnt} files processed ')

load files, build a graph, count indevidual usages 
10000 files loaded, for current file: direct_usages = 6 | direct_children = 0 
20000 files loaded, for current file: direct_usages = 10 | direct_children = 0 
30000 files loaded, for current file: direct_usages = 2 | direct_children = 0 
40000 files loaded, for current file: direct_usages = 5 | direct_children = 0 
50000 files loaded, for current file: direct_usages = 2 | direct_children = 0 
53265 files processed 


In [192]:
familyUsages = {}  # className -> Counter of usages in the class and all its descendants
family = {}  # className -> set with the class itself and all its descendants
parent = {}  # className -> the class from which it was first reached during traversal
def dfs(v):
    """Compute the family set and aggregated usage counts of class ``v``.

    Recursively visits every not-yet-processed child of ``v`` in ``graph`` and
    then stores the results in the module-level ``family`` and
    ``familyUsages`` dictionaries. ``parent`` records, for each child that is
    visited for the first time, the class it was reached from.

    The usage counter is built from the deduplicated family set rather than by
    adding the children's aggregated counters together: a descendant that is
    reachable through several inheritance paths (for example a class that
    extends ``B`` and implements ``X`` while ``B`` also implements ``X``)
    would otherwise contribute its usages once per path.

    :param v: class name; must be a key of ``graph`` and ``individualUsages``.
    """
    cur_family_set = {v}
    for u in graph[v]:
        if u not in family:
            parent[u] = v
            dfs(u)
        cur_family_set.update(family[u])

    # Each family member contributes its own direct usages exactly once.
    # Sorting keeps the counter's insertion order independent of set ordering.
    cur_usages = Counter()
    for member in sorted(cur_family_set):
        cur_usages.update(individualUsages[member])

    familyUsages[v], family[v] = cur_usages, cur_family_set


# Make sure every loaded class has its family and family usages computed;
# classes already reached as somebody's descendant are not traversed again.
v_count = 0
for v_count, v in enumerate(individualUsages, start=1):
    if v not in familyUsages:
        dfs(v)
    if v_count % 10**4:
        continue
    # Progress line for every 10000th class.
    print(f'{v_count} vertexes evalueted | in {v} used {len(familyUsages[v])} classes, {len(family[v])} children')

10000 vertexes evalueted | in com.intellij.openapi.externalSystem.model.execution.ExternalSystemTaskExecutionSettings used 6 classes, 1 children
20000 vertexes evalueted | in com.intellij.codeInspection.SimplifyCollectorInspection used 10 classes, 1 children
30000 vertexes evalueted | in com.intellij.refactoring.changeSignature.ChangeSignatureParameterUsageInfo used 2 classes, 1 children
40000 vertexes evalueted | in com.intellij.structuralsearch.impl.matcher.MatchResultImpl used 5 classes, 1 children
50000 vertexes evalueted | in com.intellij.execution.actions.ChooseDebugConfigurationPopupAction used 2 classes, 1 children


In [193]:
# Fully qualified name of the class whose aggregated family usages
# (familyUsages[ROOT_CLASS]) are used to build the randomNames pool.
ROOT_CLASS = "com.intellij.openapi.actionSystem.AnAction"

In [194]:
# Pool of class names grouped by the first character of the simple class name.
# Every name is repeated as many times as it is used within the ROOT_CLASS
# family, so a uniform pick from a pool is weighted by usage frequency.
randomNames = {}
for k, v in familyUsages[ROOT_CLASS].most_common():
    first_char = k.rpartition('.')[2][0]
    randomNames.setdefault(first_char, []).extend([k] * v)

# Dump the pools (still in most-common order) for inspection.
with open('randomNames.txt', 'w') as convert_file:
    convert_file.write(json.dumps(randomNames))

# Shuffle each pool in place.
for ch, names in randomNames.items():
    shuffle(names)

In [195]:
def listToStr(df, col):
    """Turn a column of string lists into space-separated strings, in place.

    Missing cells (``None`` / ``NaN``) become the empty string. Cells that are
    already strings are left untouched, so running the function twice on the
    same frame does not split the text into single characters.

    :param df: DataFrame to modify in place.
    :param col: name of the column holding lists of strings.
    :return: None; ``df[col]`` is overwritten.
    """
    def _join(value):
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            return " ".join(value)
        # Only scalars can be "missing"; pd.isna on a container would return
        # an array instead of a single bool.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return " ".join(value)

    # Single pass; avoids assigning list objects through .loc, which depends
    # on pandas aligning a Series of lists against a boolean mask.
    df[col] = df[col].apply(_join)

In [196]:
# Randomly split the classes of the ROOT_CLASS family 80/20 into train and test.
train_test_classes = family[ROOT_CLASS]
train_test_list = list(train_test_classes)
shuffle(train_test_list)
split_at = int(0.8 * len(train_test_list))
trainClasses = set(train_test_list[:split_at])
testClasses = set(train_test_list[split_at:])

# Distinct candidate names per first character. Used to detect pools from
# which no name different from the real one can ever be drawn.
distinct_names = {ch: set(names) for ch, names in randomNames.items()}

cnt = 0   # files that belong to the family
cntu = 0  # usage rows produced (positives and negatives)
list_of_dfs_train = []
list_of_dfs_test = []
for filename in glob.glob(root_dir + '**/*.json', recursive=True):
    # JSON text is UTF-8; do not depend on the locale default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    enclosingClassName = data["keyInfo"]["name"]
    if enclosingClassName not in train_test_classes:
        continue

    data_usages = []
    for usage in data["usages"]:
        if not good(usage):
            continue
        cur_name = usage['name']
        first_char = cur_name.rpartition('.')[2][0]

        # Positive sample: the usage as it appears in the code.
        usage['correct'] = 1
        usage['firstChar'] = first_char
        data_usages.append(usage)

        # Negative samples need a different name with the same first
        # character. If the pool is missing or holds only the real name,
        # rejection sampling would loop forever, so no negatives are made.
        pool_names = distinct_names.get(first_char, set())
        if not pool_names or pool_names == {cur_name}:
            continue

        # Four negatives per positive: same features, wrong class name.
        for _ in range(4):
            usage_negative = copy.deepcopy(usage)
            new_name = random.choice(randomNames[first_char])
            while new_name == cur_name:
                new_name = random.choice(randomNames[first_char])
            usage_negative['firstChar'] = first_char
            usage_negative['name'] = new_name
            usage_negative['correct'] = 0
            data_usages.append(usage_negative)

    cnt += 1
    if not data_usages:
        # No accepted usages: the normalized frame would have no columns and
        # the drop below would raise KeyError.
        continue

    df = pd.json_normalize(data_usages)
    df['features.enclosingClassName'] = enclosingClassName
    # Location-only fields are not used as features.
    df = df.drop(columns=['filePath', 'textOffset', 'features.lineInFile'])
    if enclosingClassName in trainClasses:
        list_of_dfs_train.append(df)
    if enclosingClassName in testClasses:
        list_of_dfs_test.append(df)
    cntu += len(data_usages)

df_train_raw = pd.concat(list_of_dfs_train, axis=0, ignore_index=True)
df_test_raw = pd.concat(list_of_dfs_test, axis=0, ignore_index=True)

# The scope variable types arrive as lists; flatten them to plain strings.
listToStr(df_train_raw, 'features.variablesTypesInScope')
listToStr(df_test_raw, 'features.variablesTypesInScope')

print(f'{cnt} files loaded, {cntu} usages')

print(f'test shape {df_test_raw.shape}')
print(f'train shape {df_train_raw.shape}')
df_train_raw.dtypes

2810 files loaded, 214830 usages
test shape (45635, 11)
train shape (169195, 11)


name                              object
correct                            int64
firstChar                         object
features.referenceType            object
features.scopeKind                object
features.enclosingScopeName       object
features.enclosingClassName       object
features.variablesTypesInScope    object
features.insideStatement_0        object
features.insideStatement_1        object
features.insideStatement_2        object
dtype: object

In [197]:
def splitCamelCase(name: str):
    """Tokenize space-separated (qualified) names into camel-case words.

    For every space-separated item only the part after the last dot is kept
    (leading and trailing dots are stripped first). That simple name is then
    split before each run of capital letters and before each capitalized
    word, e.g. ``"a.b.HTMLParser"`` -> ``["HTML", "Parser"]``.

    :param name: one or more names separated by single spaces.
    :return: flat list of tokens in order of appearance.
    """
    tokens = []
    for qualified in name.split(' '):
        simple = qualified.strip('.').rpartition('.')[2]
        # First put a space before every run of capitals ...
        caps_spaced = re.sub('([A-Z]+)', r' \1', simple)
        # ... then before every capitalized word, which detaches an acronym
        # from the word that follows it.
        words_spaced = re.sub('([A-Z][a-z]+)', r' \1', caps_spaced)
        tokens.extend(words_spaced.split())
    return tokens

In [198]:
def str_to_OHE_feature(df_train, df_test, col, pref):
    """Replace a text column by bag-of-words count columns.

    Despite the name, the new columns hold token counts (CountVectorizer),
    not 0/1 indicators. The vocabulary (at most 80 tokens produced by
    ``splitCamelCase``) is learned from the training column only and then
    applied unchanged to the test column, so both frames get the same new
    columns.

    :param df_train: training frame; ``col`` is used to fit the vocabulary.
    :param df_test: test frame; transformed with the training vocabulary.
    :param col: name of the text column to expand; it is removed from the
        returned frames.
    :param pref: prefix added to every generated column name.
    :return: ``(df_train, df_test)`` as new frames; the inputs are not modified.
    """
    vectorizer = CountVectorizer(tokenizer=splitCamelCase, lowercase=False, max_features=80)

    # astype('str') turns missing values into the literal text 'nan'.
    train_texts = df_train[col].values.astype('str')
    test_texts = df_test[col].values.astype('str')

    term_doc_matrix_train = vectorizer.fit_transform(train_texts)
    # Reuse the fitted vectorizer instead of fitting a second one on test data.
    term_doc_matrix_test = vectorizer.transform(test_texts)

    feature_names = vectorizer.get_feature_names_out()
    # Carry over the source index so the column-wise concat below pairs rows
    # by position even when the inputs do not have a default RangeIndex.
    df_train_bow = pd.DataFrame(
        term_doc_matrix_train.toarray(), columns=feature_names, index=df_train.index
    ).add_prefix(pref)
    df_test_bow = pd.DataFrame(
        term_doc_matrix_test.toarray(), columns=feature_names, index=df_test.index
    ).add_prefix(pref)

    df_train = pd.concat([df_train.drop(columns=[col]), df_train_bow], axis=1)
    df_test = pd.concat([df_test.drop(columns=[col]), df_test_bow], axis=1)

    return df_train, df_test

In [199]:
# Expand every textual column into bag-of-words count features.
df_train, df_test = df_train_raw, df_test_raw
print(f'train0 ------> : {df_train.loc[[10]]}')
print(f'test0 ------> : {df_test.loc[[10]]}')
# Each column is expanded exactly once: str_to_OHE_feature drops the source
# column, so a second call for the same column raises KeyError.
df_train, df_test = str_to_OHE_feature(df_train, df_test, 'name', pref='n_')
df_train, df_test = str_to_OHE_feature(df_train, df_test, 'features.enclosingScopeName', pref='e.s.n_')
df_train, df_test = str_to_OHE_feature(df_train, df_test, 'features.enclosingClassName', pref='e.c.n_')
df_train, df_test = str_to_OHE_feature(df_train, df_test, 'features.variablesTypesInScope', pref='v.t.s_')

# Columns not expanded here:
# firstChar,features.referenceType,features.scopeKind,features.insideStatement_0,features.insideStatement_1,features.insideStatement_2

print(f'train2 ------> : {df_train.loc[[10]]}')
print(f'test2 ------> : {df_test.loc[[10]]}')


train0 ------> :                                          name  correct firstChar  \
10  com.intellij.xdebugger.frame.XNavigatable        1         X   

   features.referenceType features.scopeKind   features.enclosingScopeName  \
10                   TYPE             Method  startComputingSourcePosition   

                          features.enclosingClassName  \
10  com.intellij.xdebugger.impl.ui.tree.actions.XJ...   

   features.variablesTypesInScope features.insideStatement_0  \
10                                                       NaN   

   features.insideStatement_1 features.insideStatement_2  
10                        NaN                        NaN  
test0 ------> :                                   name  correct firstChar  \
10  com.intellij.openapi.editor.Editor        1         E   

   features.referenceType features.scopeKind features.enclosingScopeName  \
10                   TYPE             Method             actionPerformed   

   features.variablesTypesInScope f

In [201]:
# Export the processed and raw frames. Missing values in the processed
# frames are written as empty strings.
df_train.fillna('', inplace=True)
df_train.to_csv('train.csv')

df_train_raw.to_csv('train_raw.csv')

# The test export must use the test frame; writing df_train here would make
# test.csv a copy of train.csv.
df_test.fillna('', inplace=True)
df_test.to_csv('test.csv')

df_test_raw.to_csv('test_raw.csv')