In [22]:
import copy
import json
import glob
import os
import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
from collections import Counter
import math
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re


In [2]:
# Directory that holds the per-class JSON dumps. It can be overridden with the
# CLASS_RANKING_ROOT_DIR environment variable so the notebook is not tied to a
# single machine; without the variable the original location is used.
DEFAULT_ROOT_DIR = (
    "/home/kirill/Documents/1.Projects/"
    "class-ranking/intellij-community/"
    "project-processing-results/processing/"
    "java/classes/processing/0.0.1/"
)
# os.path.join(path, "") appends a trailing separator only when it is missing;
# the loading cells build their glob pattern as root_dir + '**/*.json'.
root_dir = os.path.join(os.environ.get("CLASS_RANKING_ROOT_DIR") or DEFAULT_ROOT_DIR, "")

cnt = 0  # number of JSON files loaded so far
individualUsages = {}  # {className -> Counter{usedClassName -> count}}
graph = {}  # {className -> set of class names that directly extend / implement it}

def good(usage) -> bool:
    """Tell whether a usage record should be counted.

    A usage is rejected when ``usage["features"]["referenceType"]`` is
    ``"ANNOTATION"`` or ``"DOC_REFERENCE_HOLDER"``; any other value is accepted.
    """
    # Reference types listed by the original author:
    # {'TYPE', 'THIS_EXPRESSION', 'EXTENDS_LIST', 'IMPLEMENTS_LIST',
    #  'METHOD_REF_EXPRESSION', 'EXPRESSION_LIST', 'NEW_EXPRESSION',
    #  'CONDITIONAL_EXPRESSION', 'FIELD', 'ANNOTATION', 'THROWS_LIST',
    #  'SUPER_EXPRESSION', 'JAVA_CODE_REFERENCE', 'REFERENCE_EXPRESSION',
    #  'DOC_REFERENCE_HOLDER'}
    ignored_reference_types = ("ANNOTATION", "DOC_REFERENCE_HOLDER")
    return usage["features"]["referenceType"] not in ignored_reference_types

def addToUsages(data, individualUsages):
    """Record how often each name is used inside one class.

    ``data`` is one parsed JSON record. Every entry of ``data['usages']`` that
    passes ``good`` contributes one count for its ``'name'``. The resulting
    Counter is stored in ``individualUsages`` under ``data["keyInfo"]["name"]``,
    replacing any value already stored for that key.
    """
    owner = data["keyInfo"]["name"]
    individualUsages[owner] = Counter(
        usage['name'] for usage in data['usages'] if good(usage)
    )

def addToGraph(data, graph):
    """Add one class and its direct supertypes to the inheritance graph.

    ``graph`` maps a class name to the set of names that directly extend or
    implement it. The class described by ``data`` always gets an entry (an
    empty set if it had none), and its name is added to the set of every name
    in its extends and implements lists.
    """
    key_info = data["keyInfo"]
    child = key_info["name"]
    additional = key_info["additionalInfo"]
    superclasses = additional["enclosingClassExtendsList"]
    interfaces = additional["enclosingClassImplementsList"]

    graph.setdefault(child, set())
    for supertype in superclasses + interfaces:
        graph.setdefault(supertype, set()).add(child)


# Load every JSON record under root_dir, counting the usages made inside each
# class and registering the class in the inheritance graph.
print('load files, build a graph, count indevidual usages ')
for filename in glob.glob(root_dir + '**/*.json', recursive=True):
    #print(f'name {filename}')
    # Read with an explicit encoding instead of the platform default: JSON
    # interchange text is UTF-8. glob already yields paths usable as-is, so no
    # join with the working directory is needed.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The file is closed at this point; the remaining work only needs `data`.
    addToUsages(data, individualUsages)
    addToGraph(data, graph)
    cnt += 1
    if cnt % 10**4 == 0:
        # Progress report every 10 000 files.
        print(f'{cnt} files loaded, for current file: '
              f'direct_usages = {len(individualUsages[data["keyInfo"]["name"]])} | '
              f'direct_children = {len(graph[data["keyInfo"]["name"]])} ')
print(f'{cnt} files processed ')

load files, build a graph, count indevidual usages 
10000 files loaded, for current file: direct_usages = 6 | direct_children = 0 
20000 files loaded, for current file: direct_usages = 10 | direct_children = 0 
30000 files loaded, for current file: direct_usages = 2 | direct_children = 0 
40000 files loaded, for current file: direct_usages = 5 | direct_children = 0 
50000 files loaded, for current file: direct_usages = 2 | direct_children = 0 
53265 files processed 


In [3]:
familyUsages = {}  # className -> Counter of usages made by the class and all of its descendants
family = {}  # className -> set containing the class itself and all of its descendants
parent = {}  # className -> class from which dfs first reached it


def dfs(v):
    """Fill ``family`` and ``familyUsages`` for ``v`` and its descendants.

    ``graph[v]`` holds the direct children of ``v``. Children that have no
    ``family`` entry yet are visited recursively, and ``parent`` records the
    class they were first reached from.

    The usage total is summed once per member of the de-duplicated family set.
    Adding up the children's already-aggregated totals would count a class
    several times whenever it is reachable through more than one child, e.g.
    a class that extends ``B`` and implements ``I`` while ``B`` implements
    ``I`` as well.

    Raises ``KeyError`` if ``v`` or a descendant is missing from ``graph`` or
    ``individualUsages``. The stored per-class counters are not modified.
    """
    members = {v}
    for u in graph[v]:
        if u not in family:
            parent[u] = v
            dfs(u)
        members.update(family[u])

    total_usages = Counter()
    for member in members:
        # Counter.update adds counts, so each member contributes exactly once.
        total_usages.update(individualUsages[member])
    familyUsages[v], family[v] = total_usages, members


# Aggregate every class that has its own usage record, reporting progress
# each `progress_step` classes.
progress_step = 10**4
v_count = 0
for v in individualUsages:
    if v not in familyUsages:
        dfs(v)
    v_count += 1
    if v_count % progress_step:
        continue
    print(f'{v_count} vertexes evalueted | in {v} '
          f'used {len(familyUsages[v])} classes, '
          f'{len(family[v])} children')

10000 vertexes evalueted | in com.intellij.openapi.externalSystem.model.execution.ExternalSystemTaskExecutionSettings used 6 classes, 1 children
20000 vertexes evalueted | in com.intellij.codeInspection.SimplifyCollectorInspection used 10 classes, 1 children
30000 vertexes evalueted | in com.intellij.refactoring.changeSignature.ChangeSignatureParameterUsageInfo used 2 classes, 1 children
40000 vertexes evalueted | in com.intellij.structuralsearch.impl.matcher.MatchResultImpl used 5 classes, 1 children
50000 vertexes evalueted | in com.intellij.execution.actions.ChooseDebugConfigurationPopupAction used 2 classes, 1 children


In [58]:

classNameSet = family['com.intellij.openapi.actionSystem.AnAction']
cnt = 0


def _usagesToFrame(data):
    """Turn the usages of one parsed JSON record into a flat DataFrame.

    The frame has one row per entry of ``data["usages"]``, plus an
    ``enclosingClassName`` column holding ``data["keyInfo"]["name"]``. The
    ``filePath``, ``textOffset`` and ``features.lineInFile`` columns are
    removed when present. ``features.variablesTypesInScope`` is always a
    space-joined string: missing or null values become ``""``, and the column
    is created when the record does not have it, so every frame treats it the
    same way.
    """
    types_column = 'features.variablesTypesInScope'
    frame = pd.json_normalize(data["usages"])
    frame['enclosingClassName'] = data["keyInfo"]["name"]
    # errors='ignore': a record with an empty usages list (or without one of
    # these fields) has no such columns and must not abort the whole load.
    frame = frame.drop(columns=['filePath', 'textOffset', 'features.lineInFile'],
                       errors='ignore')
    if types_column in frame.columns:
        frame[types_column] = frame[types_column].apply(
            lambda types: " ".join(types) if isinstance(types, (list, tuple)) else ""
        )
    else:
        frame[types_column] = ""
    return frame


list_of_dfs = []
for filename in glob.glob(root_dir + '**/*.json', recursive=True):
    # Explicit encoding: do not depend on the platform default when reading JSON.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Keep only classes that belong to the AnAction family.
    if data["keyInfo"]["name"] not in classNameSet:
        continue

    df = _usagesToFrame(data)
    list_of_dfs.append(df)
    cnt += 1

raw_df = pd.concat(list_of_dfs, axis=0, ignore_index=True)
print(f'{cnt} files loaded ')


raw_df.dtypes

2810 files loaded 


name                              object
features.referenceType            object
features.scopeKind                object
features.enclosingScopeName       object
enclosingClassName                object
features.variablesTypesInScope    object
features.insideStatement_0        object
features.insideStatement_1        object
features.insideStatement_2        object
dtype: object

In [59]:
# Show the row labelled 10 as a one-row frame to eyeball the parsed columns.
print('------> : {}'.format(raw_df.loc[[10]]))

------> :                             name features.referenceType features.scopeKind  \
10  com.intellij.ide.CutProvider                   TYPE             Method   

   features.enclosingScopeName                  enclosingClassName  \
10     getAvailableCutProvider  com.intellij.ide.actions.CutAction   

   features.variablesTypesInScope features.insideStatement_0  \
10                                                       NaN   

   features.insideStatement_1 features.insideStatement_2  
10                        NaN                        NaN  


In [51]:
def splitCamelCase(name: str):
    """Split space-separated, possibly dotted names into camel-case words.

    Each space-separated chunk is stripped of leading and trailing dots and
    reduced to the part after its last remaining dot. That part is then cut in
    front of every run of capitals and in front of every capital followed by
    lower-case letters. The words of all chunks are returned as one flat list.
    """
    words = []
    for chunk in name.split(' '):
        simple_name = chunk.strip('.').rpartition('.')[2]
        # First pass separates runs of capitals, second pass detaches a
        # trailing "Capitalised" word from such a run (e.g. "HTML" | "Parser").
        spaced = re.sub('([A-Z]+)', r' \1', simple_name)
        spaced = re.sub('([A-Z][a-z]+)', r' \1', spaced)
        words.extend(spaced.split())
    return words

In [52]:
# Manual check: the dotted prefixes and stray dots are dropped and only the
# camel-case words of each last name part remain.
splitCamelCase('aa.bbKe... ....cc.ddLo')

['bb', 'Ke', 'dd', 'Lo']

In [55]:
print(raw_df.dtypes)
# Bag-of-words over the camel-case words of the enclosing scope names, keeping
# the 100 most frequent words and preserving their case.
vectorizer = CountVectorizer(tokenizer=splitCamelCase, max_features=100, lowercase=False)
# Replace missing scope names with '' before converting to str; otherwise a
# NaN is stringified and counted as the word 'nan'.
scope_names = raw_df['features.enclosingScopeName'].fillna('').astype(str).values
term_doc_matrix = vectorizer.fit_transform(scope_names)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0); fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(vectorizer.vocabulary_)

# Slice the sparse matrix first so only the five displayed rows are densified.
print(term_doc_matrix[5:10].toarray())

name                              object
features.referenceType            object
features.scopeKind                object
features.enclosingScopeName       object
enclosingClassName                object
features.variablesTypesInScope    object
features.insideStatement_0        object
features.insideStatement_1        object
features.insideStatement_2        object
dtype: object
['<clinit>', '<init>', 'Action', 'Actions', 'Add', 'And', 'At', 'Available', 'Branch', 'Button', 'Caret', 'Changes', 'Children', 'Class', 'Component', 'Content', 'Context', 'Create', 'Current', 'Custom', 'Data', 'Dialog', 'Diff', 'Directory', 'Editor', 'Element', 'Elements', 'Enabled', 'File', 'Files', 'Fix', 'For', 'Frame', 'From', 'Group', 'Handler', 'Impl', 'In', 'Items', 'Line', 'Model', 'Module', 'Name', 'New', 'Nodes', 'On', 'Or', 'Panel', 'Perform', 'Performed', 'Popup', 'Presentation', 'Preview', 'Project', 'Psi', 'Roots', 'Run', 'Selected', 'Shortcut', 'Target', 'Template', 'Text', 'To', 'Tool', 'Type



In [5]:
# Toy example: one-hot encode a single categorical column.
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c']})
print(df)

# Build the indicator columns for B, then put them in place of the original column.
one_hot = pd.get_dummies(df['B'])
df = df.drop(columns='B').join(one_hot)

print(df)

   A  B
0  a  b
1  b  a
2  a  c
   A  a  b  c
0  a  0  1  0
1  b  1  0  0
2  a  0  0  1


In [11]:
# Toy example: split each phrase on single spaces into a list of tokens
# (a trailing space therefore yields a trailing empty token).
df = pd.DataFrame({'frases': ['Do not let', 'without having been ', 'Do no', 'We are']})
df['tokenized'] = df['frases'].map(lambda phrase: phrase.split(' '))

print(df)

                 frases                  tokenized
0            Do not let             [Do, not, let]
1  without having been   [without, having, been, ]
2                 Do no                   [Do, no]
3                We are                  [We, are]
