# Concept Extraction for XMLEditor

In [1]:
import pandas as pd

df_info = pd.read_csv('xml_editor.csv', header=None, encoding='utf-8')
df_info = df_info.rename(columns={0:'project',1:'module_ID',2:'class_ID',3:'class_name',4:'class_path'})

## Only extract Module 1

In [2]:
df_info = df_info[df_info['module_ID'] == 1]

In [3]:
df_info.head()

Unnamed: 0,project,module_ID,class_ID,class_name,class_path
0,XMLEditor,1,33,ctietze.xmleditor.gui.editor.EditorWindow,/XMLEditor/ctietze/xmleditor/gui/editor/Editor...
1,XMLEditor,1,17,ctietze.xmleditor.actions.AbstractEditorAction,/XMLEditor/ctietze/xmleditor/actions/AbstractE...
2,XMLEditor,1,16,ctietze.xmleditor.actions.AbstractEditMenuAction,/XMLEditor/ctietze/xmleditor/actions/AbstractE...
3,XMLEditor,1,30,ctietze.xmleditor.actions.SaveAsAction,/XMLEditor/ctietze/xmleditor/actions/SaveAsAct...
4,XMLEditor,1,36,ctietze.xmleditor.Resources,/XMLEditor/ctietze/xmleditor/Resources.java


In [4]:
print('No. of classes: {}'.format(df_info.shape[0]))

No. of classes: 22


## Class ID / names

In [5]:
class_names = list(df_info.class_name)

In [6]:
class_names = [x.split('.')[-1] for x in class_names]

In [7]:
class_id = list(df_info.class_ID)

## Extraction

In [8]:
import spacy

nlp = spacy.load('en_core_web_md')
nlp.max_length = 4000000

In [9]:
import code_extract as ce
import re

kw = ce.generate_kw()
kw_pattern = re.compile(r'\b(' + r'|'.join(kw) + r')\b\s*')

extracted = []

In [10]:
classes = list(df_info.class_path)
classes = [x[1:] for x in classes]

In [11]:
for class_path in classes:
    extracted.append(ce.code_ingest(class_path, kw_pattern, nlp))

## Output

In [12]:
import numpy as np
data = np.transpose([class_id, class_names, extracted])

In [13]:
df = pd.DataFrame(data, columns = ['id', 'name', 'terms'])

In [14]:
# df.to_csv('xmleditor_terms_module_1.csv', encoding='utf-8', index=False)

In [15]:
display(df)

Unnamed: 0,id,name,terms
0,33,EditorWindow,editor window frame window title xml editor di...
1,17,AbstractEditorAction,editor action action editor window editor wind...
2,16,AbstractEditMenuAction,edit menu action editor action tree selection ...
3,30,SaveAsAction,save action editor action tree model listener ...
4,36,Resources,resource resource bundle resource bundle resou...
5,18,AbstractUnsavedChangesAction,unsaved changes action save action save docume...
6,39,XMLDocument,xml document xml header xml node root node cho...
7,29,SaveAction,save action save action action action tooltip ...
8,19,AddAttributeAction,add attribute action edit menu action action a...
9,20,AddChildNodeAction,add child node action edit menu action action ...


In [24]:
terms_output = ' '.join(list(df.terms))

In [25]:
terms_set = ' '.join(set(terms_output.split()))

In [26]:
terms_set

'unsaved already fit collapse setup press url gain ask frame editor perform resource image adapter listener tag model type header selection expand create close add item document node path tooltip dummy value empty xml component insert select dimension root cell localization pane write last exception chooser icon text enable release attrib remove attribute accept bundle show delete focus quit filename error find renderer top lose event dialog window structure confirm size save tree description change title child parent rich content action changes open edit key disk closing menu'