# Concept Extraction for XMLEditor

In [1]:
import pandas as pd

# Read the header-less class inventory and attach readable column labels.
column_labels = {
    0: 'project',
    1: 'module_ID',
    2: 'class_ID',
    3: 'class_name',
    4: 'class_path',
}
df_info = pd.read_csv('xml_editor.csv', header=None, encoding='utf-8').rename(columns=column_labels)

## Preview Module 1 only (sample; later cells use all modules)

In [2]:
# Keep only the rows whose module_ID equals 1.
df_info_1 = df_info.loc[df_info['module_ID'] == 1]

In [3]:
# Show the first five rows of the module-1 subset.
df_info_1.head(5)

Unnamed: 0,project,module_ID,class_ID,class_name,class_path
0,XMLEditor,1,33,ctietze.xmleditor.gui.editor.EditorWindow,/XMLEditor/ctietze/xmleditor/gui/editor/Editor...
1,XMLEditor,1,17,ctietze.xmleditor.actions.AbstractEditorAction,/XMLEditor/ctietze/xmleditor/actions/AbstractE...
2,XMLEditor,1,16,ctietze.xmleditor.actions.AbstractEditMenuAction,/XMLEditor/ctietze/xmleditor/actions/AbstractE...
3,XMLEditor,1,30,ctietze.xmleditor.actions.SaveAsAction,/XMLEditor/ctietze/xmleditor/actions/SaveAsAct...
4,XMLEditor,1,36,ctietze.xmleditor.Resources,/XMLEditor/ctietze/xmleditor/Resources.java


In [4]:
# Report how many rows (classes) the full inventory holds.
print('No. of classes: {}'.format(len(df_info)))

No. of classes: 43


## Class ID / names

In [5]:
# Pull the class_name column out as a plain Python list.
class_names = df_info['class_name'].tolist()

In [6]:
# Reduce each dotted name to its last segment (text after the final '.').
class_names = [qualified_name.rsplit('.', 1)[-1] for qualified_name in class_names]

In [7]:
# Collect the class_ID column as a list, in the same row order as class_names.
class_id = list(df_info['class_ID'])

## Extraction

In [8]:
import spacy

# Load the spaCy pipeline that is later handed to ce.code_ingest.
nlp = spacy.load('en_core_web_md')
# Allow texts of up to four million characters per document.
nlp.max_length = 4 * 10**6

In [9]:
import code_extract as ce
import re

# Results of the per-class extraction are collected here.
extracted = []

# Build one regex that matches any keyword as a whole word, together with
# the whitespace that follows it.
kw = ce.generate_kw()
# NOTE(review): keywords are joined unescaped; this presumably relies on
# generate_kw() returning plain words with no regex metacharacters - confirm.
kw_alternation = r'|'.join(kw)
kw_pattern = re.compile(r'\b(' + kw_alternation + r')\b\s*')

In [10]:
# Drop the first character of every class path (a leading '/' in the
# previewed rows) so the paths are relative.
classes = [path[1:] for path in df_info['class_path']]

In [11]:
# Rebuild the list from scratch on every run so that re-executing this cell
# cannot append a second copy of the results to `extracted` (which would make
# it longer than class_id / class_names).
extracted = [ce.code_ingest(class_path, kw_pattern, nlp) for class_path in classes]

## Output

In [12]:
import numpy as np
# Stack the three per-class lists as rows, then flip so each row is one class.
data = np.array([class_id, class_names, extracted]).T

In [13]:
# Build the frame straight from the per-class lists. Going through a single
# NumPy array first forces one common dtype on all three columns, which turns
# the numeric ids into strings; constructing from the lists keeps each
# column's own dtype and raises immediately if the lists differ in length.
df = pd.DataFrame({'id': class_id, 'name': class_names, 'terms': extracted},
                  columns=['id', 'name', 'terms'])

In [14]:
# df.to_csv('xmleditor_terms_module_1.csv', encoding='utf-8', index=False)

In [24]:
# Tokenise each whitespace-separated term string into a list. Cells that are
# already lists are left untouched, so re-running this cell is harmless
# instead of failing on a list that has no .split().
df['terms'] = df['terms'].apply(
    lambda terms: terms.split() if isinstance(terms, str) else terms
)

In [25]:
# Show the first five rows of the id / name / terms frame.
df.head(5)

Unnamed: 0,id,name,terms
0,33,EditorWindow,"[editor, window, frame, window, title, xml, ed..."
1,17,AbstractEditorAction,"[editor, action, action, editor, window, edito..."
2,16,AbstractEditMenuAction,"[edit, menu, action, editor, action, tree, sel..."
3,30,SaveAsAction,"[save, action, editor, action, tree, model, li..."
4,36,Resources,"[resource, resource, bundle, resource, bundle,..."


In [17]:
#terms_set = ' '.join(set(terms_output.split()))

In [26]:
# One token list per class, in DataFrame row order.
terms_output = df['terms'].tolist()

In [36]:
# Spot-check the token list of the document at position 10.
terms_output[10]

['delete',
 'node',
 'action',
 'edit',
 'menu',
 'action',
 'action',
 'confirm',
 'dialog',
 'title',
 'confirm',
 'dialog',
 'text',
 'delete',
 'node',
 'action',
 'editor',
 'window',
 'editor',
 'action',
 'perform',
 'action',
 'event',
 'selection',
 'fit',
 'enable',
 'xml',
 'node',
 'select',
 'node']

## Topic Modelling with LDA

In [28]:
import gensim

# Each document is the token list extracted for one class.
processed_docs = terms_output

# Map every distinct token in the documents to an integer id.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(159 unique tokens: ['action', 'adapter', 'add', 'attrib', 'attribute']...) from 43 documents (total 2260 corpus positions)


In [29]:
# Check the dictionary that was created by listing its first 31 (id, token) pairs.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 30:
        break
    print(token_id, token)

0 action
1 adapter
2 add
3 attrib
4 attribute
5 cell
6 change
7 child
8 close
9 closing
10 collapse
11 component
12 content
13 create
14 delete
15 dimension
16 document
17 dummy
18 edit
19 editor
20 event
21 expand
22 frame
23 icon
24 item
25 key
26 listener
27 menu
28 node
29 open
30 pane


In [30]:
# Convert every document into its bag-of-words vector via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [31]:
# Preview the bag-of-words vector of one sample preprocessed document.
document_num = 5
bow_doc_x = bow_corpus[document_num]

# Each entry pairs a dictionary token id with its count in the document.
for token_id, occurrences in bow_doc_x:
    print("Word {} (\"{}\") appears {} time(s).".format(token_id,
                                                     dictionary[token_id],
                                                     occurrences))

Word 0 ("action") appears 9 time(s).
Word 6 ("change") appears 3 time(s).
Word 16 ("document") appears 4 time(s).
Word 19 ("editor") appears 4 time(s).
Word 20 ("event") appears 3 time(s).
Word 29 ("open") appears 1 time(s).
Word 37 ("save") appears 4 time(s).
Word 43 ("text") appears 1 time(s).
Word 44 ("title") appears 1 time(s).
Word 48 ("window") appears 2 time(s).
Word 63 ("perform") appears 2 time(s).
Word 71 ("ask") appears 1 time(s).
Word 72 ("changes") appears 3 time(s).
Word 73 ("unsaved") appears 4 time(s).


## Gensim LDA

In [34]:
# Train a 3-topic LDA model on the bag-of-words corpus.
# random_state pins the seed used for initialisation; without it the topics
# can differ each time this cell is run.
# NOTE(review): with several worker processes small run-to-run differences
# may remain - confirm, and use workers=1 if exact repeatability is required.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary,
                                       passes=10, workers=2, random_state=42)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.3333333333333333
INFO:gensim.models.ldamodel:using symmetric eta at 0.3333333333333333
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamulticore:running online LDA training, 3 topics, 10 passes over the supplied corpus of 43 documents, updating every 4000 documents, evaluating every ~43 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamulticore:training LDA model using 2 processes
INFO:gensim.models.ldamulticore:PROGRESS: pass 0, dispatched chunk #0 = documents up to #43/43, outstanding queue size 1
DEBUG:gensim.models.ldamodel:updating topics
INFO:gensim.models.ldamodel:topic #0 (0.333): 0.081*"node" + 0.073*"xml" + 0.039*"content" + 0.033*"attribute" + 0.031*"action" + 0.027*"tree" + 0.022*"event" + 0.020*"exception" + 0.019*"comment" + 0.018*"tag"
INFO:gensim.models.ldamodel:topic #1 (0.333): 0.138*"action" + 0.082*"tree" + 0.054*"key" + 0.048*"editor

INFO:gensim.models.ldamodel:topic #2 (0.333): 0.142*"key" + 0.118*"tree" + 0.069*"event" + 0.052*"action" + 0.042*"editor" + 0.034*"cell" + 0.031*"adapter" + 0.026*"xml" + 0.026*"press" + 0.022*"selection"
INFO:gensim.models.ldamodel:topic diff=0.060633, rho=0.353079
DEBUG:gensim.models.ldamodel:bound: at document #0
INFO:gensim.models.ldamodel:-3.963 per-word bound, 15.6 perplexity estimate based on a held-out corpus of 43 documents with 2260 words
INFO:gensim.models.ldamulticore:PROGRESS: pass 8, dispatched chunk #0 = documents up to #43/43, outstanding queue size 1
DEBUG:gensim.models.ldamodel:updating topics
INFO:gensim.models.ldamodel:topic #0 (0.333): 0.088*"xml" + 0.078*"node" + 0.068*"content" + 0.035*"tag" + 0.034*"comment" + 0.031*"attribute" + 0.027*"attrib" + 0.024*"exception" + 0.024*"url" + 0.023*"cdata"
INFO:gensim.models.ldamodel:topic #1 (0.333): 0.197*"action" + 0.065*"tree" + 0.055*"editor" + 0.052*"node" + 0.038*"event" + 0.037*"window" + 0.035*"xml" + 0.027*"menu" 

In [35]:
import re

# Print each topic's weighted terms plus a hyphen-joined cluster label.
for idx, topic in lda_model.print_topics(-1):

    # Read the terms straight from the model rather than regex-stripping the
    # formatted string, which would also delete digits, underscores and
    # non-ASCII letters that are part of a term. topn=10 matches the number
    # of words print_topics puts into each formatted string by default.
    topic_cluster = '-'.join(term for term, _weight in lda_model.show_topic(idx, topn=10))

    print("\nTopic: {} \nWeights: {}\nCluster: {}\n".format(idx, topic, topic_cluster))
    print("=============================================================================")

INFO:gensim.models.ldamodel:topic #0 (0.333): 0.088*"xml" + 0.078*"node" + 0.068*"content" + 0.035*"tag" + 0.034*"comment" + 0.031*"attribute" + 0.027*"attrib" + 0.024*"exception" + 0.024*"url" + 0.023*"cdata"
INFO:gensim.models.ldamodel:topic #1 (0.333): 0.197*"action" + 0.064*"tree" + 0.054*"editor" + 0.052*"node" + 0.037*"event" + 0.036*"window" + 0.036*"xml" + 0.028*"menu" + 0.027*"save" + 0.026*"edit"
INFO:gensim.models.ldamodel:topic #2 (0.333): 0.160*"key" + 0.129*"tree" + 0.074*"event" + 0.044*"editor" + 0.039*"cell" + 0.036*"adapter" + 0.035*"action" + 0.029*"press" + 0.023*"selection" + 0.023*"xml"



Topic: 0 
Weights: 0.088*"xml" + 0.078*"node" + 0.068*"content" + 0.035*"tag" + 0.034*"comment" + 0.031*"attribute" + 0.027*"attrib" + 0.024*"exception" + 0.024*"url" + 0.023*"cdata"
Cluster: xml-node-content-tag-comment-attribute-attrib-exception-url-cdata


Topic: 1 
Weights: 0.197*"action" + 0.064*"tree" + 0.054*"editor" + 0.052*"node" + 0.037*"event" + 0.036*"window" + 0.036*"xml" + 0.028*"menu" + 0.027*"save" + 0.026*"edit"
Cluster: action-tree-editor-node-event-window-xml-menu-save-edit


Topic: 2 
Weights: 0.160*"key" + 0.129*"tree" + 0.074*"event" + 0.044*"editor" + 0.039*"cell" + 0.036*"adapter" + 0.035*"action" + 0.029*"press" + 0.023*"selection" + 0.023*"xml"
Cluster: key-tree-event-editor-cell-adapter-action-press-selection-xml

