# Concept Extraction for Components

In [None]:
import pandas as pd
from extract import run_extract

df_status, err = run_extract('components')

In [2]:
print('Files unable to be extracted: {}'.format(err))

Files unable to be extracted: []


In [3]:
df_status.head()

Unnamed: 0,Repo,Code Extraction Status
0,component_15puzzle-1,Code extracted at output/component_15puzzle-1/...
1,component_15puzzle-2,Code extracted at output/component_15puzzle-2/...
2,component_15puzzle-3,Code extracted at output/component_15puzzle-3/...
3,component_Aladyn-1,Code extracted at output/component_Aladyn-1/co...
4,component_Aladyn-2,Code extracted at output/component_Aladyn-2/co...


## Data from /output

In [1]:
import pandas as pd

def pull_text(tf):
    buff_txt = ''
    with open(tf, 'r', encoding='utf-8') as myfile:
        buff_txt = myfile.read()
    return buff_txt

In [2]:
import glob
import os

root = 'output'
code_lst = []

# fetch all the folders
dirlist = [ item for item in os.listdir(root) if os.path.isdir(os.path.join(root, item)) ]

for dir in dirlist:
    # fetch all text filenames from folder
    code_files = [f for f in glob.glob(root + '\\' + dir + '\\' + '**/*.txt', recursive=True)]
    t_code = pull_text(code_files[0])
    code_lst.append(t_code)

In [3]:
code_lst = [ x.split() for x in code_lst]

In [4]:
dirlist = [ x[10:] for x in dirlist]

df_data = pd.DataFrame(list(zip(dirlist, code_lst)), columns=['Component','Terms'])

print('No. of rows: {}'.format(df_data.shape[0]))
df_data.head()

No. of rows: 848


Unnamed: 0,Component,Terms
0,15puzzle-1,"[action, label, action, action, label, node, s..."
1,15puzzle-2,"[action, state, pair, action, action, state, s..."
2,15puzzle-3,"[mini, max, node, node, maxdepth, maximum, dep..."
3,Aladyn-1,"[dynamic, generic, type, user, pair, type, pai..."
4,Aladyn-2,"[latent, type, runtime, serial, version, uid, ..."


## Topic Modelling with LDA

In [7]:
import gensim
import gensim.corpora as corpora

processed_docs = list(df_data['Terms'])

id2word = corpora.Dictionary(processed_docs)

corpus = [id2word.doc2bow(text) for text in processed_docs]

## Gensim LDA

In [37]:
lda_model =  gensim.models.LdaMulticore(corpus=corpus, 
                                        num_topics=3,
                                        random_state=42,
                                        id2word=id2word, 
                                        alpha=0.01,
                                        passes=10,
                                        per_word_topics=True,
                                        workers=2)

In [38]:
import re

for idx, topic in lda_model.print_topics(-1):
  
    topic_cluster = re.sub(r'[^A-Za-z ]','',topic)
    topic_cluster = '-'.join(topic_cluster.split())
  
    print("\nTopic: {} \nWeights: {}\nCluster: {}\n".format(idx, topic, topic_cluster))
    print("=============================================================================")


Topic: 0 
Weights: 0.015*"message" + 0.011*"list" + 0.011*"write" + 0.010*"value" + 0.010*"connection" + 0.009*"factory" + 0.009*"field" + 0.009*"result" + 0.009*"local" + 0.009*"request"
Cluster: message-list-write-value-connection-factory-field-result-local-request


Topic: 1 
Weights: 0.025*"object" + 0.020*"type" + 0.019*"value" + 0.015*"attribute" + 0.010*"key" + 0.010*"stream" + 0.010*"context" + 0.009*"code" + 0.009*"element" + 0.008*"clazz"
Cluster: object-type-value-attribute-key-stream-context-code-element-clazz


Topic: 2 
Weights: 0.029*"event" + 0.023*"action" + 0.016*"listener" + 0.014*"list" + 0.014*"component" + 0.013*"object" + 0.011*"text" + 0.010*"item" + 0.010*"change" + 0.009*"color"
Cluster: event-action-listener-list-component-object-text-item-change-color



In [39]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4641197355994908


In [40]:
%%time
import pyLDAvis.gensim
import pickle 
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

Wall time: 19.1 s


In [41]:
pyLDAvis.save_html(LDAvis_prepared, 'lda_3.html')

## Extracting most dominant topic

In [43]:
df_data.head()

Unnamed: 0,Component,Terms
0,15puzzle-1,"[action, label, action, action, label, node, s..."
1,15puzzle-2,"[action, state, pair, action, action, state, s..."
2,15puzzle-3,"[mini, max, node, node, maxdepth, maximum, dep..."
3,Aladyn-1,"[dynamic, generic, type, user, pair, type, pai..."
4,Aladyn-2,"[latent, type, runtime, serial, version, uid, ..."


In [57]:
df_topics = pd.DataFrame()

for i, row_list in enumerate(lda_model[corpus]):
    row = row_list[0] if lda_model.per_word_topics else row_list
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            df_topics = df_topics.append(pd.Series([int(topic_num), round(prop_topic,3), topic_keywords]), ignore_index=True)
        else:
            break

            
df_topics.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

df_topics = pd.concat([df_data.Component, df_topics], axis=1)

In [63]:
pd.set_option('display.max_colwidth', None)

In [64]:
df_topics.head(10)

Unnamed: 0,Component,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,15puzzle-1,0.0,0.507,"message, list, write, value, connection, factory, field, result, local, request"
1,15puzzle-2,2.0,0.625,"event, action, listener, list, component, object, text, item, change, color"
2,15puzzle-3,0.0,0.852,"message, list, write, value, connection, factory, field, result, local, request"
3,Aladyn-1,1.0,1.0,"object, type, value, attribute, key, stream, context, code, element, clazz"
4,Aladyn-2,1.0,1.0,"object, type, value, attribute, key, stream, context, code, element, clazz"
5,Aladyn-3,1.0,0.999,"object, type, value, attribute, key, stream, context, code, element, clazz"
6,Aladyn-4,1.0,0.999,"object, type, value, attribute, key, stream, context, code, element, clazz"
7,Aladyn-5,1.0,0.998,"object, type, value, attribute, key, stream, context, code, element, clazz"
8,altlaw-extract-1,1.0,0.907,"object, type, value, attribute, key, stream, context, code, element, clazz"
9,altlaw-extract-2,1.0,0.955,"object, type, value, attribute, key, stream, context, code, element, clazz"


In [66]:
df_topics.to_csv('dom_topics_3.csv', encoding='utf-8', index=False)