## Concept/Domain coverage analysis: Extraction and Topic Modelling

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

In [1]:
import code_extract as ce
import doc_extract as de
import pandas as pd
import numpy as np
import spacy

In [2]:
nlp = spacy.load('en_core_web_md')
nlp.max_length = 4000000

In [8]:
dataset_df = pd.read_csv('dataset.csv').set_index('ID') # sorted by  project ID
dataset_df

Unnamed: 0_level_0,name,url,git url,default branch,commit SHA,stars,timestamp
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P1,dubbo,https://github.com/apache/dubbo,https://github.com/apache/dubbo.git,3.2,502b4c86dffbc0de3863a5e41ed7b9f633450c1c,38.4k,13/02/2022 18:00
P2,skywalking,https://github.com/apache/skywalking,https://github.com/apache/skywalking.git,master,fd78739067b459dfd5474d3948b3cb97bc3cb63b,21.2k,13/02/2022 18:20
P3,flink,https://github.com/apache/flink,https://github.com/apache/flink.git,master,c0aa73df4df4e39c138f2cddaeb8efad6c831d03,20.6k,13/02/2022 19:32
P4,rocketmq,https://github.com/apache/rocketmq,https://github.com/apache/rocketmq.git,develop,2e8ef046465c4133cf0d6ad6f242f630021439b2,18.7k,13/02/2022 19:33
P5,shardingsphere,https://github.com/apache/shardingsphere,https://github.com/apache/shardingsphere.git,master,7d1d1c3cc2bbdd56cb7c245f681fb6666cedaf78,18k,13/02/2022 19:34
P6,hadoop,https://github.com/apache/hadoop,https://github.com/apache/hadoop.git,trunk,90de1ff151ede83a6f963aaf2407d3eb6220ae40,13.2k,13/02/2022 19:36
P7,druid,https://github.com/apache/druid,https://github.com/apache/druid.git,master,f09f83697df5daa5dd52c6f930d12c0c68233776,12.4k,13/02/2022 19:37
P8,pulsar,https://github.com/apache/pulsar,https://github.com/apache/pulsar.git,master,950ff441da28e144bdfb71c317a9bc339d4f05b7,12.3k,13/02/2022 19:38
P9,zookeeper,https://github.com/apache/zookeeper,https://github.com/apache/zookeeper.git,master,2d1bac7e077f49a7149d3fb878a2c73b9e627f6e,11.1k,13/02/2022 19:39
P10,dolphinscheduler,https://github.com/apache/dolphinscheduler,https://github.com/apache/dolphinscheduler.git,dev,2bd65fb2df68847130c7e1ab6616bbfba95783b2,9.8k,13/02/2022 19:40


In [9]:
# text to remove from repositories: name of org (apache) and project

def split_hyphen(word):
    if '-' in word:
        return word.split('-')
    return [word]

def flatten(list_of_list):
    return [x for y in list_of_list for x in y]

proj_names = dataset_df['name'].tolist()
proj_names = [[['apache'],split_hyphen(x)] for x in proj_names]
proj_names = [flatten(x) for x in proj_names]

proj_names

[['apache', 'dubbo'],
 ['apache', 'skywalking'],
 ['apache', 'flink'],
 ['apache', 'rocketmq'],
 ['apache', 'shardingsphere'],
 ['apache', 'hadoop'],
 ['apache', 'druid'],
 ['apache', 'pulsar'],
 ['apache', 'zookeeper'],
 ['apache', 'dolphinscheduler'],
 ['apache', 'shardingsphere', 'elasticjob'],
 ['apache', 'shenyu'],
 ['apache', 'tomcat'],
 ['apache', 'storm'],
 ['apache', 'zeppelin']]

In [10]:
import os

# we will have to define the module folders present in each repo manually, as they are not arranged uniformly

repo_folder_names = os.listdir('repo')

modules_dir = [
    ['dubbo-build-tools','dubbo-cluster','dubbo-common','dubbo-compatible','dubbo-compiler','dubbo-config','dubbo-configcenter','dubbo-container','dubbo-demo','dubbo-dependencies','dubbo-filter','dubbo-kubernetes','dubbo-metadata','dubbo-metrics','dubbo-monitor','dubbo-native','dubbo-native-plugin','dubbo-plugin','dubbo-registry','dubbo-remoting','dubbo-rpc','dubbo-serialization','dubbo-spring-boot','dubbo-xds'],
    ['apm-checkstyle','apm-protocol','apm-webapp','oap-server'],
    ['flink-annotations','flink-clients','flink-connectors','flink-container','flink-contrib','flink-core','flink-dstl','flink-external-resources','flink-filesystems','flink-formats','flink-java','flink-kubernetes','flink-libraries','flink-metrics','flink-optimizer','flink-queryable-state','flink-rpc','flink-runtime','flink-runtime-web','flink-scala','flink-state-backends','flink-streaming-java','flink-streaming-scala','flink-table','flink-walkthroughs','flink-yarn'],
    ['acl','bazel','broker','client','common','container','controller','dev','distribution','filter','namesrv','openmessaging','proxy','remoting','srvutil','store','style','tieredstore','tools'],
    ['db-protocol','dialect-exception','distribution','distsql','features','infra','jdbc','kernel','mode','proxy','sql-parser'],
    ['hadoop-assemblies','hadoop-build-tools','hadoop-client-modules','hadoop-cloud-storage-project','hadoop-common-project','hadoop-hdfs-project','hadoop-mapreduce-project','hadoop-maven-plugins','hadoop-minicluster','hadoop-project','hadoop-tools','hadoop-yarn-project'],
    ['cloud','codestyle','core','dev','extendedset','extensions-contrib','extensions-core','helm','hll','hooks','indexing-hadoop','indexing-service','processing','publications','server','services','web-console','website'],
    ['pulsar-broker','pulsar-broker-auth-athenz','pulsar-broker-auth-sasl','pulsar-broker-common','pulsar-client', 'pulsar-client-1x-base','pulsar-client-admin','pulsar-client-admin-api','pulsar-client-admin-shaded', 'pulsar-client-all','pulsar-client-api','pulsar-client-auth-athenz','pulsar-client-auth-sasl','pulsar-client-cpp','pulsar-client-messagecrypto-bc','pulsar-client-shaded','pulsar-client-tools','pulsar-client-tools-api','pulsar-common','pulsar-config-validation','pulsar-function-go','pulsar-functions','pulsar-io','pulsar-metadata','pulsar-package-management','pulsar-proxy','pulsar-sql','pulsar-transaction','pulsar-websocket'],
    ['zookeeper-assembly','zookeeper-client','zookeeper-contrib','zookeeper-docs','zookeeper-it','zookeeper-jute','zookeeper-metrics-providers','zookeeper-recipes','zookeeper-server'],
    ['dolphinscheduler-alert','dolphinscheduler-aop','dolphinscheduler-api','dolphinscheduler-api-test','dolphinscheduler-common','dolphinscheduler-dao','dolphinscheduler-data-quality','dolphinscheduler-datasource-plugin','dolphinscheduler-dist','dolphinscheduler-e2e','dolphinscheduler-master','dolphinscheduler-meter','dolphinscheduler-microbench','dolphinscheduler-registry','dolphinscheduler-remote','dolphinscheduler-scheduler-plugin','dolphinscheduler-service','dolphinscheduler-spi','dolphinscheduler-standalone-server','dolphinscheduler-storage-plugin','dolphinscheduler-task-plugin','dolphinscheduler-tools','dolphinscheduler-ui','dolphinscheduler-worker'],
    ['elasticjob-api','elasticjob-cloud','elasticjob-distribution','elasticjob-ecosystem','elasticjob-infra','elasticjob-lite'],
    ['shenyu-admin','shenyu-alert','shenyu-bootstrap','shenyu-client','shenyu-common','shenyu-disruptor','shenyu-e2e','shenyu-loadbalancer','shenyu-plugin','shenyu-protocol','shenyu-register-center','shenyu-sdk','shenyu-spi','shenyu-spring-boot-starter','shenyu-sync-data-center','shenyu-web'],
    ['java','modules','res','webapps'],
    ['storm-buildtools','storm-checkstyle','storm-client','storm-clojure','storm-core','storm-multilang','storm-server','storm-shaded-deps','storm-submit-tools','storm-webapp'],
    ['zeppelin-client','zeppelin-common','zeppelin-display','zeppelin-integration','zeppelin-interpreter','zeppelin-interpreter-integration','zeppelin-interpreter-parent','zeppelin-interpreter-shaded','zeppelin-jupyter','zeppelin-jupyter-interpreter','zeppelin-jupyter-interpreter-shaded','zeppelin-plugins','zeppelin-server','zeppelin-web','zeppelin-web-angular','zeppelin-zengine'],
    # ['minifi','nifi-api','nifi-assembly','nifi-bootstrap','nifi-commons','nifi-dependency-check-maven','nifi-docker','nifi-external','nifi-framework-api','nifi-h2','nifi-manifest','nifi-maven-archetypes','nifi-mock','nifi-nar-bundles','nifi-registry','nifi-server-api','nifi-stateless', 'nifi-toolkit'],
    # ['core','crypto','event','lang','support','tools','web'],
    # ['archetypes','buildingtools','camel-sbom','catalog','components','core','dsl'],
    # ['src'],
    # ['agent','api','client','core','engine','framework','server','services','tools','utils','vmware-base'],
    # ['tika-app','tika-batch','tika-bundles','tika-core','tika-deployment','tika-detectors','tika-dotnet','tika-eval','tika-fuzzing','tika-java7','tika-langdetect','tika-parent','tika-parsers','tika-pipes','tika-serialization','tika-server','tika-translate','tika-xmp'],
    # ['binder','common','core', 'flink','flink-shaded','spark-shaded','viz','zeppelin'],
    # ['ctakes-assertion','ctakes-assertion-zoner','ctakes-chunker','ctakes-clinical-pipeline','ctakes-constituency-parser','ctakes-context-tokenizer','ctakes-core','ctakes-coreference','ctakes-dependency-parser','ctakes-dictionary-lookup','ctakes-dictionary-lookup-fast','ctakes-distribution','ctakes-dockhand','ctakes-drug-ner','ctakes-fhir','ctakes-gui','ctakes-lvg','ctakes-ne-contexts','ctakes-pbj','ctakes-pos-tagger','ctakes-preprocessor','ctakes-regression-test','ctakes-relation-extractor','ctakes-side-effect','ctakes-smoking-status','ctakes-template-filler','ctakes-temporal','ctakes-tiny-rest','ctakes-type-system','ctakes-user-resources','ctakes-utils','ctakes-web-rest','ctakes-ytex','ctakes-ytex-uima','ctakes-ytex-web'],
    # ['src'],
    # ['gremlin-annotations','gremlin-archetype','gremlin-console','gremlin-core','gremlin-dotnet','gremlin-driver','gremlin-go','gremlin-groovy','gremlin-javascript','gremlin-language','gremlin-python','gremlin-server','gremlin-shaded', 'gremlin'],
    # ['alerting','api','appender','assembly','collector','marshaller','parser','processor'],
    # ['api','assembly','bundle','extensions','impl','parent'],
    # ['hudi-aws','hudi-cli','hudi-client','hudi-common','hudi-flink-datasource','hudi-gcp','hudi-hadoop-mr','hudi-kafka-connect','hudi-platform-service','hudi-spark-datasource','hudi-sync','hudi-tests-common','hudi-timeline-service','hudi-utilities'],
    # ['inlong-agent','inlong-audit','inlong-common','inlong-dashboard','inlong-dataproxy','inlong-distribution','inlong-manager','inlong-sdk','inlong-sort','inlong-sort-standalone','inlong-tools','inlong-tubemq'],
    # ['plc4c','plc4go','plc4j','plc4net','protocols','reactors','tools'],
    # ['src'],
    # ['freemarker-docgen-ant','freemarker-docgen-cli','freemarker-docgen-core','freemarker-docgen-maven'],
    # ['apache-whisker-app','apache-whisker-cli','apache-whisker-maven-plugin','apache-whisker-model','apache-whisker-scan','apache-whisker-velocity','apache-whisker-xml'],
    # ['src'],
    # ['src'],
    # ['abiquo','aliyun-ecs','cdmi','cloudsigma2','cloudsigma2-hnl','cloudsigma2-lvs','cloudsigma2-mia','cloudsigma2-sjc','cloudsigma2-wdc','cloudsigma2-zrh','dimensiondata','joyent-cloudapi','joyentcloud','oneandone','profitbricks-rest','vagrant'],
    # ['src'],
    # ['modules\\example-iterators-a','modules\\example-iterators-b','modules\\vfs-class-loader'],
    # ['src'],
    # ['src'],
    # ['asterix-graphix'],
    # ['org.apache.aries.typedevent.bus','org.apache.aries.typedevent.remote'],
    # ['src'],
    # ['geronimo-metrics','geronimo-metrics-common','geronimo-metrics-extensions'],
    # ['expression-evaluator','graph','rest-util','spring-apacheds','spring-cache','spring-quartz','spring-registry','spring-taskqueue']
]

In [11]:
def run_code_extract(repo_name, ctr):

    modules_final_list = []
    repo_final_list = []

    code_final_list = []
    code_file_count_list = []
    code_token_count_list = []

    test_final_list = []
    test_file_count_list = []
    test_token_count_list = []

    # print(f"repo name: {repo_name}")
    # print(f"ctr: {ctr}")

    for module in modules_dir[ctr]:

        folder_name = 'repo\\{}\\{}\\'.format(repo_name, module)
        
        extracted_code, code_file_count, code_token_count, extracted_test, test_file_count, test_token_count = ce.extract(folder_name, proj_names[ctr], nlp)
        
        modules_final_list.append(module)
        repo_final_list.append(repo_name)
            
        code_final_list.append(extracted_code)
        code_file_count_list.append(code_file_count)
        code_token_count_list.append(code_token_count)
        
        test_final_list.append(extracted_test)
        test_file_count_list.append(test_file_count)
        test_token_count_list.append(test_token_count)

    data_df = pd.DataFrame(zip(*[repo_final_list,modules_final_list,code_final_list,code_file_count_list,code_token_count_list,
                                  test_final_list,test_file_count_list,test_token_count_list]))
    data_df.columns = ['project_name','module','code_extract','code_file_count','code_token_count','test_extract',
                        'test_file_count','test_token_count']

    return data_df

In [12]:
from tqdm import tqdm

data_df_list = [run_code_extract(x,i) for i, x in tqdm(enumerate(repo_folder_names))]
full_data_df = pd.concat(data_df_list)
full_data_df = full_data_df[full_data_df['code_token_count']!=0]
full_data_df = full_data_df[full_data_df['test_token_count']!=0]

15it [38:27, 153.82s/it]


In [13]:
full_data_df

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count
1,01_dubbo,dubbo-cluster,"address listener,cacheable router factory rout...",153,4608,short response load balance load balance scope...,80,1869
2,01_dubbo,dubbo-common,"activate,reference,service,service metadata co...",403,17859,simple simple simple simple simple simple simp...,386,3621
3,01_dubbo,dubbo-compatible,"cache cache,cache factory cache factory,cache ...",89,1908,cache target service unique protocol service k...,38,538
5,01_dubbo,dubbo-config,"initializer,processor,scope model initializer ...",98,6778,"multi loader service,multi loader service impl...",295,5295
6,01_dubbo,dubbo-configcenter,apollo dynamic dynamic error aware error aware...,9,483,apollo dynamic session timeout key session nam...,6,251
...,...,...,...,...,...,...,...,...
8,15_zeppelin,zeppelin-jupyter,jupyter util gson pretty gson gson builder pre...,25,307,jupyter util format note note verify testget n...,1,8
9,15_zeppelin,zeppelin-jupyter-interpreter,jupyter interpreter interpreter map jupyter ke...,5,296,python kernel interpreter group intp group int...,2,50
11,15_zeppelin,zeppelin-plugins,cluster interpreter thread thread interpreter ...,27,1603,cluster interpreter launcher cluster mock inte...,17,513
12,15_zeppelin,zeppelin-server,jvm info binder meter binder unknown unknown b...,84,4848,"util instance clazz clear instance,cluster aut...",35,1338


In [14]:
# extract doc from each project

doc_final_list = []
doc_token_count_list = []

ctr = 0

for repo_name in repo_folder_names:
    
        folder_name = 'repo\\{}\\'.format(repo_name)
        
        extracted_doc, doc_token_count = de.extract_from_doc(folder_name, proj_names[ctr], nlp)
        
        doc_final_list.append(extracted_doc)
        doc_token_count_list.append(doc_token_count)
        
        print('{} done...'.format(repo_name))
        ctr+=1

01_dubbo done...
02_skywalking done...
03_flink done...
04_rocketmq done...
05_shardingsphere done...
06_hadoop done...
07_druid done...
08_pulsar done...
09_zookeeper done...
10_dolphinscheduler done...
11_shardingsphere-elasticjob done...
12_shenyu done...
13_tomcat done...
14_storm done...
15_zeppelin done...


In [15]:
doc_extract_df = pd.DataFrame(zip(*[repo_folder_names, doc_final_list, doc_token_count_list]))
doc_extract_df.columns = ['project_name', 'doc_extract', 'doc_token_count']
full_data_df = full_data_df.merge(doc_extract_df, on='project_name')

In [16]:
full_data_df.head()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count
0,01_dubbo,dubbo-cluster,"address listener,cacheable router factory rout...",153,4608,short response load balance load balance scope...,80,1869,project highperformance javabase opensource rp...,300
1,01_dubbo,dubbo-common,"activate,reference,service,service metadata co...",403,17859,simple simple simple simple simple simple simp...,386,3621,project highperformance javabase opensource rp...,300
2,01_dubbo,dubbo-compatible,"cache cache,cache factory cache factory,cache ...",89,1908,cache target service unique protocol service k...,38,538,project highperformance javabase opensource rp...,300
3,01_dubbo,dubbo-config,"initializer,processor,scope model initializer ...",98,6778,"multi loader service,multi loader service impl...",295,5295,project highperformance javabase opensource rp...,300
4,01_dubbo,dubbo-configcenter,apollo dynamic dynamic error aware error aware...,9,483,apollo dynamic session timeout key session nam...,6,251,project highperformance javabase opensource rp...,300


In [17]:
full_data_df.tail()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count
156,15_zeppelin,zeppelin-jupyter,jupyter util gson pretty gson gson builder pre...,25,307,jupyter util format note note verify testget n...,1,8,documentation user guide mailing list user dev...,65
157,15_zeppelin,zeppelin-jupyter-interpreter,jupyter interpreter interpreter map jupyter ke...,5,296,python kernel interpreter group intp group int...,2,50,documentation user guide mailing list user dev...,65
158,15_zeppelin,zeppelin-plugins,cluster interpreter thread thread interpreter ...,27,1603,cluster interpreter launcher cluster mock inte...,17,513,documentation user guide mailing list user dev...,65
159,15_zeppelin,zeppelin-server,jvm info binder meter binder unknown unknown b...,84,4848,"util instance clazz clear instance,cluster aut...",35,1338,documentation user guide mailing list user dev...,65
160,15_zeppelin,zeppelin-zengine,"dummy health health result,hdfs health health ...",94,6431,clear system variable cleanup allow malforme u...,58,2410,documentation user guide mailing list user dev...,65


In [18]:
full_data_df.to_csv("extracted_data.csv",index=False)

### Retrieve from csv -- start here

In [19]:
import pandas as pd
import numpy as np
import spacy

In [20]:
nlp = spacy.load('en_core_web_md')
nlp.max_length = 4000000

In [21]:
data_df = pd.read_csv("extracted_data.csv")
data_df.head()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count
0,01_dubbo,dubbo-cluster,"address listener,cacheable router factory rout...",153,4608,short response load balance load balance scope...,80,1869,project highperformance javabase opensource rp...,300
1,01_dubbo,dubbo-common,"activate,reference,service,service metadata co...",403,17859,simple simple simple simple simple simple simp...,386,3621,project highperformance javabase opensource rp...,300
2,01_dubbo,dubbo-compatible,"cache cache,cache factory cache factory,cache ...",89,1908,cache target service unique protocol service k...,38,538,project highperformance javabase opensource rp...,300
3,01_dubbo,dubbo-config,"initializer,processor,scope model initializer ...",98,6778,"multi loader service,multi loader service impl...",295,5295,project highperformance javabase opensource rp...,300
4,01_dubbo,dubbo-configcenter,apollo dynamic dynamic error aware error aware...,9,483,apollo dynamic session timeout key session nam...,6,251,project highperformance javabase opensource rp...,300


In [22]:
data_df.tail()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count
156,15_zeppelin,zeppelin-jupyter,jupyter util gson pretty gson gson builder pre...,25,307,jupyter util format note note verify testget n...,1,8,documentation user guide mailing list user dev...,65
157,15_zeppelin,zeppelin-jupyter-interpreter,jupyter interpreter interpreter map jupyter ke...,5,296,python kernel interpreter group intp group int...,2,50,documentation user guide mailing list user dev...,65
158,15_zeppelin,zeppelin-plugins,cluster interpreter thread thread interpreter ...,27,1603,cluster interpreter launcher cluster mock inte...,17,513,documentation user guide mailing list user dev...,65
159,15_zeppelin,zeppelin-server,jvm info binder meter binder unknown unknown b...,84,4848,"util instance clazz clear instance,cluster aut...",35,1338,documentation user guide mailing list user dev...,65
160,15_zeppelin,zeppelin-zengine,"dummy health health result,hdfs health health ...",94,6431,clear system variable cleanup allow malforme u...,58,2410,documentation user guide mailing list user dev...,65


### Topic Modeling to detect overlapping domain concepts

In [23]:
proj_list = data_df['project_name'].unique()
proj_list

array(['01_dubbo', '02_skywalking', '03_flink', '04_rocketmq',
       '05_shardingsphere', '06_hadoop', '07_druid', '08_pulsar',
       '09_zookeeper', '10_dolphinscheduler',
       '11_shardingsphere-elasticjob', '12_shenyu', '13_tomcat',
       '14_storm', '15_zeppelin'], dtype=object)

In [24]:
def convert_to_list(col):
    col = col.apply(lambda x: x.split(','))
    col = col.apply(lambda x: [y.split() for y in x])
    return col

In [25]:
def jaccard_similarity(topic_1, topic_2):
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))               
    return float(len(intersection))/float(len(union))

In [26]:
def get_mean_stabilities(num_topics, LDA_topics):

    LDA_stability = {}

    for i in range(0, len(num_topics)):

        jaccard_sims = []

        for t1, topic1 in enumerate(LDA_topics[num_topics[i]]):

            sims = []

            for t2, topic2 in enumerate(LDA_topics[num_topics[i]]):
                sims.append(jaccard_similarity(topic1, topic2))    

            jaccard_sims.append(sims)    

        LDA_stability[num_topics[i]] = jaccard_sims

    return [np.array(LDA_stability[i]).mean() for i in num_topics]

In [8]:
import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel

import logging, sys
logging.disable(sys.maxsize)

def run_optimising_number_topics(data_df, col_name, max_range, num_keywords):
    
    modules_list = data_df['module'].tolist()
    corpus_list = data_df[col_name]
    
    ideal_num_list = []
    ctr = 0

    num_topics = list(range(max_range+1)[1:])
    
    for corpus_in in corpus_list:
    
        id2word = corpora.Dictionary(corpus_in)
        corpus_bow = [id2word.doc2bow(text) for text in corpus_in]

        LDA_models = {}
        LDA_topics = {}
        
        coh_list = []

        for i in num_topics:
            
#             print('Running LDA model topic num {} out of {} for {}'.format(i,max_range,modules_list[ctr]))

            LDA_models[i] = LdaModel(corpus=corpus_bow,
                                     id2word=id2word,
                                     num_topics=i,
                                     update_every=1,
                                     chunksize=len(corpus_bow),
                                     passes=20,
                                     alpha='auto',
                                     random_state=42)

            shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                                     num_words=num_keywords,
                                                     formatted=False)
            LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]
            
            coh_score = CoherenceModel(model=LDA_models[i], texts=corpus_in, dictionary=id2word, coherence='c_v').get_coherence()
        
            coh_list.append(coh_score)
            
        # print('--- running coherence scores calculation for {}'.format(modules_list[ctr]))
        
#         coh_list = get_coherence_scores(corpus_in, id2word, num_topics, LDA_models)
        
        ms_list = get_mean_stabilities(num_topics, LDA_topics)
        
        coh_sta_diffs = [coh_list[i] - ms_list[i] for i in range(num_keywords)[:-1]]
        coh_sta_max = max(coh_sta_diffs)
        coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]

        ideal_topic_num_index = coh_sta_max_idxs[0]
        ideal_topic_num = num_topics[ideal_topic_num_index]
        
        ideal_num_list.append(ideal_topic_num)
        
        print('Module {} done...'.format(modules_list[ctr]))
        
        ctr+=1
        
    df = pd.DataFrame(zip(*[modules_list,ideal_num_list]))    
    df.columns = ['module','ideal_num']
    
    return df        

In [9]:
import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel

import logging, sys
logging.disable(sys.maxsize)

def run_optimising_number_doc_topics(data_df, col_name, max_range, num_keywords):

    project_list = data_df['project_name']
    corpus_list = data_df[col_name]
    
    ideal_num_list = []

    num_topics = list(range(max_range+1)[1:])
    
    for corpus_in in corpus_list:
    
        id2word = corpora.Dictionary(corpus_in)
        corpus_bow = [id2word.doc2bow(text) for text in corpus_in]

        LDA_models = {}
        LDA_topics = {}
        
        coh_list = []

        for i in num_topics:
            
#             print('Running LDA model topic num {} out of {} for {}'.format(i,max_range,modules_list[ctr]))

            LDA_models[i] = LdaModel(corpus=corpus_bow,
                                     id2word=id2word,
                                     num_topics=i,
                                     update_every=1,
                                     chunksize=len(corpus_bow),
                                     passes=20,
                                     alpha='auto',
                                     random_state=42)

            shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                                     num_words=num_keywords,
                                                     formatted=False)
            LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]
            
            coh_score = CoherenceModel(model=LDA_models[i], texts=corpus_in, dictionary=id2word, coherence='c_v').get_coherence()
        
            coh_list.append(coh_score)
            
        # print('--- running coherence scores calculation for {}'.format(modules_list[ctr]))
        
#         coh_list = get_coherence_scores(corpus_in, id2word, num_topics, LDA_models)
        
        ms_list = get_mean_stabilities(num_topics, LDA_topics)
        
        coh_sta_diffs = [coh_list[i] - ms_list[i] for i in range(num_keywords)[:-1]]
        coh_sta_max = max(coh_sta_diffs)
        coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]

        ideal_topic_num_index = coh_sta_max_idxs[0]
        ideal_topic_num = num_topics[ideal_topic_num_index]
        
        ideal_num_list.append(ideal_topic_num)

    df = pd.DataFrame(zip(*[project_list,ideal_num_list]))    
    df.columns = ['project_name','ideal_num']
    
    return df        

In [10]:
# this has been completed -- undo this cell comment to re-run optimising topic number count
# this is for code and test

# for proj in proj_list:
    
#     print('Running project {}...'.format(proj))

#     sub_data_df = data_df[data_df['project_name']==proj]
    
#     sub_data_df['code_extract'] = convert_to_list(sub_data_df['code_extract'])
#     sub_data_df['test_extract'] = convert_to_list(sub_data_df['test_extract'])

#     ideal_num_code_df = run_optimising_number_topics(sub_data_df, 'code_extract', 10, 10)
#     ideal_num_test_df = run_optimising_number_topics(sub_data_df, 'test_extract', 10, 10)
    
#     ideal_num_code_df.to_csv('modules_config/ideal_num_code_{}.csv'.format(proj), index=False)
#     ideal_num_test_df.to_csv('modules_config/ideal_num_test_{}.csv'.format(proj), index=False)    

In [11]:
# this has been completed -- undo this cell comment to re-run optimising topic number count

# for proj in proj_list:
    
#     print('Running project {}...'.format(proj))

#     sub_data_df = data_df[data_df['project_name']==proj].head(1)
    
#     sub_data_df['doc_extract'] = convert_to_list(sub_data_df['doc_extract'])
    
#     ideal_num_doc_df = run_optimising_number_doc_topics(sub_data_df, 'doc_extract', 10, 10)
    
#     ideal_num_doc_df.to_csv('modules_config_doc/ideal_num_doc_{}.csv'.format(proj), index=False)   

In [12]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(9,6))
# ax = sns.lineplot(x=num_topics, y=mean_stabilities, label='Average overlap (Jaccard index)')
# ax = sns.lineplot(x=num_topics, y=coherences, label='Coherence score')

# ax.axvline(x=ideal_topic_num, label='Ideal number of topics', color='black')
# ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

# y_max = max(max(mean_stabilities), max(coherences)) + (0.10 * max(max(mean_stabilities), max(coherences)))
# ax.set_ylim([0, y_max])
# ax.set_xlim([1, len(num_topics)])
                
# # ax.axes.set_title('Model Metrics per Number of Topics')
# ax.set_ylabel('Metric Score')
# ax.set_xlabel('Number of Topics')
# plt.legend()
# plt.show()  

In [30]:
# list all the optimised number of topics

import glob

topic_num_files = glob.glob('modules_config/*', recursive=True)
topic_num_files_doc = glob.glob('modules_config_doc/*', recursive=True)
topic_num_files

['modules_config\\ideal_num_code_01_dubbo.csv',
 'modules_config\\ideal_num_code_02_skywalking.csv',
 'modules_config\\ideal_num_code_03_flink.csv',
 'modules_config\\ideal_num_code_04_rocketmq.csv',
 'modules_config\\ideal_num_code_05_shardingsphere.csv',
 'modules_config\\ideal_num_code_06_hadoop.csv',
 'modules_config\\ideal_num_code_07_druid.csv',
 'modules_config\\ideal_num_code_08_pulsar.csv',
 'modules_config\\ideal_num_code_09_zookeeper.csv',
 'modules_config\\ideal_num_code_10_dolphinscheduler.csv',
 'modules_config\\ideal_num_code_11_shardingsphere-elasticjob.csv',
 'modules_config\\ideal_num_code_12_shenyu.csv',
 'modules_config\\ideal_num_code_13_tomcat.csv',
 'modules_config\\ideal_num_code_14_storm.csv',
 'modules_config\\ideal_num_code_15_zeppelin.csv',
 'modules_config\\ideal_num_test_01_dubbo.csv',
 'modules_config\\ideal_num_test_02_skywalking.csv',
 'modules_config\\ideal_num_test_03_flink.csv',
 'modules_config\\ideal_num_test_04_rocketmq.csv',
 'modules_config\\ide

In [31]:
tnf_code_list = []
tnf_test_list = []
tnf_doc_list = []

for tnf in topic_num_files:
    if '_code_' in tnf:
        df = pd.read_csv(tnf)
        df['filename'] = tnf
        tnf_code_list.append(df)
    else:
        df = pd.read_csv(tnf)
        df['filename'] = tnf
        tnf_test_list.append(df)
        
for tnf in topic_num_files_doc:
    df = pd.read_csv(tnf)
    df['filename'] = tnf
    tnf_doc_list.append(df)
    
num_topic_code_df = pd.concat(tnf_code_list).reset_index(drop=True)
num_topic_test_df = pd.concat(tnf_test_list).reset_index(drop=True)
num_topic_doc_df = pd.concat(tnf_doc_list).reset_index(drop=True)

num_topic_code_df['project_name'] = num_topic_code_df['filename'].apply(lambda x: x[len('modules_config\ideal_num_code_'):])
num_topic_test_df['project_name'] = num_topic_test_df['filename'].apply(lambda x: x[len('modules_config\ideal_num_test_'):])
num_topic_doc_df['project_name'] = num_topic_doc_df['filename'].apply(lambda x: x[len('modules_config_doc\ideal_num_doc_'):])

num_topic_code_df['project_name'] = num_topic_code_df['project_name'].apply(lambda x: x[:-4])
num_topic_test_df['project_name'] = num_topic_test_df['project_name'].apply(lambda x: x[:-4])
num_topic_doc_df['project_name'] = num_topic_doc_df['project_name'].apply(lambda x: x[:-4])

num_topic_code_df = num_topic_code_df.drop(columns=['filename'])
num_topic_test_df = num_topic_test_df.drop(columns=['filename'])
num_topic_doc_df = num_topic_doc_df.drop(columns=['filename'])

num_topic_code_df = num_topic_code_df.rename(columns={'ideal_num': 'code_ideal_num'})
num_topic_test_df = num_topic_test_df.rename(columns={'ideal_num': 'test_ideal_num'})
num_topic_doc_df = num_topic_doc_df.rename(columns={'ideal_num': 'doc_ideal_num'})

num_topic_df = num_topic_code_df.merge(num_topic_test_df, on=['project_name','module'])
num_topic_df = num_topic_df.merge(num_topic_doc_df, on='project_name', how='left')
num_topic_df = num_topic_df[['project_name', 'module', 'code_ideal_num', 'test_ideal_num', 'doc_ideal_num']]

In [32]:
num_topic_df

Unnamed: 0,project_name,module,code_ideal_num,test_ideal_num,doc_ideal_num
0,01_dubbo,dubbo-cluster,9,6,8
1,01_dubbo,dubbo-common,9,9,8
2,01_dubbo,dubbo-compatible,8,9,8
3,01_dubbo,dubbo-config,7,8,8
4,01_dubbo,dubbo-configcenter,6,7,8
...,...,...,...,...,...
156,15_zeppelin,zeppelin-jupyter,6,1,9
157,15_zeppelin,zeppelin-jupyter-interpreter,4,8,9
158,15_zeppelin,zeppelin-plugins,7,8,9
159,15_zeppelin,zeppelin-server,8,9,9


In [33]:
full_data_df = data_df.merge(num_topic_df, on=['project_name','module'], how='left')

In [34]:
full_data_df.head()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count,code_ideal_num,test_ideal_num,doc_ideal_num
0,01_dubbo,dubbo-cluster,"address listener,cacheable router factory rout...",153,4608,short response load balance load balance scope...,80,1869,project highperformance javabase opensource rp...,300,9,6,8
1,01_dubbo,dubbo-common,"activate,reference,service,service metadata co...",403,17859,simple simple simple simple simple simple simp...,386,3621,project highperformance javabase opensource rp...,300,9,9,8
2,01_dubbo,dubbo-compatible,"cache cache,cache factory cache factory,cache ...",89,1908,cache target service unique protocol service k...,38,538,project highperformance javabase opensource rp...,300,8,9,8
3,01_dubbo,dubbo-config,"initializer,processor,scope model initializer ...",98,6778,"multi loader service,multi loader service impl...",295,5295,project highperformance javabase opensource rp...,300,7,8,8
4,01_dubbo,dubbo-configcenter,apollo dynamic dynamic error aware error aware...,9,483,apollo dynamic session timeout key session nam...,6,251,project highperformance javabase opensource rp...,300,6,7,8


In [35]:
full_data_df.tail()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count,code_ideal_num,test_ideal_num,doc_ideal_num
156,15_zeppelin,zeppelin-jupyter,jupyter util gson pretty gson gson builder pre...,25,307,jupyter util format note note verify testget n...,1,8,documentation user guide mailing list user dev...,65,6,1,9
157,15_zeppelin,zeppelin-jupyter-interpreter,jupyter interpreter interpreter map jupyter ke...,5,296,python kernel interpreter group intp group int...,2,50,documentation user guide mailing list user dev...,65,4,8,9
158,15_zeppelin,zeppelin-plugins,cluster interpreter thread thread interpreter ...,27,1603,cluster interpreter launcher cluster mock inte...,17,513,documentation user guide mailing list user dev...,65,7,8,9
159,15_zeppelin,zeppelin-server,jvm info binder meter binder unknown unknown b...,84,4848,"util instance clazz clear instance,cluster aut...",35,1338,documentation user guide mailing list user dev...,65,8,9,9
160,15_zeppelin,zeppelin-zengine,"dummy health health result,hdfs health health ...",94,6431,clear system variable cleanup allow malforme u...,58,2410,documentation user guide mailing list user dev...,65,9,9,9


In [36]:
full_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   project_name      161 non-null    object
 1   module            161 non-null    object
 2   code_extract      161 non-null    object
 3   code_file_count   161 non-null    int64 
 4   code_token_count  161 non-null    int64 
 5   test_extract      161 non-null    object
 6   test_file_count   161 non-null    int64 
 7   test_token_count  161 non-null    int64 
 8   doc_extract       161 non-null    object
 9   doc_token_count   161 non-null    int64 
 10  code_ideal_num    161 non-null    int64 
 11  test_ideal_num    161 non-null    int64 
 12  doc_ideal_num     161 non-null    int64 
dtypes: int64(8), object(5)
memory usage: 16.5+ KB


In [37]:
full_data_df.to_csv("full_data_df.csv",index=False)

## Import extracted data

In [38]:
full_data_df = pd.read_csv("full_data_df.csv")

In [39]:
full_data_df.head()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count,code_ideal_num,test_ideal_num,doc_ideal_num
0,01_dubbo,dubbo-cluster,"address listener,cacheable router factory rout...",153,4608,short response load balance load balance scope...,80,1869,project highperformance javabase opensource rp...,300,9,6,8
1,01_dubbo,dubbo-common,"activate,reference,service,service metadata co...",403,17859,simple simple simple simple simple simple simp...,386,3621,project highperformance javabase opensource rp...,300,9,9,8
2,01_dubbo,dubbo-compatible,"cache cache,cache factory cache factory,cache ...",89,1908,cache target service unique protocol service k...,38,538,project highperformance javabase opensource rp...,300,8,9,8
3,01_dubbo,dubbo-config,"initializer,processor,scope model initializer ...",98,6778,"multi loader service,multi loader service impl...",295,5295,project highperformance javabase opensource rp...,300,7,8,8
4,01_dubbo,dubbo-configcenter,apollo dynamic dynamic error aware error aware...,9,483,apollo dynamic session timeout key session nam...,6,251,project highperformance javabase opensource rp...,300,6,7,8


In [40]:
full_data_df.tail()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count,code_ideal_num,test_ideal_num,doc_ideal_num
156,15_zeppelin,zeppelin-jupyter,jupyter util gson pretty gson gson builder pre...,25,307,jupyter util format note note verify testget n...,1,8,documentation user guide mailing list user dev...,65,6,1,9
157,15_zeppelin,zeppelin-jupyter-interpreter,jupyter interpreter interpreter map jupyter ke...,5,296,python kernel interpreter group intp group int...,2,50,documentation user guide mailing list user dev...,65,4,8,9
158,15_zeppelin,zeppelin-plugins,cluster interpreter thread thread interpreter ...,27,1603,cluster interpreter launcher cluster mock inte...,17,513,documentation user guide mailing list user dev...,65,7,8,9
159,15_zeppelin,zeppelin-server,jvm info binder meter binder unknown unknown b...,84,4848,"util instance clazz clear instance,cluster aut...",35,1338,documentation user guide mailing list user dev...,65,8,9,9
160,15_zeppelin,zeppelin-zengine,"dummy health health result,hdfs health health ...",94,6431,clear system variable cleanup allow malforme u...,58,2410,documentation user guide mailing list user dev...,65,9,9,9


In [24]:
def run_lda(corpus_in, num_topics, num_keywords=10):

    id2word = corpora.Dictionary(corpus_in)
    corpus_bow = [id2word.doc2bow(text) for text in corpus_in]

    lda_model = LdaModel(corpus=corpus_bow,
                             id2word=id2word,
                             num_topics=num_topics,
                             update_every=1,
                             chunksize=len(corpus_bow),
                             passes=20,
                             alpha='auto',
                             random_state=42)

    shown_topics = lda_model.show_topics(num_topics=num_topics, 
                                         num_words=num_keywords,
                                         formatted=False)

    lda_topics = [[word[0] for word in topic[1]] for topic in shown_topics]
    
    return lda_model, shown_topics, lda_topics

In [25]:
# for code and tests

code_lda_models = []
test_lda_models = []

code_shown_topics_list = []
test_shown_topics_list = []

code_topics_list = []
test_topics_list = []

sub_data_df = full_data_df.copy()

sub_data_df['code_extract'] = convert_to_list(sub_data_df['code_extract'])
sub_data_df['test_extract'] = convert_to_list(sub_data_df['test_extract'])

for index, row in sub_data_df.iterrows():
    code_corpus_in = row['code_extract']
    code_num_topics = row['code_ideal_num']
    test_corpus_in = row['test_extract']
    test_num_topics = row['test_ideal_num']

    code_lda, code_shown_topics, code_topics = run_lda(code_corpus_in, code_num_topics)
    test_lda, test_shown_topics, test_topics = run_lda(test_corpus_in, test_num_topics)
    
    code_lda_models.append(code_lda)
    test_lda_models.append(test_lda)
    
    code_shown_topics_list.append(code_shown_topics)
    test_shown_topics_list.append(test_shown_topics)
    
    code_topics_list.append(code_topics)
    test_topics_list.append(test_topics)
    
    print('Index no. {} done..'.format(index))

Index no. 0 done..
Index no. 1 done..
Index no. 2 done..
Index no. 3 done..
Index no. 4 done..
Index no. 5 done..
Index no. 6 done..
Index no. 7 done..
Index no. 8 done..
Index no. 9 done..
Index no. 10 done..
Index no. 11 done..
Index no. 12 done..
Index no. 13 done..
Index no. 14 done..
Index no. 15 done..
Index no. 16 done..
Index no. 17 done..
Index no. 18 done..
Index no. 19 done..
Index no. 20 done..
Index no. 21 done..
Index no. 22 done..
Index no. 23 done..
Index no. 24 done..
Index no. 25 done..
Index no. 26 done..
Index no. 27 done..
Index no. 28 done..
Index no. 29 done..
Index no. 30 done..
Index no. 31 done..
Index no. 32 done..
Index no. 33 done..
Index no. 34 done..
Index no. 35 done..
Index no. 36 done..
Index no. 37 done..
Index no. 38 done..
Index no. 39 done..
Index no. 40 done..
Index no. 41 done..
Index no. 42 done..
Index no. 43 done..
Index no. 44 done..
Index no. 45 done..
Index no. 46 done..
Index no. 47 done..
Index no. 48 done..
Index no. 49 done..
Index no. 

In [26]:
# for doc

doc_lda_models = []

doc_shown_topics_list = []

doc_topics_list = []

sub_data_df = full_data_df.copy()

sub_data_df = sub_data_df.drop_duplicates('project_name', keep='first').reset_index(drop=True)

sub_data_df['doc_extract'] = convert_to_list(sub_data_df['doc_extract'])

for index, row in sub_data_df.iterrows():
    doc_corpus_in = row['doc_extract']
    doc_num_topics = row['doc_ideal_num']

    doc_lda, doc_shown_topics, doc_topics = run_lda(doc_corpus_in, doc_num_topics)
    
    doc_lda_models.append(doc_lda)
    
    doc_shown_topics_list.append(doc_shown_topics)
    
    doc_topics_list.append(doc_topics)
    
    print('Index no. {} done..'.format(index))

Index no. 0 done..
Index no. 1 done..
Index no. 2 done..
Index no. 3 done..
Index no. 4 done..
Index no. 5 done..
Index no. 6 done..
Index no. 7 done..
Index no. 8 done..
Index no. 9 done..
Index no. 10 done..
Index no. 11 done..
Index no. 12 done..
Index no. 13 done..
Index no. 14 done..
Index no. 15 done..
Index no. 16 done..
Index no. 17 done..
Index no. 18 done..
Index no. 19 done..
Index no. 20 done..
Index no. 21 done..
Index no. 22 done..
Index no. 23 done..
Index no. 24 done..
Index no. 25 done..
Index no. 26 done..
Index no. 27 done..
Index no. 28 done..
Index no. 29 done..
Index no. 30 done..
Index no. 31 done..
Index no. 32 done..
Index no. 33 done..
Index no. 34 done..
Index no. 35 done..
Index no. 36 done..
Index no. 37 done..
Index no. 38 done..
Index no. 39 done..
Index no. 40 done..
Index no. 41 done..
Index no. 42 done..
Index no. 43 done..
Index no. 44 done..


In [27]:
full_data_df.head()

Unnamed: 0,project_name,module,code_extract,code_file_count,code_token_count,test_extract,test_file_count,test_token_count,doc_extract,doc_token_count,code_ideal_num,test_ideal_num,doc_ideal_num
0,01_dubbo,dubbo-cluster,"address listener,cacheable router factory rout...",153,4608,short response load balance load balance scope...,80,1869,project highperformance javabase opensource rp...,300,9,6,8
1,01_dubbo,dubbo-common,"activate,reference,service,service metadata co...",403,17859,simple simple simple simple simple simple simp...,386,3621,project highperformance javabase opensource rp...,300,9,9,8
2,01_dubbo,dubbo-compatible,"cache cache,cache factory cache factory,cache ...",89,1908,cache target service unique protocol service k...,38,538,project highperformance javabase opensource rp...,300,8,9,8
3,01_dubbo,dubbo-config,"initializer,processor,scope model initializer ...",98,6778,"multi loader service,multi loader service impl...",295,5295,project highperformance javabase opensource rp...,300,7,8,8
4,01_dubbo,dubbo-configcenter,apollo dynamic dynamic error aware error aware...,9,483,apollo dynamic session timeout key session nam...,6,251,project highperformance javabase opensource rp...,300,6,7,8


In [28]:
topics_res_df = pd.DataFrame(np.transpose([full_data_df['project_name'], full_data_df['module'],  num_topic_df['code_ideal_num'], num_topic_df['test_ideal_num']]), columns = ['project_name', 'module', 'code_num_topics', 'test_num_topics'])
topics_res_df['code_shown_topics'] = code_shown_topics_list
topics_res_df['code_topics'] = code_topics_list
topics_res_df['test_shown_topics'] = test_shown_topics_list
topics_res_df['test_topics'] = test_topics_list

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[[merger, model, scope, aware, end, error, mer...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[[invoker, hello, menu, service, load, balance..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[[map, extension, loader, property, msg, throw...","[(0, [('address', 0.07335682), ('country', 0.0...","[[address, country, phone, full, size, number,..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[[invocation, invoker, attachment, argument, m...","[(0, [('consumer', 0.08403326), ('service', 0....","[[consumer, service, argument, application, pr..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[[application, model, module, context, event, ...","[(0, [('box', 0.0995611), ('service', 0.097135...","[[box, service, demo, user, say, impl, prefix,..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[[dynamic, factory, url, zookeeper, applicatio...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[[namespace, map, context, mock, property, ser..."
...,...,...,...,...,...,...,...,...
309,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[[partition, context, password, port, enable, ...","[(0, [('context', 0.21739134), ('create', 0.13...","[[context, create, dir, attribute, basic, exis..."
310,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[[cache, creator, hint, factory, key, value, c...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[[cache, refre, wine, second, never, two, one,..."
311,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[[job, context, execution, map, listener, prop...","[(0, [('job', 0.24515942), ('context', 0.16984...","[[job, context, one, execution, execute, trigg..."
312,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[[key, registry, builder, add, save, listener,...","[(0, [('registry', 0.024453199), ('property', ...","[[registry, property, change, event, value, ad..."


In [29]:
doc_topics_df = pd.DataFrame(np.transpose([full_data_df['project_name'], full_data_df['doc_ideal_num']]), columns=['project_name', 'doc_num_topics'])
doc_topics_df = doc_topics_df.drop_duplicates(ignore_index=True)
doc_topics_df['doc_shown_topics'] = doc_shown_topics_list
doc_topics_df['doc_topics'] = doc_topics_list
doc_topics_df.head()

Unnamed: 0,project_name,doc_num_topics,doc_shown_topics,doc_topics
0,01_dubbo,8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[[service, see, issue, sample, project, github..."
1,02_skywalking,9,"[(0, [('trace', 0.0060752206), ('support', 0.0...","[[trace, support, metric, mail, native, pipeli..."
2,03_flink,9,"[(0, [('scala', 0.0055014915), ('processing', ...","[[scala, processing, java, intellij, support, ..."
3,04_rocketmq,8,"[(0, [('software', 0.0030929884), ('use', 0.00...","[[software, use, message, run, cluster, client..."
4,05_shardingsphere,7,"[(0, [('database', 0.050372913), ('link', 0.02...","[[database, link, amp, provide, support, sql, ..."


In [30]:
topics_res_df = topics_res_df.merge(doc_topics_df, on='project_name', how='left')

# with pd.option_context('display.max_colwidth', None):
#     display(topics_res_df)

In [31]:
topics_res_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   project_name       314 non-null    object
 1   module             314 non-null    object
 2   code_num_topics    314 non-null    object
 3   test_num_topics    314 non-null    object
 4   code_shown_topics  314 non-null    object
 5   code_topics        314 non-null    object
 6   test_shown_topics  314 non-null    object
 7   test_topics        314 non-null    object
 8   doc_num_topics     314 non-null    object
 9   doc_shown_topics   314 non-null    object
 10  doc_topics         314 non-null    object
dtypes: object(11)
memory usage: 27.1+ KB


In [34]:
topics_res_df.to_csv("topics_res_df.csv", index=False)