## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

## Show topics

In [54]:
import pandas as pd
import numpy as np

topics_df = pd.read_csv("topics_res_df.csv")
topics_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
...,...,...,...,...,...,...,...,...,...,...,...
309,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[['partition', 'context', 'password', 'port', ...","[(0, [('context', 0.21739134), ('create', 0.13...","[['context', 'create', 'dir', 'attribute', 'ba...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."
310,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[['cache', 'creator', 'hint', 'factory', 'key'...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[['cache', 'refre', 'wine', 'second', 'never',...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."
311,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[['job', 'context', 'execution', 'map', 'liste...","[(0, [('job', 0.24515942), ('context', 0.16984...","[['job', 'context', 'one', 'execution', 'execu...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."
312,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[['key', 'registry', 'builder', 'add', 'save',...","[(0, [('registry', 0.024453199), ('property', ...","[['registry', 'property', 'change', 'event', '...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."


## Process annotated modules

In [31]:
anno_df = pd.read_csv("module_annotation.csv")
anno_sub_df = anno_df[["labels", "distribution"]]
anno_df

Unnamed: 0,project,module,labels,distribution
0,dubbo,dubbo-configcenter,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0358421722524..."
1,dubbo,dubbo-remoting,"['3D computer graphics', '3D modeling', 'Bayes...","[0.003827634543563279, 0.0007640459047594847, ..."
2,dubbo,dubbo-spring-boot,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.002485344724677403..."
3,dubbo,dubbo-serialization,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1949338529501..."
4,dubbo,dubbo-native,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0785512761148..."
...,...,...,...,...
478,archiva-components,spring-apacheds,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1170141864804..."
479,archiva-components,spring-registry,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0535558945145..."
480,archiva-components,spring-quartz,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.028540502180797137, 0.0, 0.0, 0.0, 0.0..."
481,archiva-components,rest-util,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.04058421758514396,..."


In [90]:
# unify the project names
proj_list_b = list(dict.fromkeys(topics_df['project_name']))
proj_list_a = list(dict.fromkeys(anno_df['project']))
to_chg = dict(zip(proj_list_a, proj_list_b))


In [91]:
import ast

updated_rows = []

for row in anno_sub_df.values:

    updated_labels = []
    updated_dist = []
    this_row = []

    labels = row[0]
    values = row[1]

    labels = ast.literal_eval(labels)
    values = ast.literal_eval(values)

    for i, l in enumerate(labels):
        if float(values[i]) > 0:
            this_row.append((l,values[i]))

    for vals in this_row:
        updated_labels.append(vals[0])
        updated_dist.append(vals[1])

    updated_rows.append((updated_labels, updated_dist))

In [111]:
updated_anno_df = pd.concat([anno_df, pd.DataFrame(updated_rows, columns=["domains", "prob_dist"])], axis=1).drop(columns=["labels","distribution"])
updated_anno_df['project_name'] = updated_anno_df['project'].apply(lambda x: to_chg[x])
updated_anno_df['module'] = updated_anno_df['module'].str.replace("modules", "modules\\vfs-class-loader")
updated_anno_df = updated_anno_df[['project_name', 'module', 'domains', 'prob_dist']]
updated_anno_df

Unnamed: 0,project_name,module,domains,prob_dist
0,01_dubbo,dubbo-configcenter,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."
1,01_dubbo,dubbo-remoting,"[3D computer graphics, 3D modeling, File Trans...","[0.003827634543563279, 0.0007640459047594847, ..."
2,01_dubbo,dubbo-spring-boot,"[DevOps, File Transfer Protocol, HTTP client, ...","[0.0024853447246774036, 0.011500804409850242, ..."
3,01_dubbo,dubbo-serialization,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.1949338529501777, 0.014838969973809206, 0.0..."
4,01_dubbo,dubbo-native,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.07855127611486389, 0.171984535082097, 0.271..."
...,...,...,...,...
478,45_archiva-components,spring-apacheds,"[File Transfer Protocol, HTTP server, World Wi...","[0.11701418648043749, 0.2335993974088256, 0.05..."
479,45_archiva-components,spring-registry,"[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0...."
480,45_archiva-components,spring-quartz,"[3D modeling, Web Components, World Wide Web, ...","[0.028540502180797137, 0.014346165183265333, 0..."
481,45_archiva-components,rest-util,"[DevOps, World Wide Web, algorithmic trading, ...","[0.04058421758514396, 0.14341622866869835, 0.0..."


In [112]:
updated_anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   project_name  483 non-null    object
 1   module        483 non-null    object
 2   domains       483 non-null    object
 3   prob_dist     483 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


In [118]:
module_df = topics_df.merge(updated_anno_df, on=['project_name', 'module'])
module_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,prob_dist
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D computer graphics, 3D modeling, Bidirectio...","[0.00022489652756263503, 0.019850776647796192,..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, File Transfer Protocol, H...","[0.0014187991538512738, 0.040202071048378446, ..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D modeling, Bayesian inference, Containeriza...","[0.005673695513894933, 0.0016755344658510142, ..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[['partition', 'context', 'password', 'port', ...","[(0, [('context', 0.21739134), ('create', 0.13...","[['context', 'create', 'dir', 'attribute', 'ba...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.11701418648043749, 0.2335993974088256, 0.05..."
309,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[['cache', 'creator', 'hint', 'factory', 'key'...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[['cache', 'refre', 'wine', 'second', 'never',...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, File Transfer Protocol, HTTP ser...","[0.004910765950067891, 0.020708469498936777, 0..."
310,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[['job', 'context', 'execution', 'map', 'liste...","[(0, [('job', 0.24515942), ('context', 0.16984...","[['job', 'context', 'one', 'execution', 'execu...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, Web Components, World Wide Web, ...","[0.028540502180797137, 0.014346165183265333, 0..."
311,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[['key', 'registry', 'builder', 'add', 'save',...","[(0, [('registry', 0.024453199), ('property', ...","[['registry', 'property', 'change', 'event', '...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0...."


In [119]:
module_df[:5]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,prob_dist
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D computer graphics, 3D modeling, Bidirectio...","[0.00022489652756263503, 0.019850776647796192,..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, File Transfer Protocol, H...","[0.0014187991538512738, 0.040202071048378446, ..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D modeling, Bayesian inference, Containeriza...","[0.005673695513894933, 0.0016755344658510142, ..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."


In [120]:
module_df[-5:]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,prob_dist
308,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[['partition', 'context', 'password', 'port', ...","[(0, [('context', 0.21739134), ('create', 0.13...","[['context', 'create', 'dir', 'attribute', 'ba...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.11701418648043749, 0.2335993974088256, 0.05..."
309,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[['cache', 'creator', 'hint', 'factory', 'key'...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[['cache', 'refre', 'wine', 'second', 'never',...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, File Transfer Protocol, HTTP ser...","[0.004910765950067891, 0.020708469498936777, 0..."
310,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[['job', 'context', 'execution', 'map', 'liste...","[(0, [('job', 0.24515942), ('context', 0.16984...","[['job', 'context', 'one', 'execution', 'execu...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, Web Components, World Wide Web, ...","[0.028540502180797137, 0.014346165183265333, 0..."
311,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[['key', 'registry', 'builder', 'add', 'save',...","[(0, [('registry', 0.024453199), ('property', ...","[['registry', 'property', 'change', 'event', '...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0...."
312,45_archiva-components,spring-taskqueue,3,5,"[(0, [('task', 0.33837277), ('evaluator', 0.12...","[['task', 'evaluator', 'queue', 'executor', 'e...","[(0, [('task', 0.32901022), ('evaluator', 0.10...","[['task', 'evaluator', 'evaluate', 'project', ...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0..."


In [121]:
### separate the doc concepts

module_docs_df = module_df[['project_name', 'doc_topics']].drop_duplicates(ignore_index=True)
module_docs_df[:5]

Unnamed: 0,project_name,doc_topics
0,01_dubbo,"[['service', 'see', 'issue', 'sample', 'projec..."
1,02_skywalking,"[['trace', 'support', 'metric', 'mail', 'nativ..."
2,03_flink,"[['scala', 'processing', 'java', 'intellij', '..."
3,04_rocketmq,"[['software', 'use', 'message', 'run', 'cluste..."
4,05_shardingsphere,"[['database', 'link', 'amp', 'provide', 'suppo..."


## LLM Matching of AD to Concepts

### Using embeddings then calculate semantic similarity to match with domains

- StackOverflow w2v
- text-embedding-ada-002 (openAI + scikit-LLM)

In [137]:
code_concept_list = module_df['code_topics']
test_concept_list = module_df['test_topics']
doc_concept_list = module_docs_df['doc_topics']
prob_dist_list = module_df['prob_dist']
domains_list = module_df['domains']

In [123]:
proj_names = module_df[['project_name', 'module']]
proj_names

Unnamed: 0,project_name,module
0,01_dubbo,dubbo-cluster
1,01_dubbo,dubbo-common
2,01_dubbo,dubbo-compatible
3,01_dubbo,dubbo-config
4,01_dubbo,dubbo-configcenter
...,...,...
308,45_archiva-components,spring-apacheds
309,45_archiva-components,spring-cache
310,45_archiva-components,spring-quartz
311,45_archiva-components,spring-registry


In [124]:
len(domains_list)

313

## StackOverflow W2V

In [125]:
from gensim.models.keyedvectors import KeyedVectors

so_w2v_model = KeyedVectors.load_word2vec_format('C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin', binary=True)

In [126]:
# test
so_w2v_model.n_similarity(['test', 'case'], ['quality'])

0.1252212

In [127]:
def comp_con_domains_sow2v(so_w2v_model, con, domains):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    for dom in domains:
        sim_score = so_w2v_model.n_similarity(con, [dom])
        sim_res.append(sim_score)

    return sim_res

### Match concepts to domains

In [128]:
import ast

sow2v_code_domain_res = []

for i, con in enumerate(code_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_code_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [129]:
sow2v_test_domain_res = []

for i, con in enumerate(test_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_test_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [142]:
sow2v_doc_domain_res = []

for i, con in enumerate(doc_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = module_docs_df.values[i][0]
        # module_name = proj_names.values[i][1]

        sow2v_doc_domain_res.append([proj_name, con_str, curr_domains, curr_prob, sim_res_list])

In [133]:
sow2v_code_domain_res_df = pd.DataFrame(sow2v_code_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_code_sim_score'])
sow2v_code_domain_res_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_code_sim_score
0,01_dubbo,dubbo-cluster,"[merger, model, scope, aware, end, error, merg...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,01_dubbo,dubbo-cluster,"[match, value, bool, exact, weight, result, de...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01_dubbo,dubbo-cluster,"[rule, mesh, listener, app, merger, map, merge...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,01_dubbo,dubbo-cluster,"[rule, key, invoker, node, router, destination...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,01_dubbo,dubbo-cluster,"[url, invoker, invocation, configurator, load,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
2394,45_archiva-components,spring-registry,"[key, save, registry, add, path, builder, prop...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0....","[0.0, 0.0, 0.0, -0.03649073, 0.0, 0.0, -0.0692..."
2395,45_archiva-components,spring-registry,"[key, registry, add, value, path, builder, lis...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0....","[0.0, 0.0, 0.0, -0.03791273, 0.0, 0.0, -0.0609..."
2396,45_archiva-components,spring-taskqueue,"[task, evaluator, queue, executor, entry, viab...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.063066274, 0.0, 0.0, 0...."
2397,45_archiva-components,spring-taskqueue,"[task, message, execution, queue, cause, throw...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.033671085, 0.0, 0.0, 0...."


In [134]:
sow2v_test_domain_res_df = pd.DataFrame(sow2v_test_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_test_sim_score'])
sow2v_test_domain_res_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_test_sim_score
0,01_dubbo,dubbo-cluster,"[invoker, hello, menu, service, load, balance,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,01_dubbo,dubbo-cluster,"[invoker, invocation, url, cluster, mock, resu...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01_dubbo,dubbo-cluster,"[attachment, value, key, invoker, sticky, invo...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,01_dubbo,dubbo-cluster,"[url, model, merge, merger, module, router, pr...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,01_dubbo,dubbo-cluster,"[match, configurator, mock, bool, join, absent...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
2218,45_archiva-components,spring-taskqueue,"[task, evaluator, evaluate, project, build, co...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.072746545, 0.0, 0.0, 0...."
2219,45_archiva-components,spring-taskqueue,"[task, queue, executor, build, project, expect...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.07758436, 0.0, 0.0, 0.0..."
2220,45_archiva-components,spring-taskqueue,"[task, evaluator, exit, evaluate, queue, expec...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.02112553, 0.0, 0.0, 0.0..."
2221,45_archiva-components,spring-taskqueue,"[task, evaluate, entry, evaluator, exit, proje...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.08187002, 0.0, 0.0, 0.0..."


In [143]:
sow2v_doc_domain_res_df = pd.DataFrame(sow2v_doc_domain_res, columns=['project_name', 'concept', 'domains', 'weight_dist', 'sow2v_doc_sim_score'])
sow2v_doc_domain_res_df

Unnamed: 0,project_name,concept,domains,weight_dist,sow2v_doc_sim_score
0,01_dubbo,"[service, see, issue, sample, project, github,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,01_dubbo,"[contribute, issue, sample, project, service, ...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01_dubbo,"[issue, service, project, list, see, github, r...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,01_dubbo,"[service, see, project, build, github, sample,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,01_dubbo,"[see, issue, sample, service, contribute, guid...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
343,45_archiva-components,"[repository, publish, web, mvn, site, use, bui...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.1185900620822211, 0.023437672266698347, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.10967601, 0...."
344,45_archiva-components,"[publish, repository, web, site, build, mvn, c...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.1185900620822211, 0.023437672266698347, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.070115216, 0..."
345,45_archiva-components,"[repository, mvn, publish, build, checkout, si...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.1185900620822211, 0.023437672266698347, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.117256194, 0..."
346,45_archiva-components,"[repository, publish, site, web, mvn, checkout...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.1185900620822211, 0.023437672266698347, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.117256194, 0..."


### scikit-LLM (OpenAI text-embedding-ada-002)

In [136]:
from skllm.models.gpt.vectorization import GPTVectorizer
from skllm.config import SKLLMConfig
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv, find_dotenv
import os

In [144]:
_ = load_dotenv(find_dotenv()) # read local .env file

In [145]:
SKLLMConfig.set_openai_key(os.getenv("API_KEY"))
SKLLMConfig.set_openai_org(os.getenv("ORG"))

In [146]:
model = GPTVectorizer()

vectors = model.fit_transform(["how old are you?", "what is your age?"])

vector_1 = np.array(vectors[0]).reshape(1, -1)
vector_2 = np.array(vectors[1]).reshape(1, -1)

Batch size: 1


100%|██████████| 2/2 [00:01<00:00,  1.33it/s]


In [147]:
vector_1

array([[ 0.01409033, -0.01473137,  0.02737622, ...,  0.01300936,
         0.00062651, -0.0186279 ]])

In [148]:
vector_2

array([[ 0.01086601, -0.01645278,  0.00522158, ...,  0.00467059,
        -0.01438978, -0.0314448 ]])

In [149]:
cosine_similarity(vector_1, vector_2)

array([[0.94791596]])

In [152]:
# convert domains to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# import time
# from numpy import asarray
# from numpy import save

# # dom_vec_list = []
# model = GPTVectorizer()
# ctr = 193
# domains_list_sub = domains_list[ctr:200]

# for dom in domains_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_dom_vec = []
#     for d in dom:
#         sleep_time = 10
#         retries = 5
#         for x in range(0, retries):
#             try:
#                 curr_dom_vec.append(model.fit_transform([d]))
#                 break
#             except Exception as e:
#                 print("retrying...")
#                 time.sleep(sleep_time)

#     curr_dom_vec_arr = asarray(curr_dom_vec)
#     save(f"gpt_dom_vec\\{ctr}.npy", curr_dom_vec_arr)
#     ctr+=1

In [63]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save
# import ast

# model = GPTVectorizer()

# ctr = 0
# code_concept_list_sub = code_concept_list[ctr:100]

# for con_list in code_concept_list_sub: # 428 domains lists (no. of modules) to vectorise
#     curr_code_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_code_vec.append(model.fit_transform([con_str]))

#     curr_code_vec_arr = asarray(curr_code_vec)
#     save(f"gpt_code_vec\\{ctr}.npy", curr_code_vec_arr)
#     ctr+=1

In [64]:
convert concepts to vectors first.
THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

from numpy import asarray
from numpy import save

model = GPTVectorizer()

ctr = 0
test_concept_list_sub = test_concept_list[ctr:100]

for con_list in test_concept_list_sub: # 428 domains lists (no. of modules) to vectorise
    curr_test_vec = []

    for con in con_list:
        curr_test_vec.append(model.fit_transform([con]))

    curr_test_vec_arr = asarray(curr_test_vec)
    save(f"gpt_test_vec\\{ctr}.npy", curr_test_vec_arr)
    ctr+=1

In [None]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save

# model = GPTVectorizer()

# ctr = 0
# test_concept_list_sub = test_concept_list[ctr:]

# for con_list in test_concept_list_sub: # 428 domains lists (no. of modules) to vectorise
#     curr_test_vec = []

#     for con in con_list:
#         curr_test_vec.append(model.fit_transform([con]))

#     curr_test_vec_arr = asarray(curr_test_vec)
#     save(f"gpt_test_vec\\{ctr}.npy", curr_test_vec_arr)
#     ctr+=1

#### load the vectors

In [21]:
from numpy import load

dom_vec_list = []
code_vec_list = []
test_vec_list = []

for i in range(0, len(domains_list)): # 428 modules

    curr_dom_vec = load(f"gpt_vec\\{i}.npy")
    dom_vec_list.append(curr_dom_vec)

for i in range(0, len(domains_list)): # 428 modules

    curr_code_vec = load(f"gpt_code_vec\\{i}.npy")
    code_vec_list.append(curr_code_vec)

for i in range(0, len(domains_list)): # 428 modules

    curr_test_vec = load(f"gpt_test_vec\\{i}.npy")
    test_vec_list.append(curr_test_vec)

In [34]:
def comp_con_domains_gpt(con_vec, dom_vec):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    con_vec = np.array(con_vec).reshape(1, -1)

    for dv in dom_vec:
        dv = np.array(dv).reshape(1, -1)
        sim_score = cosine_similarity(con_vec, dv)
        sim_res.append(sim_score[0][0])

    return sim_res

In [51]:
gpt_code_domain_res = []

for i, code_vec in enumerate(code_vec_list):

    curr_con_list = ast.literal_eval(code_concept_list[i])

    for k, cv in enumerate(code_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_gpt(cv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_code_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [52]:
gpt_test_domain_res = []

for i, test_vec in enumerate(test_vec_list):

    curr_con_list = ast.literal_eval(test_concept_list[i])

    for k, tv in enumerate(test_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_gpt(tv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_test_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [53]:
code_dom_gpt_sim_df = pd.DataFrame(gpt_code_domain_res, columns=['project_name', 'module', 'code concepts', 'domains', 'weight_dist', 'gpt_code_sim_score'])
test_dom_gpt_sim_df = pd.DataFrame(gpt_test_domain_res, columns=['project_name', 'module', 'test concepts', 'domains', 'weight_dist', 'gpt_test_sim_score'])

In [69]:
code_domain_sim_df = sow2v_code_domain_res_df.copy()
code_domain_sim_df['gpt_code_sim_score'] = code_dom_gpt_sim_df['gpt_code_sim_score']
code_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_code_sim_score,gpt_code_sim_score
0,01_dubbo,dubbo-cluster,"[merger, model, scope, aware, end, error, merg...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7311602189366283, 0.7417190548602931, 0.703..."
1,01_dubbo,dubbo-cluster,"[match, value, bool, exact, weight, result, de...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7017166388436102, 0.6982923951177212, 0.706..."
2,01_dubbo,dubbo-cluster,"[rule, mesh, listener, app, merger, map, merge...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6569057015769233, 0.6808267372901955, 0.686..."
3,01_dubbo,dubbo-cluster,"[rule, key, invoker, node, router, destination...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6962194262372328, 0.7197655294394439, 0.713..."
4,01_dubbo,dubbo-cluster,"[url, invoker, invocation, configurator, load,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7120789628683717, 0.7148573131041973, 0.702..."
...,...,...,...,...,...,...,...
3322,45_archiva-components,spring-registry,"[key, save, registry, add, path, builder, prop...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0....","[0.0, 0.0, 0.0, -0.03649073, 0.0, 0.0, -0.0692...","[0.7103701073130371, 0.7162612664178768, 0.708..."
3323,45_archiva-components,spring-registry,"[key, registry, add, value, path, builder, lis...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0....","[0.0, 0.0, 0.0, -0.03791273, 0.0, 0.0, -0.0609...","[0.6683675841386195, 0.6739302997298817, 0.650..."
3324,45_archiva-components,spring-taskqueue,"[task, evaluator, queue, executor, entry, viab...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.063066274, 0.0, 0.0, 0....","[0.7088465085725367, 0.6858999021534108, 0.693..."
3325,45_archiva-components,spring-taskqueue,"[task, message, execution, queue, cause, throw...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.033671085, 0.0, 0.0, 0....","[0.6818282981688972, 0.6697385290057885, 0.677..."


In [70]:
code_domain_sim_df.to_csv("code_domain_sim_df.csv", index=False)

In [71]:
test_domain_sim_df = sow2v_test_domain_res_df.copy()
test_domain_sim_df['gpt_test_sim_score'] = test_dom_gpt_sim_df['gpt_test_sim_score']
test_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_sim_score,gpt_test_sim_score
0,01_dubbo,dubbo-cluster,"[invoker, hello, menu, service, load, balance,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6970149849400307, 0.6991927097576484, 0.688..."
1,01_dubbo,dubbo-cluster,"[invoker, invocation, url, cluster, mock, resu...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7105901433756564, 0.7263724106567162, 0.707..."
2,01_dubbo,dubbo-cluster,"[attachment, value, key, invoker, sticky, invo...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6911908680024789, 0.7036007183990732, 0.702..."
3,01_dubbo,dubbo-cluster,"[url, model, merge, merger, module, router, pr...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6903310848927185, 0.703847739693023, 0.7124..."
4,01_dubbo,dubbo-cluster,"[match, configurator, mock, bool, join, absent...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6864053460660187, 0.6989138347806538, 0.689..."
...,...,...,...,...,...,...,...
2962,45_archiva-components,spring-taskqueue,"[task, evaluator, evaluate, project, build, co...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.072746545, 0.0, 0.0, 0....","[0.6965285681648198, 0.6897861800252568, 0.696..."
2963,45_archiva-components,spring-taskqueue,"[task, queue, executor, build, project, expect...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.07758436, 0.0, 0.0, 0.0...","[0.7134705343452065, 0.7066419716846432, 0.707..."
2964,45_archiva-components,spring-taskqueue,"[task, evaluator, exit, evaluate, queue, expec...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.02112553, 0.0, 0.0, 0.0...","[0.7195561702420139, 0.7138746536780918, 0.716..."
2965,45_archiva-components,spring-taskqueue,"[task, evaluate, entry, evaluator, exit, proje...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.08187002, 0.0, 0.0, 0.0...","[0.7091316903562347, 0.6954968901204565, 0.716..."


In [72]:
test_domain_sim_df.to_csv("test_domain_sim_df.csv", index=False)