## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

## Show topics

In [23]:
import pandas as pd

topics_df = pd.read_csv("topics_res_df.csv")
topics_df.head()

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."


## Process annotated modules

In [2]:
anno_df = pd.read_csv("module_annotation.csv")
anno_sub_df = anno_df[["labels", "distribution"]]
anno_df

Unnamed: 0,project,module,labels,distribution
0,dubbo,dubbo-configcenter,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0358421722524..."
1,dubbo,dubbo-remoting,"['3D computer graphics', '3D modeling', 'Bayes...","[0.003827634543563279, 0.0007640459047594847, ..."
2,dubbo,dubbo-spring-boot,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.002485344724677403..."
3,dubbo,dubbo-serialization,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1949338529501..."
4,dubbo,dubbo-native,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0785512761148..."
...,...,...,...,...
478,archiva-components,spring-apacheds,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1170141864804..."
479,archiva-components,spring-registry,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0535558945145..."
480,archiva-components,spring-quartz,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.028540502180797137, 0.0, 0.0, 0.0, 0.0..."
481,archiva-components,rest-util,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.04058421758514396,..."


In [3]:
# unify the project names
proj_list_b = list(dict.fromkeys(topics_df['project_name']))
proj_list_a = list(dict.fromkeys(anno_df['project']))
to_chg = dict(zip(proj_list_a, proj_list_b))


In [4]:
import ast

updated_rows = []

for row in anno_sub_df.values:

    updated_labels = []
    updated_dist = []
    this_row = []

    labels = row[0]
    values = row[1]

    labels = ast.literal_eval(labels)
    values = ast.literal_eval(values)

    for i, l in enumerate(labels):
        if float(values[i]) > 0:
            this_row.append((l,values[i]))

    for vals in this_row:
        updated_labels.append(vals[0])
        updated_dist.append(vals[1])

    updated_rows.append((updated_labels, updated_dist))

In [5]:
updated_anno_df = pd.concat([anno_df, pd.DataFrame(updated_rows, columns=["domains", "prob_dist"])], axis=1).drop(columns=["labels","distribution"])
updated_anno_df['project_name'] = updated_anno_df['project'].apply(lambda x: to_chg[x])
updated_anno_df['module'] = updated_anno_df['module'].str.replace("modules", "modules\\vfs-class-loader")
updated_anno_df = updated_anno_df[['project_name', 'module', 'domains', 'prob_dist']]
updated_anno_df

Unnamed: 0,project_name,module,domains,prob_dist
0,01_dubbo,dubbo-configcenter,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."
1,01_dubbo,dubbo-remoting,"[3D computer graphics, 3D modeling, File Trans...","[0.003827634543563279, 0.0007640459047594847, ..."
2,01_dubbo,dubbo-spring-boot,"[DevOps, File Transfer Protocol, HTTP client, ...","[0.0024853447246774036, 0.011500804409850242, ..."
3,01_dubbo,dubbo-serialization,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.1949338529501777, 0.014838969973809206, 0.0..."
4,01_dubbo,dubbo-native,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.07855127611486389, 0.171984535082097, 0.271..."
...,...,...,...,...
478,45_archiva-components,spring-apacheds,"[File Transfer Protocol, HTTP server, World Wi...","[0.11701418648043749, 0.2335993974088256, 0.05..."
479,45_archiva-components,spring-registry,"[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0...."
480,45_archiva-components,spring-quartz,"[3D modeling, Web Components, World Wide Web, ...","[0.028540502180797137, 0.014346165183265333, 0..."
481,45_archiva-components,rest-util,"[DevOps, World Wide Web, algorithmic trading, ...","[0.04058421758514396, 0.14341622866869835, 0.0..."


In [6]:
updated_anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   project_name  483 non-null    object
 1   module        483 non-null    object
 2   domains       483 non-null    object
 3   prob_dist     483 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


In [7]:
module_df = topics_df.merge(updated_anno_df, on=['project_name', 'module'])
module_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,prob_dist
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D computer graphics, 3D modeling, Bidirectio...","[0.00022489652756263503, 0.019850776647796192,..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, File Transfer Protocol, H...","[0.0014187991538512738, 0.040202071048378446, ..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D modeling, Bayesian inference, Containeriza...","[0.005673695513894933, 0.0016755344658510142, ..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[['partition', 'context', 'password', 'port', ...","[(0, [('context', 0.21739134), ('create', 0.13...","[['context', 'create', 'dir', 'attribute', 'ba...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.11701418648043749, 0.2335993974088256, 0.05..."
309,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[['cache', 'creator', 'hint', 'factory', 'key'...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[['cache', 'refre', 'wine', 'second', 'never',...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, File Transfer Protocol, HTTP ser...","[0.004910765950067891, 0.020708469498936777, 0..."
310,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[['job', 'context', 'execution', 'map', 'liste...","[(0, [('job', 0.24515942), ('context', 0.16984...","[['job', 'context', 'one', 'execution', 'execu...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, Web Components, World Wide Web, ...","[0.028540502180797137, 0.014346165183265333, 0..."
311,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[['key', 'registry', 'builder', 'add', 'save',...","[(0, [('registry', 0.024453199), ('property', ...","[['registry', 'property', 'change', 'event', '...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0...."


In [8]:
module_df[:5]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,prob_dist
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D computer graphics, 3D modeling, Bidirectio...","[0.00022489652756263503, 0.019850776647796192,..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, File Transfer Protocol, H...","[0.0014187991538512738, 0.040202071048378446, ..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D modeling, Bayesian inference, Containeriza...","[0.005673695513894933, 0.0016755344658510142, ..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."


In [9]:
module_df[-5:]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,prob_dist
308,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[['partition', 'context', 'password', 'port', ...","[(0, [('context', 0.21739134), ('create', 0.13...","[['context', 'create', 'dir', 'attribute', 'ba...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.11701418648043749, 0.2335993974088256, 0.05..."
309,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[['cache', 'creator', 'hint', 'factory', 'key'...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[['cache', 'refre', 'wine', 'second', 'never',...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, File Transfer Protocol, HTTP ser...","[0.004910765950067891, 0.020708469498936777, 0..."
310,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[['job', 'context', 'execution', 'map', 'liste...","[(0, [('job', 0.24515942), ('context', 0.16984...","[['job', 'context', 'one', 'execution', 'execu...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[3D modeling, Web Components, World Wide Web, ...","[0.028540502180797137, 0.014346165183265333, 0..."
311,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[['key', 'registry', 'builder', 'add', 'save',...","[(0, [('registry', 0.024453199), ('property', ...","[['registry', 'property', 'change', 'event', '...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0...."
312,45_archiva-components,spring-taskqueue,3,5,"[(0, [('task', 0.33837277), ('evaluator', 0.12...","[['task', 'evaluator', 'queue', 'executor', 'e...","[(0, [('task', 0.32901022), ('evaluator', 0.10...","[['task', 'evaluator', 'evaluate', 'project', ...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0..."


In [10]:
module_df.to_csv("module_df.csv", index=False)

In [10]:
### separate the doc concepts

module_docs_df = module_df[['project_name', 'doc_topics']].drop_duplicates(ignore_index=True)
module_docs_df[:5]

Unnamed: 0,project_name,doc_topics
0,01_dubbo,"[['service', 'see', 'issue', 'sample', 'projec..."
1,02_skywalking,"[['trace', 'support', 'metric', 'mail', 'nativ..."
2,03_flink,"[['scala', 'processing', 'java', 'intellij', '..."
3,04_rocketmq,"[['software', 'use', 'message', 'run', 'cluste..."
4,05_shardingsphere,"[['database', 'link', 'amp', 'provide', 'suppo..."


## LLM Matching of AD to Concepts

### Using embeddings then calculate semantic similarity to match with domains

- StackOverflow w2v
- text-embedding-ada-002 (openAI + scikit-LLM)

In [11]:
code_concept_list = module_df['code_topics']
test_concept_list = module_df['test_topics']
doc_concept_list = module_df['doc_topics']
prob_dist_list = module_df['prob_dist']
domains_list = module_df['domains']

In [12]:
proj_names = module_df[['project_name', 'module']]
proj_names

Unnamed: 0,project_name,module
0,01_dubbo,dubbo-cluster
1,01_dubbo,dubbo-common
2,01_dubbo,dubbo-compatible
3,01_dubbo,dubbo-config
4,01_dubbo,dubbo-configcenter
...,...,...
308,45_archiva-components,spring-apacheds
309,45_archiva-components,spring-cache
310,45_archiva-components,spring-quartz
311,45_archiva-components,spring-registry


In [13]:
len(domains_list)

313

## StackOverflow W2V

In [14]:
from gensim.models.keyedvectors import KeyedVectors

so_w2v_model = KeyedVectors.load_word2vec_format('C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin', binary=True)

In [15]:
# test
so_w2v_model.n_similarity(['test', 'case'], ['quality'])

0.1252212

In [16]:
def comp_con_domains_sow2v(so_w2v_model, con, domains):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    for dom in domains:
        sim_score = so_w2v_model.n_similarity(con, [dom])
        sim_res.append(sim_score)

    return sim_res

### Match concepts to domains

In [17]:
import ast

sow2v_code_domain_res = []

for i, con in enumerate(code_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_code_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [18]:
sow2v_test_domain_res = []

for i, con in enumerate(test_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_test_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [19]:
sow2v_doc_domain_res = []

for i, con in enumerate(doc_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_doc_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [20]:
sow2v_code_domain_res_df = pd.DataFrame(sow2v_code_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_code_sim_score'])
sow2v_test_domain_res_df = pd.DataFrame(sow2v_test_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_test_sim_score'])
sow2v_doc_domain_res_df = pd.DataFrame(sow2v_doc_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_doc_sim_score'])

### scikit-LLM (OpenAI text-embedding-ada-002)

In [21]:
from skllm.models.gpt.vectorization import GPTVectorizer
from skllm.config import SKLLMConfig
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv, find_dotenv
import os

In [22]:
_ = load_dotenv(find_dotenv()) # read local .env file

In [23]:
SKLLMConfig.set_openai_key(os.getenv("API_KEY"))
SKLLMConfig.set_openai_org(os.getenv("ORG"))

In [24]:
# model = GPTVectorizer()

# vectors = model.fit_transform(["how old are you?", "what is your age?"])

# vector_1 = np.array(vectors[0]).reshape(1, -1)
# vector_2 = np.array(vectors[1]).reshape(1, -1)

In [25]:
# vector_1

In [26]:
# vector_2

In [27]:
# cosine_similarity(vector_1, vector_2)

In [28]:
# convert domains to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# import time
# from numpy import asarray
# from numpy import save

# # dom_vec_list = []
# model = GPTVectorizer()
# ctr = 193
# domains_list_sub = domains_list[ctr:200]

# for dom in domains_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_dom_vec = []
#     for d in dom:
#         sleep_time = 10
#         retries = 5
#         for x in range(0, retries):
#             try:
#                 curr_dom_vec.append(model.fit_transform([d]))
#                 break
#             except Exception as e:
#                 print("retrying...")
#                 time.sleep(sleep_time)

#     curr_dom_vec_arr = asarray(curr_dom_vec)
#     save(f"gpt_dom_vec\\{ctr}.npy", curr_dom_vec_arr)
#     ctr+=1

In [29]:
#convert concepts to vectors first.
#THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save
# import ast

# model = GPTVectorizer()

# ctr = 0
# code_concept_list_sub = code_concept_list[ctr:100]

# for con_list in code_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_code_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_code_vec.append(model.fit_transform([con_str]))

#     curr_code_vec_arr = asarray(curr_code_vec)
#     save(f"gpt_code_vec\\{ctr}.npy", curr_code_vec_arr)
#     ctr+=1

In [30]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save

# model = GPTVectorizer()

# ctr = 0
# test_concept_list_sub = test_concept_list[ctr:50]

# for con_list in test_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_test_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_test_vec.append(model.fit_transform([con_str]))

#     curr_test_vec_arr = asarray(curr_test_vec)
#     save(f"gpt_test_vec\\{ctr}.npy", curr_test_vec_arr)
#     ctr+=1

In [31]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save

# model = GPTVectorizer()

# ctr = 0

# for con_list in doc_concept_list:
#     curr_doc_vec = []
#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_doc_vec.append(model.fit_transform([con_str]))

#     curr_doc_vec_arr = asarray(curr_doc_vec)
#     save(f"gpt_doc_vec\\{ctr}.npy", curr_doc_vec_arr)
#     ctr+=1

#### load the vectors

In [32]:
from numpy import load

dom_vec_list = []
code_vec_list = []
test_vec_list = []

for i in range(0, len(domains_list)): # 313 modules

    curr_dom_vec = load(f"gpt_dom_vec\\{i}.npy")
    dom_vec_list.append(curr_dom_vec)

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"gpt_code_vec\\{i}.npy")
    code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"gpt_test_vec\\{i}.npy")
    test_vec_list.append(curr_test_vec)

In [33]:
# for doc we have to map out the proj/module mapping

proj_id = proj_names['project_name'].str[:2].astype(int).tolist()
len(proj_id)

313

In [34]:
doc_vec_list = []

for pi in proj_id:

    file_id = pi-1

    curr_doc_vec = load(f"gpt_doc_vec\\{str(file_id)}.npy")
    doc_vec_list.append(curr_doc_vec)

In [35]:
len(doc_vec_list)

313

In [36]:
def comp_con_domains_gpt(con_vec, dom_vec):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    con_vec = np.array(con_vec).reshape(1, -1)

    for dv in dom_vec:
        dv = np.array(dv).reshape(1, -1)
        sim_score = cosine_similarity(con_vec, dv)
        sim_res.append(sim_score[0][0])

    return sim_res

In [37]:
gpt_code_domain_res = []

for i, code_vec in enumerate(code_vec_list):

    curr_con_list = ast.literal_eval(code_concept_list[i])

    for k, cv in enumerate(code_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_gpt(cv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_code_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [38]:
gpt_test_domain_res = []

for i, test_vec in enumerate(test_vec_list):

    curr_con_list = ast.literal_eval(test_concept_list[i])

    for k, tv in enumerate(test_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_gpt(tv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_test_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [39]:
gpt_doc_domain_res = []

for i, doc_vec in enumerate(doc_vec_list):

    curr_con_list = ast.literal_eval(module_df['doc_topics'][i])

    for k, dv in enumerate(doc_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_gpt(dv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_doc_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [40]:
code_dom_gpt_sim_df = pd.DataFrame(gpt_code_domain_res, columns=['project_name', 'module', 'code concepts', 'domains', 'weight_dist', 'gpt_code_sim_score'])
test_dom_gpt_sim_df = pd.DataFrame(gpt_test_domain_res, columns=['project_name', 'module', 'test concepts', 'domains', 'weight_dist', 'gpt_test_sim_score'])
doc_dom_gpt_sim_df = pd.DataFrame(gpt_doc_domain_res, columns=['project_name', 'module', 'doc concepts', 'domains', 'weight_dist', 'gpt_doc_sim_score'])

### UAE-Large-V1

In [41]:
import torch
torch.cuda.is_available()

True

In [42]:
import torch
from transformers import AutoModel, AutoTokenizer

def angle_vec(tokenizer, model, input):

    tok = tokenizer([input], return_tensors='pt')

    for k, v in tok.items():
        tok[k] = v.cuda()

    hs = model(**tok).last_hidden_state

    vec = (hs[:, 0] + torch.mean(hs, dim=1)) / 2.0

    return vec.detach().cpu().numpy()

In [43]:
tokenizer = AutoTokenizer.from_pretrained('./UAE-Large-V1')
angle_model = AutoModel.from_pretrained('./UAE-Large-V1').cuda()  
v1 = angle_vec(tokenizer, angle_model, "king")
v2 = angle_vec(tokenizer, angle_model, "queen")

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(v1, v2)

array([[0.7001157]], dtype=float32)

In [45]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save
# import ast

# tokenizer = AutoTokenizer.from_pretrained('./UAE-Large-V1')
# angle_model = AutoModel.from_pretrained('./UAE-Large-V1').cuda()  

# ctr = 0
# code_concept_list_sub = code_concept_list[ctr:]

# for con_list in code_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_code_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_code_vec.append(angle_vec(tokenizer, angle_model, con_str))

#     curr_code_vec_arr = asarray(curr_code_vec)
#     save(f"angle_code_vec\\{ctr}.npy", curr_code_vec_arr)
#     ctr+=1

In [46]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save
# import ast

# # tokenizer = AutoTokenizer.from_pretrained('./UAE-Large-V1')
# # angle_model = AutoModel.from_pretrained('./UAE-Large-V1').cuda()  

# ctr = 0
# test_concept_list_sub = test_concept_list[ctr:]

# for con_list in test_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_test_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_test_vec.append(angle_vec(tokenizer, angle_model, con_str))

#     curr_test_vec_arr = asarray(curr_test_vec)
#     save(f"angle_test_vec\\{ctr}.npy", curr_test_vec_arr)
#     ctr+=1

In [50]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save
# import ast

# ctr = 0

# for con_list in doc_concept_list:
#     curr_doc_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_doc_vec.append(angle_vec(tokenizer, angle_model, con_str))

#     curr_doc_vec_arr = asarray(curr_doc_vec)
#     save(f"angle_doc_vec\\{ctr}.npy", curr_doc_vec_arr)
#     ctr+=1

In [48]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save

# ctr = 216
# domains_list_sub = domains_list[ctr:]

# for dom in domains_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_dom_vec = []
#     for d in dom:
#         curr_dom_vec.append(angle_vec(tokenizer, angle_model, d))

#     curr_dom_vec_arr = asarray(curr_dom_vec)
#     save(f"angle_dom_vec\\{ctr}.npy", curr_dom_vec_arr)
#     ctr+=1

#### load the vectors

In [75]:
from numpy import load

dom_vec_list = []
code_vec_list = []
test_vec_list = []
doc_vec_list = []

for i in range(0, len(domains_list)): # 313 modules

    curr_dom_vec = load(f"angle_dom_vec\\{i}.npy")
    dom_vec_list.append(curr_dom_vec)

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"angle_code_vec\\{i}.npy")
    code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"angle_test_vec\\{i}.npy")
    test_vec_list.append(curr_test_vec)

for i in range(0, len(doc_concept_list)): # 313 modules

    curr_doc_vec = load(f"angle_doc_vec\\{i}.npy")
    doc_vec_list.append(curr_doc_vec)

In [71]:
def comp_con_domains_angle(con_vec, dom_vec):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    for dv in dom_vec:
        sim_score = cosine_similarity(con_vec, dv)
        sim_res.append(sim_score[0][0])

    return sim_res

In [72]:
angle_code_domain_res = []

for i, code_vec in enumerate(code_vec_list):

    curr_con_list = ast.literal_eval(code_concept_list[i])

    for k, cv in enumerate(code_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_angle(cv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        angle_code_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [73]:
angle_test_domain_res = []

for i, test_vec in enumerate(test_vec_list):

    curr_con_list = ast.literal_eval(test_concept_list[i])

    for k, tv in enumerate(test_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_angle(tv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        angle_test_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [76]:
angle_doc_domain_res = []

for i, doc_vec in enumerate(doc_vec_list):

    curr_con_list = ast.literal_eval(doc_concept_list[i])

    for k, dv in enumerate(doc_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = prob_dist_list[i]

        sim_res_list = comp_con_domains_angle(dv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        angle_doc_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [77]:
code_dom_angle_sim_df = pd.DataFrame(angle_code_domain_res, columns=['project_name', 'module', 'code concepts', 'domains', 'weight_dist', 'angle_code_sim_score'])
test_dom_angle_sim_df = pd.DataFrame(angle_test_domain_res, columns=['project_name', 'module', 'test concepts', 'domains', 'weight_dist', 'angle_test_sim_score'])
doc_dom_angle_sim_df = pd.DataFrame(angle_doc_domain_res, columns=['project_name', 'module', 'doc concepts', 'domains', 'weight_dist', 'angle_doc_sim_score'])

#### Merge all sim scores

In [78]:
code_domain_sim_df = sow2v_code_domain_res_df.copy()
code_domain_sim_df['gpt_code_sim_score'] = code_dom_gpt_sim_df['gpt_code_sim_score']
code_domain_sim_df['angle_code_sim_score'] = code_dom_angle_sim_df['angle_code_sim_score']
code_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_code_sim_score,gpt_code_sim_score,angle_code_sim_score
0,01_dubbo,dubbo-cluster,"[merger, model, scope, aware, end, error, merg...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7311602189366283, 0.7417190548602931, 0.703...","[0.5299579, 0.56805325, 0.4777574, 0.44822916,..."
1,01_dubbo,dubbo-cluster,"[match, value, bool, exact, weight, result, de...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7016929118277905, 0.6982882861146025, 0.706...","[0.54784065, 0.46327564, 0.5400166, 0.5001681,..."
2,01_dubbo,dubbo-cluster,"[rule, mesh, listener, app, merger, map, merge...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6558771698809155, 0.6792273271454538, 0.685...","[0.46307766, 0.5018077, 0.4271498, 0.41547033,..."
3,01_dubbo,dubbo-cluster,"[rule, key, invoker, node, router, destination...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6962194262372328, 0.7197655294394439, 0.713...","[0.44639945, 0.49696502, 0.51973915, 0.5151202..."
4,01_dubbo,dubbo-cluster,"[url, invoker, invocation, configurator, load,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7120789628683717, 0.7148573131041973, 0.702...","[0.4117705, 0.52084935, 0.48110706, 0.579143, ..."
...,...,...,...,...,...,...,...,...
2394,45_archiva-components,spring-registry,"[key, save, registry, add, path, builder, prop...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0....","[0.0, 0.0, 0.0, -0.03649073, 0.0, 0.0, -0.0692...","[0.7103701073130371, 0.7162612664178768, 0.708...","[0.47138402, 0.43102366, 0.473483, 0.40904754,..."
2395,45_archiva-components,spring-registry,"[key, registry, add, value, path, builder, lis...","[File Transfer Protocol, HTTP server, World Wi...","[0.05355589451454959, 0.030031248756518632, 0....","[0.0, 0.0, 0.0, -0.03791273, 0.0, 0.0, -0.0609...","[0.6681940471261923, 0.6737569169317236, 0.650...","[0.47591305, 0.4573872, 0.48915863, 0.4374768,..."
2396,45_archiva-components,spring-taskqueue,"[task, evaluator, queue, executor, entry, viab...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.063066274, 0.0, 0.0, 0....","[0.7087320067665482, 0.6858538376084059, 0.693...","[0.46891826, 0.46494, 0.44505328, 0.40470618, ..."
2397,45_archiva-components,spring-taskqueue,"[task, message, execution, queue, cause, throw...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.033671085, 0.0, 0.0, 0....","[0.6818282981688972, 0.6697385290057885, 0.677...","[0.44870278, 0.46203914, 0.45386016, 0.435794,..."


In [79]:
code_domain_sim_df.to_csv("sim_code_domain_df.csv", index=False)

In [80]:
test_domain_sim_df = sow2v_test_domain_res_df.copy()
test_domain_sim_df['gpt_test_sim_score'] = test_dom_gpt_sim_df['gpt_test_sim_score']
test_domain_sim_df['angle_test_sim_score'] = test_dom_angle_sim_df['angle_test_sim_score']
test_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_test_sim_score,gpt_test_sim_score,angle_test_sim_score
0,01_dubbo,dubbo-cluster,"[invoker, hello, menu, service, load, balance,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6969462948905291, 0.699122397759385, 0.6885...","[0.46962628, 0.51499, 0.4604619, 0.5574089, 0...."
1,01_dubbo,dubbo-cluster,"[invoker, invocation, url, cluster, mock, resu...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7105901433756564, 0.7263724106567162, 0.707...","[0.42823136, 0.49603176, 0.4982496, 0.57298434..."
2,01_dubbo,dubbo-cluster,"[attachment, value, key, invoker, sticky, invo...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6911600078814237, 0.7035368565205871, 0.702...","[0.40690482, 0.49188542, 0.5312029, 0.56280786..."
3,01_dubbo,dubbo-cluster,"[url, model, merge, merger, module, router, pr...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6902347290707492, 0.7037540389490645, 0.712...","[0.52358633, 0.5582562, 0.5021834, 0.5214653, ..."
4,01_dubbo,dubbo-cluster,"[match, configurator, mock, bool, join, absent...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6863980044930774, 0.6988732122595529, 0.689...","[0.4701079, 0.47720772, 0.42111737, 0.5253796,..."
...,...,...,...,...,...,...,...,...
2218,45_archiva-components,spring-taskqueue,"[task, evaluator, evaluate, project, build, co...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.072746545, 0.0, 0.0, 0....","[0.6963655362704815, 0.689622213606539, 0.6963...","[0.45968315, 0.45377582, 0.530491, 0.47489896,..."
2219,45_archiva-components,spring-taskqueue,"[task, queue, executor, build, project, expect...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.07758436, 0.0, 0.0, 0.0...","[0.713515348347259, 0.706661825473746, 0.70759...","[0.46313748, 0.46530777, 0.45497382, 0.4139059..."
2220,45_archiva-components,spring-taskqueue,"[task, evaluator, exit, evaluate, queue, expec...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.02112553, 0.0, 0.0, 0.0...","[0.7195561702420139, 0.7138746536780918, 0.716...","[0.45056364, 0.45415366, 0.46568096, 0.4344633..."
2221,45_archiva-components,spring-taskqueue,"[task, evaluate, entry, evaluator, exit, proje...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.08187002, 0.0, 0.0, 0.0...","[0.7091316903562347, 0.6954968901204565, 0.716...","[0.47785676, 0.47983515, 0.49708745, 0.4551291..."


In [81]:
test_domain_sim_df.to_csv("sim_test_domain_df.csv", index=False)

In [82]:
doc_domain_sim_df = sow2v_doc_domain_res_df.copy()
doc_domain_sim_df['gpt_doc_sim_score'] = doc_dom_gpt_sim_df['gpt_doc_sim_score']
doc_domain_sim_df['angle_doc_sim_score'] = doc_dom_angle_sim_df['angle_doc_sim_score']
doc_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_doc_sim_score,gpt_doc_sim_score,angle_doc_sim_score
0,01_dubbo,dubbo-cluster,"[service, see, issue, sample, project, github,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7151548075641246, 0.7452139517589933, 0.726...","[0.4223302, 0.54775983, 0.4744687, 0.528439, 0..."
1,01_dubbo,dubbo-cluster,"[contribute, issue, sample, project, service, ...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7262556385432636, 0.7367756398724646, 0.720...","[0.38913843, 0.53583735, 0.45912963, 0.4984625..."
2,01_dubbo,dubbo-cluster,"[issue, service, project, list, see, github, r...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7266753311234634, 0.7361773056848603, 0.727...","[0.4397305, 0.5552181, 0.4954759, 0.53429127, ..."
3,01_dubbo,dubbo-cluster,"[service, see, project, build, github, sample,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7181566198682772, 0.7366335309908381, 0.725...","[0.39362937, 0.5181406, 0.4197607, 0.5035975, ..."
4,01_dubbo,dubbo-cluster,"[see, issue, sample, service, contribute, guid...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7274142866301703, 0.7355761559871075, 0.720...","[0.45590687, 0.56785774, 0.5061829, 0.53482467..."
...,...,...,...,...,...,...,...,...
2294,45_archiva-components,spring-taskqueue,"[repository, publish, web, mvn, site, use, bui...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.26086873, 0.0, 0.0, 0.0...","[0.745366006597648, 0.7616155131537632, 0.7709...","[0.5224836, 0.49912876, 0.57825845, 0.54174364..."
2295,45_archiva-components,spring-taskqueue,"[publish, repository, web, site, build, mvn, c...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.31279874, 0.0, 0.0, 0.0...","[0.7540390295412137, 0.7645617605796236, 0.766...","[0.54529005, 0.53203326, 0.6424372, 0.58816844..."
2296,45_archiva-components,spring-taskqueue,"[repository, mvn, publish, build, checkout, si...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.3412402, 0.0, 0.0, 0.0,...","[0.7577008045568522, 0.766975947736308, 0.7863...","[0.52580225, 0.49857324, 0.633036, 0.5670292, ..."
2297,45_archiva-components,spring-taskqueue,"[repository, publish, site, web, mvn, checkout...","[HTTP client, HTTP server, Web Components, Wor...","[0.024542570908181953, 0.021938531171453578, 0...","[0.0, 0.0, 0.0, 0.0, 0.3412402, 0.0, 0.0, 0.0,...","[0.7578298813154625, 0.7653744358564094, 0.783...","[0.5275348, 0.5032817, 0.6318431, 0.5592589, 0..."


In [83]:
doc_domain_sim_df.to_csv("sim_doc_domain_df.csv", index=False)

## Represent concepts in a vector space (RQ1)

In [26]:
import pandas as pd

code_domain_sim_df = pd.read_csv("sim_code_domain_df.csv")
test_domain_sim_df = pd.read_csv("sim_test_domain_df.csv")
doc_domain_sim_df = pd.read_csv("sim_doc_domain_df.csv")

In [27]:
topics_df = pd.read_csv("topics_res_df.csv")
doc_num_list = topics_df[["project_name", "doc_num_topics"]].drop_duplicates(ignore_index=True)['doc_num_topics'].tolist()

In [28]:
new_doc_df_list = []

for prefix in range(1,46):
    if prefix<10:
        prefix = '0' + str(prefix)
    new_doc_df_list.append(doc_domain_sim_df[doc_domain_sim_df['project_name'].str.startswith(str(prefix))][['project_name', 'concept']][:doc_num_list[int(prefix)-1]])

In [29]:
new_doc_df = pd.concat(new_doc_df_list, ignore_index=True)
new_doc_df

Unnamed: 0,project_name,concept
0,01_dubbo,"['service', 'see', 'issue', 'sample', 'project..."
1,01_dubbo,"['contribute', 'issue', 'sample', 'project', '..."
2,01_dubbo,"['issue', 'service', 'project', 'list', 'see',..."
3,01_dubbo,"['service', 'see', 'project', 'build', 'github..."
4,01_dubbo,"['see', 'issue', 'sample', 'service', 'contrib..."
...,...,...
343,45_archiva-components,"['repository', 'publish', 'web', 'mvn', 'site'..."
344,45_archiva-components,"['publish', 'repository', 'web', 'site', 'buil..."
345,45_archiva-components,"['repository', 'mvn', 'publish', 'build', 'che..."
346,45_archiva-components,"['repository', 'publish', 'site', 'web', 'mvn'..."


In [41]:
module_df = pd.read_csv("module_df.csv")
code_concept_list = module_df['code_topics']
test_concept_list = module_df['test_topics']
doc_concept_list = module_df['doc_topics']
prob_dist_list = module_df['prob_dist']
domains_list = module_df['domains']

In [42]:
gpt_doc_concept_list = doc_concept_list.drop_duplicates(ignore_index=True)

In [43]:
from numpy import load

gpt_code_vec_list = []
gpt_test_vec_list = []
gpt_doc_vec_list = []
angle_code_vec_list = []
angle_test_vec_list = []
angle_doc_vec_list = []

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"gpt_code_vec\\{i}.npy")
    gpt_code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"gpt_test_vec\\{i}.npy")
    gpt_test_vec_list.append(curr_test_vec)

for i in range(0, len(gpt_doc_concept_list)):

    curr_doc_vec = load(f"gpt_doc_vec\\{i}.npy")
    gpt_doc_vec_list.append(curr_doc_vec)

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"angle_code_vec\\{i}.npy")
    angle_code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"angle_test_vec\\{i}.npy")
    angle_test_vec_list.append(curr_test_vec)

for i in range(0, len(doc_concept_list)):

    curr_doc_vec = load(f"angle_doc_vec\\{i}.npy")
    angle_doc_vec_list.append(curr_doc_vec)

In [44]:
gpt_code_flattened_vec_list = [x for sub in gpt_code_vec_list for x in sub]
angle_code_flattened_vec_list = [x for sub in angle_code_vec_list for x in sub]

gpt_code_flattened_vec_list = [x[0] for x in gpt_code_flattened_vec_list]
angle_code_flattened_vec_list = [x[0] for x in angle_code_flattened_vec_list]

code_domain_sim_df["gpt_code_vec"] = gpt_code_flattened_vec_list
code_domain_sim_df["angle_code_vec"] = angle_code_flattened_vec_list

code_vec_df = code_domain_sim_df[["project_name", "module", "concept", "gpt_code_vec", "angle_code_vec"]]
code_vec_df

Unnamed: 0,project_name,module,concept,gpt_code_vec,angle_code_vec
0,01_dubbo,dubbo-cluster,"['merger', 'model', 'scope', 'aware', 'end', '...","[-0.026622535660862923, 0.0065000769682228565,...","[-0.18775713, -0.37221402, -0.102094874, 0.436..."
1,01_dubbo,dubbo-cluster,"['match', 'value', 'bool', 'exact', 'weight', ...","[-0.019484085962176323, -0.0039446307346224785...","[0.03253293, 0.5728816, 0.579907, 0.67069566, ..."
2,01_dubbo,dubbo-cluster,"['rule', 'mesh', 'listener', 'app', 'merger', ...","[-0.02054414711892605, 0.010302042588591576, 0...","[-0.019751057, -0.33930963, -0.1023085, 0.2829..."
3,01_dubbo,dubbo-cluster,"['rule', 'key', 'invoker', 'node', 'router', '...","[-0.014745050109922886, -0.01198360975831747, ...","[-0.553166, -0.27252257, -0.06934874, 0.168628..."
4,01_dubbo,dubbo-cluster,"['url', 'invoker', 'invocation', 'configurator...","[0.004753707442432642, -0.000175158231286332, ...","[-0.49914038, -0.24755313, -0.028283862, 0.168..."
...,...,...,...,...,...
2394,45_archiva-components,spring-registry,"['key', 'save', 'registry', 'add', 'path', 'bu...","[0.003726670518517494, 0.006566038820892572, 0...","[0.04506687, 0.37849292, -0.23495513, -0.19402..."
2395,45_archiva-components,spring-registry,"['key', 'registry', 'add', 'value', 'path', 'b...","[-0.0022075516171753407, -0.011396508663892746...","[0.0833093, 0.3905984, -0.40362155, -0.0959535..."
2396,45_archiva-components,spring-taskqueue,"['task', 'evaluator', 'queue', 'executor', 'en...","[-0.012877748347818851, -0.009656411595642567,...","[-0.5880894, -0.05917392, -0.44300538, 0.04839..."
2397,45_archiva-components,spring-taskqueue,"['task', 'message', 'execution', 'queue', 'cau...","[-0.02647557482123375, -0.02341029793024063, -...","[-0.8051155, -0.54449457, -0.28186893, 0.86290..."


In [34]:
gpt_test_flattened_vec_list = [x for sub in gpt_test_vec_list for x in sub]
angle_test_flattened_vec_list = [x for sub in angle_test_vec_list for x in sub]

gpt_test_flattened_vec_list = [x[0] for x in gpt_test_flattened_vec_list]
angle_test_flattened_vec_list = [x[0] for x in angle_test_flattened_vec_list]

test_domain_sim_df["gpt_test_vec"] = gpt_test_flattened_vec_list
test_domain_sim_df["angle_test_vec"] = angle_test_flattened_vec_list

test_vec_df = test_domain_sim_df[["project_name", "module", "concept", "gpt_test_vec", "angle_test_vec"]]
test_vec_df

Unnamed: 0,project_name,module,concept,gpt_test_vec,angle_test_vec
0,01_dubbo,dubbo-cluster,"['invoker', 'hello', 'menu', 'service', 'load'...","[-0.015958335250616074, -0.015475192107260227,...","[0.014179217, -0.26719624, -0.15324931, -0.072..."
1,01_dubbo,dubbo-cluster,"['invoker', 'invocation', 'url', 'cluster', 'm...","[-0.0025778282433748245, -0.011829978786408901...","[-0.25283864, -0.16967408, -0.18421198, -0.022..."
2,01_dubbo,dubbo-cluster,"['attachment', 'value', 'key', 'invoker', 'sti...","[-0.03428419679403305, -0.0033837941009551287,...","[-0.4825757, -0.3559093, -0.28272846, 0.468342..."
3,01_dubbo,dubbo-cluster,"['url', 'model', 'merge', 'merger', 'module', ...","[-0.0011146004544571042, 0.014147793874144554,...","[0.40440214, -0.44396412, 0.09869711, -0.20433..."
4,01_dubbo,dubbo-cluster,"['match', 'configurator', 'mock', 'bool', 'joi...","[-0.022632991895079613, -0.01612619124352932, ...","[-0.31522655, -0.66237473, 0.0850316, 0.209996..."
...,...,...,...,...,...
2218,45_archiva-components,spring-taskqueue,"['task', 'evaluator', 'evaluate', 'project', '...","[-0.008565530180931091, -0.01933959499001503, ...","[-0.31950122, -0.025664303, -0.66371596, -0.07..."
2219,45_archiva-components,spring-taskqueue,"['task', 'queue', 'executor', 'build', 'projec...","[-0.029290348291397095, -0.0031447862274944782...","[-0.88024205, 0.12703937, -0.5504483, -0.00638..."
2220,45_archiva-components,spring-taskqueue,"['task', 'evaluator', 'exit', 'evaluate', 'que...","[-0.016612650826573372, -0.009333686903119087,...","[-0.71355224, -0.14168549, -0.6765636, -0.0947..."
2221,45_archiva-components,spring-taskqueue,"['task', 'evaluate', 'entry', 'evaluator', 'ex...","[-0.017499882727861404, -0.0047294944524765015...","[-0.29466522, -0.017070707, -0.49079585, -0.34..."


In [35]:
gpt_doc_flattened_vec_list = [x for sub in gpt_doc_vec_list for x in sub] # 45 vecs
angle_doc_flattened_vec_list = [x for sub in angle_doc_vec_list for x in sub] # 313 vecs

gpt_doc_flattened_vec_list = [x[0] for x in gpt_doc_flattened_vec_list]
angle_doc_flattened_vec_list = [x[0] for x in angle_doc_flattened_vec_list]

new_doc_df["gpt_doc_vec"] = gpt_doc_flattened_vec_list
doc_domain_sim_df["angle_doc_vec"] = angle_doc_flattened_vec_list
doc_domain_sim_df = doc_domain_sim_df[["project_name", "concept", "angle_doc_vec"]]

new_doc_df_list = []

for prefix in range(1,46):
    if prefix<10:
        prefix = '0' + str(prefix)
    new_doc_df_list.append(doc_domain_sim_df[doc_domain_sim_df['project_name'].str.startswith(str(prefix))][['project_name', 'concept', 'angle_doc_vec']][:doc_num_list[int(prefix)-1]])

new_angle_doc_df = pd.concat(new_doc_df_list, ignore_index=True)
doc_vec_df = new_doc_df.copy()
doc_vec_df["angle_doc_vec"] = new_angle_doc_df["angle_doc_vec"]
doc_vec_df

Unnamed: 0,project_name,concept,gpt_doc_vec,angle_doc_vec
0,01_dubbo,"['service', 'see', 'issue', 'sample', 'project...","[0.005654162261635065, -0.0016180375823751092,...","[-0.2622697, -0.02699811, -0.11021085, -0.2081..."
1,01_dubbo,"['contribute', 'issue', 'sample', 'project', '...","[-0.0033876176457852125, -0.01359404157847166,...","[-0.42579025, -0.19378607, -0.31018537, -0.093..."
2,01_dubbo,"['issue', 'service', 'project', 'list', 'see',...","[-0.005690885242074728, -0.01893075555562973, ...","[-0.3254578, -0.15225579, -0.40020263, 0.05321..."
3,01_dubbo,"['service', 'see', 'project', 'build', 'github...","[0.0017545023001730442, -0.0064876810647547245...","[-0.09142876, -0.45110118, -0.17707597, -0.150..."
4,01_dubbo,"['see', 'issue', 'sample', 'service', 'contrib...","[0.003152343910187483, 0.0006888388306833804, ...","[-0.38099384, 0.13213581, -0.082547046, -0.096..."
...,...,...,...,...
343,45_archiva-components,"['repository', 'publish', 'web', 'mvn', 'site'...","[-0.0015625524101778865, -0.001334094093181192...","[-0.98970336, 0.5057261, -0.31873137, 0.095206..."
344,45_archiva-components,"['publish', 'repository', 'web', 'site', 'buil...","[-0.0021771923638880253, -0.011147775687277317...","[-0.74474394, 0.029094877, -0.48263723, 0.2167..."
345,45_archiva-components,"['repository', 'mvn', 'publish', 'build', 'che...","[-0.0009208479314111173, -0.000892394746188074...","[-0.77469325, 0.3202895, -0.4091273, 0.5168214..."
346,45_archiva-components,"['repository', 'publish', 'site', 'web', 'mvn'...","[0.0018515546107664704, -0.003041225718334317,...","[-0.76814437, 0.21244615, -0.49484473, 0.26931..."


In [45]:
import plotly.express as px
from sklearn.manifold import TSNE
import numpy as np

tsne_model = TSNE(
    n_components = 2,
    perplexity = 15,
    random_state = 42,
    init = 'random',
    learning_rate = 200
)

emb_arr = np.array(code_vec_df["gpt_code_vec"].to_list())

tsne_embeddings = tsne_model.fit_transform(emb_arr)

gpt_code_data = pd.DataFrame(
    {'x': tsne_embeddings[:, 0],
     'y': tsne_embeddings[:, 1]}
)

# Create the scatter plot using Plotly Express
plot = px.scatter(
    gpt_code_data,
    x = 'x',
    y = 'y',
    opacity = 0.3
)

plot.update_layout(
    width = 650,
    height = 650
)

plot.show()

In [47]:
tsne_model = TSNE(
    n_components = 2,
    perplexity = 15,
    random_state = 42,
    init = 'random',
    learning_rate = 200
)

emb_arr = np.array(test_vec_df["gpt_test_vec"].to_list())

tsne_embeddings = tsne_model.fit_transform(emb_arr)

gpt_code_data = pd.DataFrame(
    {'x': tsne_embeddings[:, 0],
     'y': tsne_embeddings[:, 1]}
)

# Create the scatter plot using Plotly Express
plot = px.scatter(
    gpt_code_data,
    x = 'x',
    y = 'y',
    opacity = 0.3
)

plot.update_layout(
    width = 650,
    height = 650
)

plot.show()

In [48]:
tsne_model = TSNE(
    n_components = 2,
    perplexity = 15,
    random_state = 42,
    init = 'random',
    learning_rate = 200
)

emb_arr = np.array(doc_vec_df["gpt_doc_vec"].to_list())

tsne_embeddings = tsne_model.fit_transform(emb_arr)

gpt_code_data = pd.DataFrame(
    {'x': tsne_embeddings[:, 0],
     'y': tsne_embeddings[:, 1]}
)

# Create the scatter plot using Plotly Express
plot = px.scatter(
    gpt_code_data,
    x = 'x',
    y = 'y',
    opacity = 0.3
)

plot.update_layout(
    width = 650,
    height = 650
)

plot.show()