## Vectorising concepts and application domains

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

## Show topics

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

topics_df = pd.read_csv("topics_res_df.csv")
topics_df.head()

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."


In [2]:
topics_df.tail()

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
156,15_zeppelin,zeppelin-jupyter,6,1,"[(0, [('paragraph', 0.15356873), ('author', 0....","[['paragraph', 'author', 'kernelspec', 'langua...","[(0, [('note', 0.2), ('format', 0.13333333), (...","[['note', 'format', 'jupyter', 'nbformat', 'te...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p..."
157,15_zeppelin,zeppelin-jupyter-interpreter,4,8,"[(0, [('predicate', 0.26850826), ('python', 0....","[['predicate', 'python', 'request', 'kernel', ...","[(0, [('interpreter', 0.053031124), ('context'...","[['interpreter', 'context', 'resource', 'pool'...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p..."
158,15_zeppelin,zeppelin-plugins,7,8,"[(0, [('pod', 0.05951644), ('interpreter', 0.0...","[['pod', 'interpreter', 'spark', 'port', 'driv...","[(0, [('phase', 0.097639434), ('server', 0.097...","[['phase', 'server', 'kubernetes', 'interrupt'...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p..."
159,15_zeppelin,zeppelin-server,8,9,"[(0, [('session', 0.10803968), ('response', 0....","[['session', 'response', 'status', 'interprete...","[(0, [('note', 0.08035307), ('cluster', 0.0790...","[['note', 'cluster', 'service', 'event', 'note...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p..."
160,15_zeppelin,zeppelin-zengine,9,9,"[(0, [('path', 0.093719326), ('info', 0.083396...","[['path', 'info', 'notebook', 'save', 'setting...","[(0, [('interpreter', 0.09700542), ('storage',...","[['interpreter', 'storage', 'valid', 'repo', '...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p..."


## Process annotated modules

In [9]:
anno_df = pd.read_csv("module_annotation.csv")
anno_sub_df = anno_df[["labels", "distribution"]]
anno_df

Unnamed: 0,project,module,labels,distribution
0,dubbo,dubbo-configcenter,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0358421722524..."
1,dubbo,dubbo-remoting,"['3D computer graphics', '3D modeling', 'Bayes...","[0.003827634543563279, 0.0007640459047594847, ..."
2,dubbo,dubbo-spring-boot,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.002485344724677403..."
3,dubbo,dubbo-serialization,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1949338529501..."
4,dubbo,dubbo-native,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0785512761148..."
...,...,...,...,...
257,zeppelin,helium-dev,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0720749970156..."
258,zeppelin,zeppelin-server,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.005797929594438749, 0.0..."
259,zeppelin,jdbc,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0448625332560..."
260,zeppelin,shell,"['3D computer graphics', '3D modeling', 'Bayes...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0958522928349..."


In [10]:
# unify the project names
proj_list_b = list(dict.fromkeys(topics_df['project_name']))
proj_list_a = list(dict.fromkeys(anno_df['project']))
to_chg = dict(zip(proj_list_a, proj_list_b))


In [11]:
to_chg

{'dubbo': '01_dubbo',
 'skywalking': '02_skywalking',
 'flink': '03_flink',
 'rocketmq': '04_rocketmq',
 'shardingsphere': '05_shardingsphere',
 'hadoop': '06_hadoop',
 'druid': '07_druid',
 'pulsar': '08_pulsar',
 'zookeeper': '09_zookeeper',
 'dolphinscheduler': '10_dolphinscheduler',
 'shardingsphere-elasticjob': '11_shardingsphere-elasticjob',
 'shenyu': '12_shenyu',
 'tomcat': '13_tomcat',
 'storm': '14_storm',
 'zeppelin': '15_zeppelin'}

In [12]:
import ast

updated_rows = []

for row in anno_sub_df.values:

    updated_labels = []
    updated_dist = []
    this_row = []

    labels = row[0]
    values = row[1]

    labels = ast.literal_eval(labels)
    values = ast.literal_eval(values)

    for i, l in enumerate(labels):
        if float(values[i]) > 0:
            this_row.append((l,values[i]))

    for vals in this_row:
        updated_labels.append(vals[0])
        updated_dist.append(vals[1])

    updated_rows.append((updated_labels, updated_dist))

In [13]:
updated_anno_df = pd.concat([anno_df, pd.DataFrame(updated_rows, columns=["domains", "weight_dist"])], axis=1).drop(columns=["labels","distribution"])
updated_anno_df['project_name'] = updated_anno_df['project'].apply(lambda x: to_chg[x] if x in to_chg else x)
updated_anno_df['module'] = updated_anno_df['module'].str.replace("modules", "modules\\vfs-class-loader")
updated_anno_df = updated_anno_df[['project_name', 'module', 'domains', 'weight_dist']]
updated_anno_df

Unnamed: 0,project_name,module,domains,weight_dist
0,01_dubbo,dubbo-configcenter,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."
1,01_dubbo,dubbo-remoting,"[3D computer graphics, 3D modeling, File Trans...","[0.003827634543563279, 0.0007640459047594847, ..."
2,01_dubbo,dubbo-spring-boot,"[DevOps, File Transfer Protocol, HTTP client, ...","[0.0024853447246774036, 0.011500804409850242, ..."
3,01_dubbo,dubbo-serialization,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.1949338529501777, 0.014838969973809206, 0.0..."
4,01_dubbo,dubbo-native,"[File Transfer Protocol, HTTP client, HTTP ser...","[0.07855127611486389, 0.171984535082097, 0.271..."
...,...,...,...,...
257,15_zeppelin,helium-dev,"[File Transfer Protocol, World Wide Web, analy...","[0.07207499701564471, 0.018018749253911177, 0...."
258,15_zeppelin,zeppelin-server,"[Containerization, DevOps, File Transfer Proto...","[0.005797929594438749, 0.0014377855269520061, ..."
259,15_zeppelin,jdbc,"[File Transfer Protocol, HTTP server, WebSocke...","[0.044862533256027314, 0.02102187412956304, 0...."
260,15_zeppelin,shell,"[File Transfer Protocol, HTTP server, WebSocke...","[0.09585229283498493, 0.08504810822949835, 0.0..."


In [14]:
updated_anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   project_name  262 non-null    object
 1   module        262 non-null    object
 2   domains       262 non-null    object
 3   weight_dist   262 non-null    object
dtypes: object(4)
memory usage: 8.3+ KB


In [15]:
module_df = topics_df.merge(updated_anno_df, on=['project_name', 'module'])

In [16]:
module_df[:5]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,weight_dist
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['invoker', 'hello', 'menu', 'service', 'load...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['address', 'country', 'phone', 'full', 'size...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D computer graphics, 3D modeling, Bidirectio...","[0.00022489652756263503, 0.019850776647796192,..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['consumer', 'service', 'argument', 'applicat...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[Bayesian inference, File Transfer Protocol, H...","[0.0014187991538512738, 0.040202071048378446, ..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['box', 'service', 'demo', 'user', 'say', 'im...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[3D modeling, Bayesian inference, Containeriza...","[0.005673695513894933, 0.0016755344658510142, ..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['namespace', 'map', 'context', 'mock', 'prop...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.035842172252428264, 0.040560328491004986, 0..."


In [17]:
module_df[-5:]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,domains,weight_dist
155,15_zeppelin,zeppelin-jupyter,6,1,"[(0, [('paragraph', 0.15356873), ('author', 0....","[['paragraph', 'author', 'kernelspec', 'langua...","[(0, [('note', 0.2), ('format', 0.13333333), (...","[['note', 'format', 'jupyter', 'nbformat', 'te...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.013766653177161867, 0.011088461079329955, 0..."
156,15_zeppelin,zeppelin-jupyter-interpreter,4,8,"[(0, [('predicate', 0.26850826), ('python', 0....","[['predicate', 'python', 'request', 'kernel', ...","[(0, [('interpreter', 0.053031124), ('context'...","[['interpreter', 'context', 'resource', 'pool'...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p...","[File Transfer Protocol, HTTP client, HTTP ser...","[0.0919622192878365, 0.030497265536391625, 0.0..."
157,15_zeppelin,zeppelin-plugins,7,8,"[(0, [('pod', 0.05951644), ('interpreter', 0.0...","[['pod', 'interpreter', 'spark', 'port', 'driv...","[(0, [('phase', 0.097639434), ('server', 0.097...","[['phase', 'server', 'kubernetes', 'interrupt'...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p...","[File Transfer Protocol, HTTP server, Parser c...","[0.1478003536751886, 0.02761933053530807, 0.00..."
158,15_zeppelin,zeppelin-server,8,9,"[(0, [('session', 0.10803968), ('response', 0....","[['session', 'response', 'status', 'interprete...","[(0, [('note', 0.08035307), ('cluster', 0.0790...","[['note', 'cluster', 'service', 'event', 'note...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p...","[Containerization, DevOps, File Transfer Proto...","[0.005797929594438749, 0.0014377855269520061, ..."
159,15_zeppelin,zeppelin-zengine,9,9,"[(0, [('path', 0.093719326), ('info', 0.083396...","[['path', 'info', 'notebook', 'save', 'setting...","[(0, [('interpreter', 0.09700542), ('storage',...","[['interpreter', 'storage', 'valid', 'repo', '...",9,"[(0, [('build', 0.020426339), ('install', 0.02...","[['build', 'install', 'source', 'notebook', 'p...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,..."


In [18]:
module_df.to_csv("module_df.csv", index=False, encoding="utf-8")

In [19]:
### separate the doc concepts

module_docs_df = module_df[['project_name', 'doc_topics']].drop_duplicates(ignore_index=True)
module_docs_df[:5]

Unnamed: 0,project_name,doc_topics
0,01_dubbo,"[['service', 'see', 'issue', 'sample', 'projec..."
1,02_skywalking,"[['trace', 'support', 'metric', 'mail', 'nativ..."
2,03_flink,"[['scala', 'processing', 'java', 'intellij', '..."
3,04_rocketmq,"[['software', 'use', 'message', 'run', 'cluste..."
4,05_shardingsphere,"[['database', 'link', 'amp', 'provide', 'suppo..."


## LLM Matching of AD to Concepts

### Using embeddings then calculate semantic similarity to match with domains

- StackOverflow w2v
- text-embedding-ada-002 (openAI + scikit-LLM)

In [20]:
code_concept_list = module_df['code_topics']
test_concept_list = module_df['test_topics']
doc_concept_list = module_df['doc_topics']
weight_dist_list = module_df['weight_dist']
domains_list = module_df['domains']

In [21]:
proj_names = module_df[['project_name', 'module']]
proj_names

Unnamed: 0,project_name,module
0,01_dubbo,dubbo-cluster
1,01_dubbo,dubbo-common
2,01_dubbo,dubbo-compatible
3,01_dubbo,dubbo-config
4,01_dubbo,dubbo-configcenter
...,...,...
155,15_zeppelin,zeppelin-jupyter
156,15_zeppelin,zeppelin-jupyter-interpreter
157,15_zeppelin,zeppelin-plugins
158,15_zeppelin,zeppelin-server


In [22]:
len(domains_list)

160

## StackOverflow W2V

In [23]:
from gensim.models.keyedvectors import KeyedVectors

so_w2v_model = KeyedVectors.load_word2vec_format('C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin', binary=True)

In [24]:
# test
so_w2v_model.n_similarity(['test', 'case'], ['quality'])

0.1252212

In [25]:
def comp_con_domains_sow2v(so_w2v_model, con, domains):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    for dom in domains:
        sim_score = so_w2v_model.n_similarity(con, [dom])
        sim_res.append(sim_score)

    return sim_res

### Match concepts to domains

In [30]:
import ast

sow2v_code_domain_res = []

for i, con in enumerate(code_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_code_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [31]:
sow2v_test_domain_res = []

for i, con in enumerate(test_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_test_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [32]:
sow2v_doc_domain_res = []

for i, con in enumerate(doc_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_sow2v(so_w2v_model, con_str, curr_domains)

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        sow2v_doc_domain_res.append([proj_name, module_name, con_str, curr_domains, curr_prob, sim_res_list])

In [33]:
sow2v_code_domain_res_df = pd.DataFrame(sow2v_code_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_code_sim_score'])
sow2v_test_domain_res_df = pd.DataFrame(sow2v_test_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_test_sim_score'])
sow2v_doc_domain_res_df = pd.DataFrame(sow2v_doc_domain_res, columns=['project_name', 'module', 'concept', 'domains', 'weight_dist', 'sow2v_doc_sim_score'])

### scikit-LLM (OpenAI text-embedding-ada-002)

In [34]:
from skllm.models.gpt.vectorization import GPTVectorizer
from skllm.config import SKLLMConfig
from dotenv import load_dotenv, find_dotenv
import os

In [35]:
_ = load_dotenv(find_dotenv()) # read local .env file

In [36]:
SKLLMConfig.set_openai_key(os.getenv("API_KEY"))
SKLLMConfig.set_openai_org(os.getenv("ORG"))

In [37]:
model = GPTVectorizer()

vectors = model.fit_transform(["how old are you?", "what is your age?"])

vector_1 = np.array(vectors[0]).reshape(1, -1)
vector_2 = np.array(vectors[1]).reshape(1, -1)

Batch size: 1


100%|██████████| 2/2 [00:13<00:00,  6.98s/it]


In [38]:
vector_1

array([[ 0.01430825, -0.01481162,  0.02763494, ...,  0.01296174,
         0.00077786, -0.01851138]])

In [39]:
vector_2

array([[ 0.01097485, -0.01662272,  0.00532376, ...,  0.00476218,
        -0.01456895, -0.03142272]])

In [40]:
cosine_similarity(vector_1, vector_2)

array([[0.94738381]])

In [29]:
# convert domains to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save

# # dom_vec_list = []
# model = GPTVectorizer()
# ctr = 42
# domains_list_sub = domains_list[ctr:50]

# for dom in domains_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_dom_vec = []
#     for d in dom:
#         curr_dom_vec.append(model.fit_transform([d]))

#     curr_dom_vec_arr = asarray(curr_dom_vec)
#     save(f"gpt_dom_vec\\{ctr}.npy", curr_dom_vec_arr)
#     ctr+=1

In [30]:
#convert concepts to vectors first.
#THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save
# import ast

# model = GPTVectorizer()

# ctr = 0
# code_concept_list_sub = code_concept_list[ctr:100]

# for con_list in code_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_code_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_code_vec.append(model.fit_transform([con_str]))

#     curr_code_vec_arr = asarray(curr_code_vec)
#     save(f"gpt_code_vec\\{ctr}.npy", curr_code_vec_arr)
#     ctr+=1

In [31]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save

# model = GPTVectorizer()

# ctr = 0
# test_concept_list_sub = test_concept_list[ctr:50]

# for con_list in test_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_test_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_test_vec.append(model.fit_transform([con_str]))

#     curr_test_vec_arr = asarray(curr_test_vec)
#     save(f"gpt_test_vec\\{ctr}.npy", curr_test_vec_arr)
#     ctr+=1

In [32]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN GPT VECTORISATION

# from numpy import asarray
# from numpy import save

# model = GPTVectorizer()

# ctr = 0

# for con_list in doc_concept_list:
#     curr_doc_vec = []
#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_doc_vec.append(model.fit_transform([con_str]))

#     curr_doc_vec_arr = asarray(curr_doc_vec)
#     save(f"gpt_doc_vec\\{ctr}.npy", curr_doc_vec_arr)
#     ctr+=1

#### load the vectors

In [41]:
from numpy import load

dom_vec_list = []
code_vec_list = []
test_vec_list = []

for i in range(0, len(domains_list)): # 160 modules

    curr_dom_vec = load(f"gpt_dom_vec\\{i}.npy")
    dom_vec_list.append(curr_dom_vec)

for i in range(0, len(code_concept_list)): # 160 modules

    curr_code_vec = load(f"gpt_code_vec\\{i}.npy")
    code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 160 modules

    curr_test_vec = load(f"gpt_test_vec\\{i}.npy")
    test_vec_list.append(curr_test_vec)

In [42]:
# for doc we have to map out the proj/module mapping

proj_id = proj_names['project_name'].str[:2].astype(int).tolist()
len(proj_id)

160

In [43]:
doc_vec_list = []

for pi in proj_id:

    file_id = pi-1

    curr_doc_vec = load(f"gpt_doc_vec\\{str(file_id)}.npy")
    doc_vec_list.append(curr_doc_vec)

In [44]:
len(doc_vec_list)

160

In [45]:
def comp_con_domains_gpt(con_vec, dom_vec):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    con_vec = np.array(con_vec).reshape(1, -1)

    for dv in dom_vec:
        dv = np.array(dv).reshape(1, -1)
        sim_score = cosine_similarity(con_vec, dv)
        sim_res.append(sim_score[0][0])

    return sim_res

In [46]:
gpt_code_domain_res = []

for i, code_vec in enumerate(code_vec_list):

    curr_con_list = ast.literal_eval(code_concept_list[i])

    for k, cv in enumerate(code_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_gpt(cv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_code_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [47]:
gpt_test_domain_res = []

for i, test_vec in enumerate(test_vec_list):

    curr_con_list = ast.literal_eval(test_concept_list[i])

    for k, tv in enumerate(test_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_gpt(tv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_test_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [48]:
gpt_doc_domain_res = []

for i, doc_vec in enumerate(doc_vec_list):

    curr_con_list = ast.literal_eval(module_df['doc_topics'][i])

    for k, dv in enumerate(doc_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_gpt(dv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        gpt_doc_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [49]:
dv_list = []

for dv in dom_vec_list:
    dv_list.append(len(dv))

In [50]:
code_dom_gpt_sim_df = pd.DataFrame(gpt_code_domain_res, columns=['project_name', 'module', 'code concepts', 'domains', 'weight_dist', 'gpt_code_sim_score'])
test_dom_gpt_sim_df = pd.DataFrame(gpt_test_domain_res, columns=['project_name', 'module', 'test concepts', 'domains', 'weight_dist', 'gpt_test_sim_score'])
doc_dom_gpt_sim_df = pd.DataFrame(gpt_doc_domain_res, columns=['project_name', 'module', 'doc concepts', 'domains', 'weight_dist', 'gpt_doc_sim_score'])

### UAE-Large-V1

In [51]:
import torch
torch.cuda.is_available()

True

In [52]:
import torch
from transformers import AutoModel, AutoTokenizer

def angle_vec(tokenizer, model, input):

    tok = tokenizer([input], return_tensors='pt')

    for k, v in tok.items():
        tok[k] = v.cuda()

    hs = model(**tok).last_hidden_state

    vec = (hs[:, 0] + torch.mean(hs, dim=1)) / 2.0

    return vec.detach().cpu().numpy()

In [53]:
tokenizer = AutoTokenizer.from_pretrained('./UAE-Large-V1')
angle_model = AutoModel.from_pretrained('./UAE-Large-V1').cuda()  
v1 = angle_vec(tokenizer, angle_model, "king")
v2 = angle_vec(tokenizer, angle_model, "queen")

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(v1, v2)

array([[0.7001157]], dtype=float32)

In [55]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save
# import ast

# tokenizer = AutoTokenizer.from_pretrained('./UAE-Large-V1')
# angle_model = AutoModel.from_pretrained('./UAE-Large-V1').cuda()  

# ctr = 0
# code_concept_list_sub = code_concept_list[ctr:]

# for con_list in code_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_code_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_code_vec.append(angle_vec(tokenizer, angle_model, con_str))

#     curr_code_vec_arr = asarray(curr_code_vec)
#     save(f"angle_code_vec\\{ctr}.npy", curr_code_vec_arr)
#     ctr+=1

In [56]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save
# import ast

# # tokenizer = AutoTokenizer.from_pretrained('./UAE-Large-V1')
# # angle_model = AutoModel.from_pretrained('./UAE-Large-V1').cuda()  

# ctr = 0
# test_concept_list_sub = test_concept_list[ctr:]

# for con_list in test_concept_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_test_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_test_vec.append(angle_vec(tokenizer, angle_model, con_str))

#     curr_test_vec_arr = asarray(curr_test_vec)
#     save(f"angle_test_vec\\{ctr}.npy", curr_test_vec_arr)
#     ctr+=1

In [57]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save
# import ast

# ctr = 0

# for con_list in doc_concept_list:
#     curr_doc_vec = []

#     con_list = ast.literal_eval(con_list)

#     for con in con_list:
#         con_str = " ".join(con)
#         curr_doc_vec.append(angle_vec(tokenizer, angle_model, con_str))

#     curr_doc_vec_arr = asarray(curr_doc_vec)
#     save(f"angle_doc_vec\\{ctr}.npy", curr_doc_vec_arr)
#     ctr+=1

In [58]:
# convert concepts to vectors first.
# THIS HAS BEEN RUN, PLEASE UNCOMMENT TO RUN ANGLE VECTORISATION
# from numpy import asarray
# from numpy import save

# ctr = 216
# domains_list_sub = domains_list[ctr:]

# for dom in domains_list_sub: # 313 domains lists (no. of modules) to vectorise
#     curr_dom_vec = []
#     for d in dom:
#         curr_dom_vec.append(angle_vec(tokenizer, angle_model, d))

#     curr_dom_vec_arr = asarray(curr_dom_vec)
#     save(f"angle_dom_vec\\{ctr}.npy", curr_dom_vec_arr)
#     ctr+=1

#### load the vectors

In [59]:
from numpy import load

dom_vec_list = []
code_vec_list = []
test_vec_list = []
doc_vec_list = []

for i in range(0, len(domains_list)): # 313 modules

    curr_dom_vec = load(f"angle_dom_vec\\{i}.npy")
    dom_vec_list.append(curr_dom_vec)

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"angle_code_vec\\{i}.npy")
    code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"angle_test_vec\\{i}.npy")
    test_vec_list.append(curr_test_vec)

for i in range(0, len(doc_concept_list)): # 313 modules

    curr_doc_vec = load(f"angle_doc_vec\\{i}.npy")
    doc_vec_list.append(curr_doc_vec)

In [60]:
def comp_con_domains_angle(con_vec, dom_vec):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    for dv in dom_vec:
        sim_score = cosine_similarity(con_vec, dv)
        sim_res.append(sim_score[0][0])

    return sim_res

In [61]:
angle_code_domain_res = []

for i, code_vec in enumerate(code_vec_list):

    curr_con_list = ast.literal_eval(code_concept_list[i])

    for k, cv in enumerate(code_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_angle(cv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        angle_code_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [62]:
angle_test_domain_res = []

for i, test_vec in enumerate(test_vec_list):

    curr_con_list = ast.literal_eval(test_concept_list[i])

    for k, tv in enumerate(test_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_angle(tv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        angle_test_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [63]:
angle_doc_domain_res = []

for i, doc_vec in enumerate(doc_vec_list):

    curr_con_list = ast.literal_eval(doc_concept_list[i])

    for k, dv in enumerate(doc_vec):

        curr_concepts = curr_con_list[k]
        curr_domains = domains_list[i]
        curr_prob = weight_dist_list[i]

        sim_res_list = comp_con_domains_angle(dv, dom_vec_list[i])

        proj_name = proj_names.values[i][0]
        module_name = proj_names.values[i][1]

        angle_doc_domain_res.append([proj_name, module_name, curr_concepts, curr_domains, curr_prob, sim_res_list])

In [64]:
code_dom_angle_sim_df = pd.DataFrame(angle_code_domain_res, columns=['project_name', 'module', 'code concepts', 'domains', 'weight_dist', 'angle_code_sim_score'])
test_dom_angle_sim_df = pd.DataFrame(angle_test_domain_res, columns=['project_name', 'module', 'test concepts', 'domains', 'weight_dist', 'angle_test_sim_score'])
doc_dom_angle_sim_df = pd.DataFrame(angle_doc_domain_res, columns=['project_name', 'module', 'doc concepts', 'domains', 'weight_dist', 'angle_doc_sim_score'])

#### Merge all sim scores

In [65]:
code_domain_sim_df = sow2v_code_domain_res_df.copy()
code_domain_sim_df['gpt_code_sim_score'] = code_dom_gpt_sim_df['gpt_code_sim_score']
code_domain_sim_df['angle_code_sim_score'] = code_dom_angle_sim_df['angle_code_sim_score']
code_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_code_sim_score,gpt_code_sim_score,angle_code_sim_score
0,01_dubbo,dubbo-cluster,"[merger, model, scope, aware, end, error, merg...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7311602189366283, 0.7417190548602931, 0.703...","[0.5299579, 0.56805325, 0.4777574, 0.44822916,..."
1,01_dubbo,dubbo-cluster,"[match, value, bool, exact, weight, result, de...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7016929118277905, 0.6982882861146025, 0.706...","[0.54784065, 0.46327564, 0.5400166, 0.5001681,..."
2,01_dubbo,dubbo-cluster,"[rule, mesh, listener, app, merger, map, merge...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6558771698809155, 0.6792273271454538, 0.685...","[0.46307766, 0.5018077, 0.4271498, 0.41547033,..."
3,01_dubbo,dubbo-cluster,"[rule, key, invoker, node, router, destination...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6962194262372328, 0.7197655294394439, 0.713...","[0.44639945, 0.49696502, 0.51973915, 0.5151202..."
4,01_dubbo,dubbo-cluster,"[url, invoker, invocation, configurator, load,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7120789628683717, 0.7148573131041973, 0.702...","[0.4117705, 0.52084935, 0.48110706, 0.579143, ..."
...,...,...,...,...,...,...,...,...
1256,15_zeppelin,zeppelin-zengine,"[helium, process, remote, plugin, bundle, dir,...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7111559290377295, 0.6898266990435676, 0.688...","[0.42404875, 0.40588626, 0.3966069, 0.60110354..."
1257,15_zeppelin,zeppelin-zengine,"[interpreter, set, group, property, context, r...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6814844359999269, 0.6976504115294633, 0.692...","[0.40882564, 0.46137673, 0.46576136, 0.5701189..."
1258,15_zeppelin,zeppelin-zengine,"[job, scheduler, ticket, status, remote, buffe...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.696078478966435, 0.7029464672120006, 0.7095...","[0.32848954, 0.43342304, 0.4947282, 0.45656368..."
1259,15_zeppelin,zeppelin-zengine,"[note, paragraph, info, event, authentication,...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6956173248572144, 0.6982366190310776, 0.698...","[0.43078762, 0.44798276, 0.41389444, 0.4858161..."


In [66]:
code_domain_sim_df.to_csv("sim_code_domain_df.csv", index=False)

In [67]:
test_domain_sim_df = sow2v_test_domain_res_df.copy()
test_domain_sim_df['gpt_test_sim_score'] = test_dom_gpt_sim_df['gpt_test_sim_score']
test_domain_sim_df['angle_test_sim_score'] = test_dom_angle_sim_df['angle_test_sim_score']
test_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_test_sim_score,gpt_test_sim_score,angle_test_sim_score
0,01_dubbo,dubbo-cluster,"[invoker, hello, menu, service, load, balance,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6969462948905291, 0.699122397759385, 0.6885...","[0.46962628, 0.51499, 0.4604619, 0.5574089, 0...."
1,01_dubbo,dubbo-cluster,"[invoker, invocation, url, cluster, mock, resu...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7105901433756564, 0.7263724106567162, 0.707...","[0.42823136, 0.49603176, 0.4982496, 0.57298434..."
2,01_dubbo,dubbo-cluster,"[attachment, value, key, invoker, sticky, invo...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6911600078814237, 0.7035368565205871, 0.702...","[0.40690482, 0.49188542, 0.5312029, 0.56280786..."
3,01_dubbo,dubbo-cluster,"[url, model, merge, merger, module, router, pr...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6902347290707492, 0.7037540389490645, 0.712...","[0.52358633, 0.5582562, 0.5021834, 0.5214653, ..."
4,01_dubbo,dubbo-cluster,"[match, configurator, mock, bool, join, absent...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6863980044930774, 0.6988732122595529, 0.689...","[0.4701079, 0.47720772, 0.42111737, 0.5253796,..."
...,...,...,...,...,...,...,...,...
1194,15_zeppelin,zeppelin-zengine,"[note, path, info, authentication, subject, st...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7360471411812224, 0.7425425978374396, 0.736...","[0.43738577, 0.45902747, 0.46823826, 0.4533974..."
1195,15_zeppelin,zeppelin-zengine,"[note, interrupt, paragraph, text, search, man...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7281207279861407, 0.7382133358280061, 0.738...","[0.3866217, 0.45353967, 0.4427986, 0.41480595,..."
1196,15_zeppelin,zeppelin-zengine,"[interpreter, context, property, form, angular...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6709798560249572, 0.6777394418624301, 0.658...","[0.42819172, 0.4348434, 0.46844795, 0.48737687..."
1197,15_zeppelin,zeppelin-zengine,"[text, paragraph, property, quote, local, unfi...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6847751387253405, 0.6855594594874639, 0.671...","[0.3988889, 0.45447856, 0.45586926, 0.4631729,..."


In [68]:
test_domain_sim_df.to_csv("sim_test_domain_df.csv", index=False)

In [69]:
doc_domain_sim_df = sow2v_doc_domain_res_df.copy()
doc_domain_sim_df['gpt_doc_sim_score'] = doc_dom_gpt_sim_df['gpt_doc_sim_score']
doc_domain_sim_df['angle_doc_sim_score'] = doc_dom_angle_sim_df['angle_doc_sim_score']
doc_domain_sim_df

Unnamed: 0,project_name,module,concept,domains,weight_dist,sow2v_doc_sim_score,gpt_doc_sim_score,angle_doc_sim_score
0,01_dubbo,dubbo-cluster,"[service, see, issue, sample, project, github,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7151548075641246, 0.7452139517589933, 0.726...","[0.4223302, 0.54775983, 0.4744687, 0.528439, 0..."
1,01_dubbo,dubbo-cluster,"[contribute, issue, sample, project, service, ...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7262556385432636, 0.7367756398724646, 0.720...","[0.38913843, 0.53583735, 0.45912963, 0.4984625..."
2,01_dubbo,dubbo-cluster,"[issue, service, project, list, see, github, r...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7266753311234634, 0.7361773056848603, 0.727...","[0.4397305, 0.5552181, 0.4954759, 0.53429127, ..."
3,01_dubbo,dubbo-cluster,"[service, see, project, build, github, sample,...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7181566198682772, 0.7366335309908381, 0.725...","[0.39362937, 0.5181406, 0.4197607, 0.5035975, ..."
4,01_dubbo,dubbo-cluster,"[see, issue, sample, service, contribute, guid...","[Bayesian inference, Containerization, File Tr...","[0.0006186695022802121, 0.001075709038256659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7274142866301703, 0.7355761559871075, 0.720...","[0.45590687, 0.56785774, 0.5061829, 0.53482467..."
...,...,...,...,...,...,...,...,...
1179,15_zeppelin,zeppelin-zengine,"[install, mailing, build, please, notebook, so...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7316896906545427, 0.7193866844897687, 0.702...","[0.48917058, 0.4360385, 0.39520195, 0.50214887..."
1180,15_zeppelin,zeppelin-zengine,"[build, install, binary, source, package, inte...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7170258619727808, 0.7116449009313943, 0.705...","[0.4644228, 0.43371212, 0.35374638, 0.5482631,..."
1181,15_zeppelin,zeppelin-zengine,"[mailing, list, package, notebook, make, scala...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.709517643991769, 0.7211271133586368, 0.7093...","[0.42710003, 0.43926078, 0.4172133, 0.5322217,..."
1182,15_zeppelin,zeppelin-zengine,"[mailing, list, package, notebook, make, scala...","[3D modeling, Bayesian inference, Bidirectiona...","[0.0006337118082907259, 0.0002334781055880513,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.709517643991769, 0.7211271133586368, 0.7093...","[0.42710003, 0.43926078, 0.4172133, 0.5322217,..."


In [70]:
doc_domain_sim_df.to_csv("sim_doc_domain_df.csv", index=False)

## Represent concepts in a vector space (RQ1)

In [71]:
import pandas as pd

code_domain_sim_df = pd.read_csv("sim_code_domain_df.csv")
test_domain_sim_df = pd.read_csv("sim_test_domain_df.csv")
doc_domain_sim_df = pd.read_csv("sim_doc_domain_df.csv")

In [72]:
topics_df = pd.read_csv("topics_res_df.csv")
doc_num_list = topics_df[["project_name", "doc_num_topics"]].drop_duplicates(ignore_index=True)['doc_num_topics'].tolist()

In [74]:
new_doc_df_list = []

for prefix in range(1,16):
    if prefix<10:
        prefix = '0' + str(prefix)
    new_doc_df_list.append(doc_domain_sim_df[doc_domain_sim_df['project_name'].str.startswith(str(prefix))][['project_name', 'concept']][:doc_num_list[int(prefix)-1]])

In [75]:
new_doc_df = pd.concat(new_doc_df_list, ignore_index=True)
new_doc_df

Unnamed: 0,project_name,concept
0,01_dubbo,"['service', 'see', 'issue', 'sample', 'project..."
1,01_dubbo,"['contribute', 'issue', 'sample', 'project', '..."
2,01_dubbo,"['issue', 'service', 'project', 'list', 'see',..."
3,01_dubbo,"['service', 'see', 'project', 'build', 'github..."
4,01_dubbo,"['see', 'issue', 'sample', 'service', 'contrib..."
...,...,...
108,15_zeppelin,"['install', 'mailing', 'build', 'please', 'not..."
109,15_zeppelin,"['build', 'install', 'binary', 'source', 'pack..."
110,15_zeppelin,"['mailing', 'list', 'package', 'notebook', 'ma..."
111,15_zeppelin,"['mailing', 'list', 'package', 'notebook', 'ma..."


In [76]:
module_df = pd.read_csv("module_df.csv")
code_concept_list = module_df['code_topics']
test_concept_list = module_df['test_topics']
doc_concept_list = module_df['doc_topics']
weight_dist_list = module_df['weight_dist']
domains_list = module_df['domains']

In [77]:
gpt_doc_concept_list = doc_concept_list.drop_duplicates(ignore_index=True)

In [78]:
from numpy import load

gpt_code_vec_list = []
gpt_test_vec_list = []
gpt_doc_vec_list = []
angle_code_vec_list = []
angle_test_vec_list = []
angle_doc_vec_list = []

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"gpt_code_vec\\{i}.npy")
    gpt_code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"gpt_test_vec\\{i}.npy")
    gpt_test_vec_list.append(curr_test_vec)

for i in range(0, len(gpt_doc_concept_list)):

    curr_doc_vec = load(f"gpt_doc_vec\\{i}.npy")
    gpt_doc_vec_list.append(curr_doc_vec)

for i in range(0, len(code_concept_list)): # 313 modules

    curr_code_vec = load(f"angle_code_vec\\{i}.npy")
    angle_code_vec_list.append(curr_code_vec)

for i in range(0, len(test_concept_list)): # 313 modules

    curr_test_vec = load(f"angle_test_vec\\{i}.npy")
    angle_test_vec_list.append(curr_test_vec)

for i in range(0, len(doc_concept_list)):

    curr_doc_vec = load(f"angle_doc_vec\\{i}.npy")
    angle_doc_vec_list.append(curr_doc_vec)

In [79]:
from gensim.models.keyedvectors import KeyedVectors

so_w2v_model = KeyedVectors.load_word2vec_format('C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin', binary=True)

def run_vec_sow2v(str_list):
    return so_w2v_model.get_mean_vector(str_list, ignore_missing = True)

In [80]:
gpt_code_flattened_vec_list = [x for sub in gpt_code_vec_list for x in sub]
angle_code_flattened_vec_list = [x for sub in angle_code_vec_list for x in sub]

gpt_code_flattened_vec_list = [x[0] for x in gpt_code_flattened_vec_list]
angle_code_flattened_vec_list = [x[0] for x in angle_code_flattened_vec_list]


code_domain_sim_df["sow2v_code_vec"] = code_domain_sim_df["concept"].apply(lambda x: list(run_vec_sow2v(x)))
code_domain_sim_df["gpt_code_vec"] = gpt_code_flattened_vec_list
code_domain_sim_df["angle_code_vec"] = angle_code_flattened_vec_list

code_vec_df = code_domain_sim_df[["project_name", "module", "concept", "sow2v_code_vec", "gpt_code_vec", "angle_code_vec"]]
code_vec_df

Unnamed: 0,project_name,module,concept,sow2v_code_vec,gpt_code_vec,angle_code_vec
0,01_dubbo,dubbo-cluster,"['merger', 'model', 'scope', 'aware', 'end', '...","[0.05225145, -0.039243843, -0.1073313, -0.0605...","[-0.026622535660862923, 0.0065000769682228565,...","[-0.18775713, -0.37221402, -0.102094874, 0.436..."
1,01_dubbo,dubbo-cluster,"['match', 'value', 'bool', 'exact', 'weight', ...","[0.07469578, -0.03465767, -0.085746214, -0.050...","[-0.019484085962176323, -0.0039446307346224785...","[0.03253293, 0.5728816, 0.579907, 0.67069566, ..."
2,01_dubbo,dubbo-cluster,"['rule', 'mesh', 'listener', 'app', 'merger', ...","[0.059995174, -0.054836363, -0.0915457, -0.070...","[-0.02054414711892605, 0.010302042588591576, 0...","[-0.019751057, -0.33930963, -0.1023085, 0.2829..."
3,01_dubbo,dubbo-cluster,"['rule', 'key', 'invoker', 'node', 'router', '...","[0.072560035, -0.042324353, -0.080996744, -0.0...","[-0.014745050109922886, -0.01198360975831747, ...","[-0.553166, -0.27252257, -0.06934874, 0.168628..."
4,01_dubbo,dubbo-cluster,"['url', 'invoker', 'invocation', 'configurator...","[0.06503355, -0.03390938, -0.099120185, -0.053...","[0.004753707442432642, -0.000175158231286332, ...","[-0.49914038, -0.24755313, -0.028283862, 0.168..."
...,...,...,...,...,...,...
1256,15_zeppelin,zeppelin-zengine,"['helium', 'process', 'remote', 'plugin', 'bun...","[0.064236544, -0.04604693, -0.08446985, -0.062...","[0.012220008298754692, -0.003928628750145435, ...","[-0.10722275, 0.36593512, -0.27212584, 0.01309..."
1257,15_zeppelin,zeppelin-zengine,"['interpreter', 'set', 'group', 'property', 'c...","[0.052275404, -0.04475691, -0.09443279, -0.059...","[-0.03034324012696743, 0.0005833391915075481, ...","[-0.6689614, -0.38853735, -0.73910934, 0.48395..."
1258,15_zeppelin,zeppelin-zengine,"['job', 'scheduler', 'ticket', 'status', 'remo...","[0.08014172, -0.032463744, -0.082225345, -0.06...","[-0.027434268966317177, -0.02186468429863453, ...","[-0.14411262, 0.03918158, -0.57518387, 0.54724..."
1259,15_zeppelin,zeppelin-zengine,"['note', 'paragraph', 'info', 'event', 'authen...","[0.061807495, -0.030924492, -0.07913145, -0.04...","[-0.03780198469758034, 0.01199312973767519, -0...","[-0.17626308, -0.15659976, 0.2237904, 0.339523..."


In [81]:
gpt_test_flattened_vec_list = [x for sub in gpt_test_vec_list for x in sub]
angle_test_flattened_vec_list = [x for sub in angle_test_vec_list for x in sub]

gpt_test_flattened_vec_list = [x[0] for x in gpt_test_flattened_vec_list]
angle_test_flattened_vec_list = [x[0] for x in angle_test_flattened_vec_list]

test_domain_sim_df["sow2v_test_vec"] = test_domain_sim_df["concept"].apply(lambda x: list(run_vec_sow2v(x)))
test_domain_sim_df["gpt_test_vec"] = gpt_test_flattened_vec_list
test_domain_sim_df["angle_test_vec"] = angle_test_flattened_vec_list

test_vec_df = test_domain_sim_df[["project_name", "module", "concept", "sow2v_test_vec", "gpt_test_vec", "angle_test_vec"]]
test_vec_df

Unnamed: 0,project_name,module,concept,sow2v_test_vec,gpt_test_vec,angle_test_vec
0,01_dubbo,dubbo-cluster,"['invoker', 'hello', 'menu', 'service', 'load'...","[0.06538757, -0.038762722, -0.09320979, -0.057...","[-0.015958335250616074, -0.015475192107260227,...","[0.014179217, -0.26719624, -0.15324931, -0.072..."
1,01_dubbo,dubbo-cluster,"['invoker', 'invocation', 'url', 'cluster', 'm...","[0.06413624, -0.03734391, -0.09991639, -0.0585...","[-0.0025778282433748245, -0.011829978786408901...","[-0.25283864, -0.16967408, -0.18421198, -0.022..."
2,01_dubbo,dubbo-cluster,"['attachment', 'value', 'key', 'invoker', 'sti...","[0.0645962, -0.04175846, -0.09586649, -0.05556...","[-0.03428419679403305, -0.0033837941009551287,...","[-0.4825757, -0.3559093, -0.28272846, 0.468342..."
3,01_dubbo,dubbo-cluster,"['url', 'model', 'merge', 'merger', 'module', ...","[0.06821527, -0.04794634, -0.09212922, -0.0694...","[-0.0011146004544571042, 0.014147793874144554,...","[0.40440214, -0.44396412, 0.09869711, -0.20433..."
4,01_dubbo,dubbo-cluster,"['match', 'configurator', 'mock', 'bool', 'joi...","[0.062228434, -0.033033144, -0.089854315, -0.0...","[-0.022632991895079613, -0.01612619124352932, ...","[-0.31522655, -0.66237473, 0.0850316, 0.209996..."
...,...,...,...,...,...,...
1194,15_zeppelin,zeppelin-zengine,"['note', 'path', 'info', 'authentication', 'su...","[0.0579622, -0.027154582, -0.079765946, -0.053...","[-0.02901407517492771, 0.016029158607125282, 0...","[-0.15111251, -0.34685683, -0.1725686, 0.02235..."
1195,15_zeppelin,zeppelin-zengine,"['note', 'interrupt', 'paragraph', 'text', 'se...","[0.05478231, -0.04358726, -0.08302527, -0.0588...","[-0.02484974078834057, 0.009228460490703583, 0...","[-0.22413367, -0.4963468, 0.005586869, 0.49931..."
1196,15_zeppelin,zeppelin-zengine,"['interpreter', 'context', 'property', 'form',...","[0.05898075, -0.04256597, -0.09212959, -0.0561...","[-0.006929933559149504, -0.000873541459441185,...","[-0.38648567, -0.6907771, -0.4526555, -0.27825..."
1197,15_zeppelin,zeppelin-zengine,"['text', 'paragraph', 'property', 'quote', 'lo...","[0.079118036, -0.04170034, -0.06910472, -0.046...","[-0.010172299109399319, 0.017780661582946777, ...","[-0.4615311, -0.4929341, -0.05493663, 0.502399..."


In [83]:
gpt_doc_flattened_vec_list = [x for sub in gpt_doc_vec_list for x in sub] # 15 vecs
angle_doc_flattened_vec_list = [x for sub in angle_doc_vec_list for x in sub] # 160 vecs

gpt_doc_flattened_vec_list = [x[0] for x in gpt_doc_flattened_vec_list]
angle_doc_flattened_vec_list = [x[0] for x in angle_doc_flattened_vec_list]

new_doc_df["gpt_doc_vec"] = gpt_doc_flattened_vec_list
doc_domain_sim_df["angle_doc_vec"] = angle_doc_flattened_vec_list
doc_domain_sim_df = doc_domain_sim_df[["project_name", "concept", "angle_doc_vec"]]

new_doc_df_list = []

for prefix in range(1,16):
    if prefix<10:
        prefix = '0' + str(prefix)
    new_doc_df_list.append(doc_domain_sim_df[doc_domain_sim_df['project_name'].str.startswith(str(prefix))][['project_name', 'concept', 'angle_doc_vec']][:doc_num_list[int(prefix)-1]])

new_angle_doc_df = pd.concat(new_doc_df_list, ignore_index=True)
doc_vec_df = new_doc_df.copy()
doc_vec_df["angle_doc_vec"] = new_angle_doc_df["angle_doc_vec"]
doc_vec_df["sow2v_doc_vec"] = doc_vec_df["concept"].apply(lambda x: list(run_vec_sow2v(x)))
doc_vec_df

Unnamed: 0,project_name,concept,gpt_doc_vec,angle_doc_vec,sow2v_doc_vec
0,01_dubbo,"['service', 'see', 'issue', 'sample', 'project...","[0.005654162261635065, -0.0016180375823751092,...","[-0.2622697, -0.02699811, -0.11021085, -0.2081...","[0.08472025, -0.02117448, -0.0758657, -0.05463..."
1,01_dubbo,"['contribute', 'issue', 'sample', 'project', '...","[-0.0033876176457852125, -0.01359404157847166,...","[-0.42579025, -0.19378607, -0.31018537, -0.093...","[0.0759664, -0.026296437, -0.08504799, -0.0534..."
2,01_dubbo,"['issue', 'service', 'project', 'list', 'see',...","[-0.005690885242074728, -0.01893075555562973, ...","[-0.3254578, -0.15225579, -0.40020263, 0.05321...","[0.0752811, -0.029863613, -0.08484851, -0.0549..."
3,01_dubbo,"['service', 'see', 'project', 'build', 'github...","[0.0017545023001730442, -0.0064876810647547245...","[-0.09142876, -0.45110118, -0.17707597, -0.150...","[0.087012574, -0.029808724, -0.07877463, -0.05..."
4,01_dubbo,"['see', 'issue', 'sample', 'service', 'contrib...","[0.003152343910187483, 0.0006888388306833804, ...","[-0.38099384, 0.13213581, -0.082547046, -0.096...","[0.07527791, -0.02495413, -0.080793396, -0.055..."
...,...,...,...,...,...
108,15_zeppelin,"['install', 'mailing', 'build', 'please', 'not...","[0.0006966929649934173, 0.007134430110454559, ...","[0.1349908, -0.41935748, 0.03542897, 0.1538129...","[0.08277418, -0.037765346, -0.08174845, -0.064..."
109,15_zeppelin,"['build', 'install', 'binary', 'source', 'pack...","[-0.02117352932691574, -0.002007524250075221, ...","[0.18003023, -0.30404034, 0.11309576, 0.079601...","[0.08204969, -0.03756449, -0.08740187, -0.0539..."
110,15_zeppelin,"['mailing', 'list', 'package', 'notebook', 'ma...","[0.008029499091207981, -0.009200154803693295, ...","[-0.2138662, -0.5567057, 0.09736484, 0.0146132...","[0.06572739, -0.03389141, -0.10656112, -0.0558..."
111,15_zeppelin,"['mailing', 'list', 'package', 'notebook', 'ma...","[0.008029499091207981, -0.009200154803693295, ...","[-0.2138662, -0.5567057, 0.09736484, 0.0146132...","[0.06572739, -0.03389141, -0.10656112, -0.0558..."


### Show concepts for 01_dubbo

In [84]:
dubbo_code_df = code_vec_df[code_vec_df['project_name'].str.startswith("01")]
dubbo_test_df = test_vec_df[test_vec_df['project_name'].str.startswith("01")]
dubbo_doc_df = doc_vec_df[doc_vec_df['project_name'].str.startswith("01")]
dubbo_code_df

Unnamed: 0,project_name,module,concept,sow2v_code_vec,gpt_code_vec,angle_code_vec
0,01_dubbo,dubbo-cluster,"['merger', 'model', 'scope', 'aware', 'end', '...","[0.05225145, -0.039243843, -0.1073313, -0.0605...","[-0.026622535660862923, 0.0065000769682228565,...","[-0.18775713, -0.37221402, -0.102094874, 0.436..."
1,01_dubbo,dubbo-cluster,"['match', 'value', 'bool', 'exact', 'weight', ...","[0.07469578, -0.03465767, -0.085746214, -0.050...","[-0.019484085962176323, -0.0039446307346224785...","[0.03253293, 0.5728816, 0.579907, 0.67069566, ..."
2,01_dubbo,dubbo-cluster,"['rule', 'mesh', 'listener', 'app', 'merger', ...","[0.059995174, -0.054836363, -0.0915457, -0.070...","[-0.02054414711892605, 0.010302042588591576, 0...","[-0.019751057, -0.33930963, -0.1023085, 0.2829..."
3,01_dubbo,dubbo-cluster,"['rule', 'key', 'invoker', 'node', 'router', '...","[0.072560035, -0.042324353, -0.080996744, -0.0...","[-0.014745050109922886, -0.01198360975831747, ...","[-0.553166, -0.27252257, -0.06934874, 0.168628..."
4,01_dubbo,dubbo-cluster,"['url', 'invoker', 'invocation', 'configurator...","[0.06503355, -0.03390938, -0.099120185, -0.053...","[0.004753707442432642, -0.000175158231286332, ...","[-0.49914038, -0.24755313, -0.028283862, 0.168..."
...,...,...,...,...,...,...
122,01_dubbo,dubbo-xds,"['key', 'istio', 'path', 'secret', 'jwt', 'clu...","[0.06526281, -0.039973218, -0.10442113, -0.060...","[0.017854617908596992, -0.010608749464154243, ...","[0.22748196, 0.38303995, -0.57838404, -0.26077..."
123,01_dubbo,dubbo-xds,"['consumer', 'observe', 'map', 'result', 'endp...","[0.05611692, -0.041434396, -0.08906147, -0.056...","[0.0015994592104107141, -0.011639793403446674,...","[-0.69973636, 0.08320053, -0.841619, 0.0749328..."
124,01_dubbo,dubbo-xds,"['endpoint', 'cluster', 'server', 'weight', 'i...","[0.05542404, -0.036790773, -0.090690106, -0.06...","[-0.002195612294599414, -0.02597239799797535, ...","[-0.07411003, -0.23463039, -0.64296293, 0.0885..."
125,01_dubbo,dubbo-xds,"['discovery', 'observer', 'response', 'request...","[0.059809525, -0.04411117, -0.09158495, -0.066...","[-0.01950066350400448, 0.004034237004816532, -...","[-0.22820416, 0.06541349, 0.09582858, 0.070130..."


In [85]:
dubbo_test_df

Unnamed: 0,project_name,module,concept,sow2v_test_vec,gpt_test_vec,angle_test_vec
0,01_dubbo,dubbo-cluster,"['invoker', 'hello', 'menu', 'service', 'load'...","[0.06538757, -0.038762722, -0.09320979, -0.057...","[-0.015958335250616074, -0.015475192107260227,...","[0.014179217, -0.26719624, -0.15324931, -0.072..."
1,01_dubbo,dubbo-cluster,"['invoker', 'invocation', 'url', 'cluster', 'm...","[0.06413624, -0.03734391, -0.09991639, -0.0585...","[-0.0025778282433748245, -0.011829978786408901...","[-0.25283864, -0.16967408, -0.18421198, -0.022..."
2,01_dubbo,dubbo-cluster,"['attachment', 'value', 'key', 'invoker', 'sti...","[0.0645962, -0.04175846, -0.09586649, -0.05556...","[-0.03428419679403305, -0.0033837941009551287,...","[-0.4825757, -0.3559093, -0.28272846, 0.468342..."
3,01_dubbo,dubbo-cluster,"['url', 'model', 'merge', 'merger', 'module', ...","[0.06821527, -0.04794634, -0.09212922, -0.0694...","[-0.0011146004544571042, 0.014147793874144554,...","[0.40440214, -0.44396412, 0.09869711, -0.20433..."
4,01_dubbo,dubbo-cluster,"['match', 'configurator', 'mock', 'bool', 'joi...","[0.062228434, -0.033033144, -0.089854315, -0.0...","[-0.022632991895079613, -0.01612619124352932, ...","[-0.31522655, -0.66237473, 0.0850316, 0.209996..."
...,...,...,...,...,...,...
119,01_dubbo,dubbo-xds,"['endpoint', 'change', 'point', 'end', 'cluste...","[0.063921794, -0.024483405, -0.08707288, -0.05...","[0.00015126266225706786, -0.01217339001595974,...","[-0.94928396, 0.061658207, -0.85459405, 0.2661..."
120,01_dubbo,dubbo-xds,"['matcher', 'route', 'map', 'rule', 'endpoint'...","[0.05368861, -0.038989674, -0.08338601, -0.054...","[-0.0265257116407156, -0.008821831084787846, -...","[-0.35084802, 0.3498905, -0.22398137, -0.03473..."
121,01_dubbo,dubbo-xds,"['endpoint', 'cluster', 'end', 'change', 'poin...","[0.06607034, -0.03577423, -0.087813005, -0.054...","[-0.00456932233646512, -0.024403274059295654, ...","[-1.0170584, -0.33726072, -0.47389454, 0.41462..."
122,01_dubbo,dubbo-xds,"['rule', 'route', 'app', 'xds', 'manager', 'ch...","[0.06347599, -0.045654148, -0.08858261, -0.059...","[-0.003229746827855706, 0.013515480794012547, ...","[-0.5358087, -0.054393616, -0.12732497, -0.424..."


In [86]:
dubbo_doc_df

Unnamed: 0,project_name,concept,gpt_doc_vec,angle_doc_vec,sow2v_doc_vec
0,01_dubbo,"['service', 'see', 'issue', 'sample', 'project...","[0.005654162261635065, -0.0016180375823751092,...","[-0.2622697, -0.02699811, -0.11021085, -0.2081...","[0.08472025, -0.02117448, -0.0758657, -0.05463..."
1,01_dubbo,"['contribute', 'issue', 'sample', 'project', '...","[-0.0033876176457852125, -0.01359404157847166,...","[-0.42579025, -0.19378607, -0.31018537, -0.093...","[0.0759664, -0.026296437, -0.08504799, -0.0534..."
2,01_dubbo,"['issue', 'service', 'project', 'list', 'see',...","[-0.005690885242074728, -0.01893075555562973, ...","[-0.3254578, -0.15225579, -0.40020263, 0.05321...","[0.0752811, -0.029863613, -0.08484851, -0.0549..."
3,01_dubbo,"['service', 'see', 'project', 'build', 'github...","[0.0017545023001730442, -0.0064876810647547245...","[-0.09142876, -0.45110118, -0.17707597, -0.150...","[0.087012574, -0.029808724, -0.07877463, -0.05..."
4,01_dubbo,"['see', 'issue', 'sample', 'service', 'contrib...","[0.003152343910187483, 0.0006888388306833804, ...","[-0.38099384, 0.13213581, -0.082547046, -0.096...","[0.07527791, -0.02495413, -0.080793396, -0.055..."
5,01_dubbo,"['service', 'sample', 'see', 'issue', 'please'...","[-0.003034312278032303, -0.0045631458051502705...","[-0.4854178, 0.08935423, -0.009228911, 0.15076...","[0.07528599, -0.025522616, -0.080436565, -0.05..."
6,01_dubbo,"['service', 'issue', 'contribute', 'see', 'git...","[0.01160502154380083, -9.151901031145826e-05, ...","[-0.46851927, 0.069999725, -0.4154365, 0.00631...","[0.07766223, -0.02147056, -0.07729336, -0.0560..."
7,01_dubbo,"['service', 'see', 'issue', 'contribute', 'sam...","[0.00907079130411148, -0.0009530773968435824, ...","[-0.1939252, -0.23459983, -0.2911613, -0.08646...","[0.08069764, -0.01952648, -0.07685122, -0.0537..."


In [87]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import numpy as np


tsne_model = TSNE(
    n_components = 2,
    perplexity = 5,
    random_state = 42,
    init = "pca",
    learning_rate = "auto",
    # metric = "cosine"
)

code_emb_arr = np.array(dubbo_code_df["sow2v_code_vec"].to_list())
test_emb_arr = np.array(dubbo_test_df["sow2v_test_vec"].to_list())
doc_emb_arr = np.array(dubbo_doc_df["sow2v_doc_vec"].to_list())

code_tsne_embeddings = tsne_model.fit_transform(code_emb_arr)
test_tsne_embeddings = tsne_model.fit_transform(test_emb_arr)
doc_tsne_embeddings = tsne_model.fit_transform(doc_emb_arr)

sow2v_code_data = pd.DataFrame(
    {'x': code_tsne_embeddings[:, 0],
     'y': code_tsne_embeddings[:, 1]}
)

sow2v_test_data = pd.DataFrame(
    {'x': test_tsne_embeddings[:, 0],
     'y': test_tsne_embeddings[:, 1]}
)

sow2v_doc_data = pd.DataFrame(
    {'x': doc_tsne_embeddings[:, 0],
     'y': doc_tsne_embeddings[:, 1]}
)

dfs = {
    "code": sow2v_code_data,
    "test": sow2v_test_data,
    "doc": sow2v_doc_data
}

id_list = [dubbo_code_df['concept'].tolist(),
           dubbo_test_df['concept'].tolist(),
           dubbo_doc_df['concept'].tolist()]

fig = go.Figure()

for i, df in enumerate(dfs):
    fig = fig.add_trace(
        go.Scatter(
        x = dfs[df]['x'],
        y = dfs[df]['y'],
        name = df,
        text = id_list[i],
        showlegend = True,
        mode = 'markers',
        opacity = 0.7)
    )

fig.update_layout(
    width = 650,
    height = 650
)

fig.show()

In [88]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import numpy as np

tsne_model = TSNE(
    n_components = 2,
    perplexity = 5,
    random_state = 42,
    init = "pca",
    learning_rate = "auto",
    # metric = "cosine"
)

code_emb_arr = np.array(dubbo_code_df["gpt_code_vec"].to_list())
test_emb_arr = np.array(dubbo_test_df["gpt_test_vec"].to_list())
doc_emb_arr = np.array(dubbo_doc_df["gpt_doc_vec"].to_list())

code_tsne_embeddings = tsne_model.fit_transform(code_emb_arr)
test_tsne_embeddings = tsne_model.fit_transform(test_emb_arr)
doc_tsne_embeddings = tsne_model.fit_transform(doc_emb_arr)

gpt_code_data = pd.DataFrame(
    {'x': code_tsne_embeddings[:, 0],
     'y': code_tsne_embeddings[:, 1]}
)

gpt_test_data = pd.DataFrame(
    {'x': test_tsne_embeddings[:, 0],
     'y': test_tsne_embeddings[:, 1]}
)

gpt_doc_data = pd.DataFrame(
    {'x': doc_tsne_embeddings[:, 0],
     'y': doc_tsne_embeddings[:, 1]}
)

dfs = {
    "code": gpt_code_data,
    "test": gpt_test_data,
    "doc": gpt_doc_data
}

id_list = [dubbo_code_df['concept'].tolist(),
           dubbo_test_df['concept'].tolist(),
           dubbo_doc_df['concept'].tolist()]

fig = go.Figure()

for i, df in enumerate(dfs):
    fig = fig.add_trace(
        go.Scatter(
        x = dfs[df]['x'],
        y = dfs[df]['y'],
        name = df,
        text = id_list[i],
        showlegend = True,
        mode = 'markers',
        opacity = 0.7)
    )

fig.update_layout(
    width = 650,
    height = 650
)

fig.show()

In [89]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import numpy as np

tsne_model = TSNE(
    n_components = 2,
    perplexity = 5,
    random_state = 42,
    init = "pca",
    learning_rate = "auto",
    # metric = "cosine"
)

code_emb_arr = np.array(dubbo_code_df["angle_code_vec"].to_list())
test_emb_arr = np.array(dubbo_test_df["angle_test_vec"].to_list())
doc_emb_arr = np.array(dubbo_doc_df["angle_doc_vec"].to_list())

code_tsne_embeddings = tsne_model.fit_transform(code_emb_arr)
test_tsne_embeddings = tsne_model.fit_transform(test_emb_arr)
doc_tsne_embeddings = tsne_model.fit_transform(doc_emb_arr)

angle_code_data = pd.DataFrame(
    {'x': code_tsne_embeddings[:, 0],
     'y': code_tsne_embeddings[:, 1]}
)

angle_test_data = pd.DataFrame(
    {'x': test_tsne_embeddings[:, 0],
     'y': test_tsne_embeddings[:, 1]}
)

angle_doc_data = pd.DataFrame(
    {'x': doc_tsne_embeddings[:, 0],
     'y': doc_tsne_embeddings[:, 1]}
)

dfs = {
    "code": angle_code_data,
    "test": angle_test_data,
    "doc": angle_doc_data
}

id_list = [dubbo_code_df['concept'].tolist(),
           dubbo_test_df['concept'].tolist(),
           dubbo_doc_df['concept'].tolist()]

fig = go.Figure()

for i, df in enumerate(dfs):
    fig = fig.add_trace(
        go.Scatter(
        x = dfs[df]['x'],
        y = dfs[df]['y'],
        name = df,
        text = id_list[i],
        showlegend = True,
        mode = 'markers',
        opacity = 0.7)
    )

fig.update_layout(
    width = 650,
    height = 650
)

fig.show()