## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

## Show topics

In [3]:
import pandas as pd

topics_df = pd.read_csv("topics_res_df.csv")
topics_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['merger', 'model', 'scope', 'aware', 'end', ...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['map', 'extension', 'loader', 'property', 'm...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['invocation', 'invoker', 'attachment', 'argu...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['application', 'model', 'module', 'context',...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec..."
...,...,...,...,...,...,...,...,...,...,...,...
309,45_archiva-components,spring-apacheds,9,1,"[(0, [('partition', 0.038462438), ('context', ...","[['partition', 'context', 'password', 'port', ...","[(0, [('context', 0.21739134), ('create', 0.13...","[['partition', 'context', 'password', 'port', ...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."
310,45_archiva-components,spring-cache,6,3,"[(0, [('cache', 0.3110731), ('creator', 0.1051...","[['cache', 'creator', 'hint', 'factory', 'key'...","[(0, [('cache', 0.3525324), ('refre', 0.102800...","[['cache', 'creator', 'hint', 'factory', 'key'...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."
311,45_archiva-components,spring-quartz,8,4,"[(0, [('job', 0.29270846), ('context', 0.07773...","[['job', 'context', 'execution', 'map', 'liste...","[(0, [('job', 0.24515942), ('context', 0.16984...","[['job', 'context', 'execution', 'map', 'liste...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."
312,45_archiva-components,spring-registry,9,6,"[(0, [('key', 0.1410248), ('registry', 0.08333...","[['key', 'registry', 'builder', 'add', 'save',...","[(0, [('registry', 0.024453199), ('property', ...","[['key', 'registry', 'builder', 'add', 'save',...",8,"[(0, [('publish', 0.008338352), ('mvn', 0.0083...","[['publish', 'mvn', 'repository', 'checkout', ..."


## Show annotated modules

In [4]:
anno_df = pd.read_csv("module_annotation.csv")
anno_df = anno_df[["project", "module", "top", "labels"]]
anno_df.head()

Unnamed: 0,project,module,top,labels
0,dubbo,dubbo-configcenter,big data,"['big data', 'instant messaging', 'user interf..."
1,dubbo,dubbo-remoting,server,"['server', 'instant messaging', 'web service',..."
2,dubbo,dubbo-spring-boot,microservices,"['microservices', 'web service', 'instant mess..."
3,dubbo,dubbo-serialization,serialization,"['serialization', 'database', 'file system', '..."
4,dubbo,dubbo-native,web server,"['web server', 'instant messaging', 'web servi..."


In [5]:
anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   project  483 non-null    object
 1   module   483 non-null    object
 2   top      483 non-null    object
 3   labels   483 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


In [6]:
# pd.set_option('display.max_colwidth', None)
module_df = topics_df.merge(anno_df).drop(columns=["project"])
module_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   project_name       428 non-null    object
 1   module             428 non-null    object
 2   code_num_topics    428 non-null    int64 
 3   test_num_topics    428 non-null    int64 
 4   code_shown_topics  428 non-null    object
 5   code_topics        428 non-null    object
 6   test_shown_topics  428 non-null    object
 7   test_topics        428 non-null    object
 8   doc_num_topics     428 non-null    int64 
 9   doc_shown_topics   428 non-null    object
 10  doc_topics         428 non-null    object
 11  top                428 non-null    object
 12  labels             428 non-null    object
dtypes: int64(3), object(10)
memory usage: 43.6+ KB


In [7]:
module_df[:5]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics,top,labels
0,01_dubbo,dubbo-cluster,9,6,"[(0, [('merger', 0.11644696), ('model', 0.0760...","[['merger', 'model', 'scope', 'aware', 'end', ...","[(0, [('invoker', 0.10217771), ('hello', 0.059...","[['merger', 'model', 'scope', 'aware', 'end', ...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...",big data,"['big data', 'instant messaging', 'microservic..."
1,01_dubbo,dubbo-common,9,9,"[(0, [('map', 0.071077), ('extension', 0.05852...","[['map', 'extension', 'loader', 'property', 'm...","[(0, [('address', 0.07335682), ('country', 0.0...","[['map', 'extension', 'loader', 'property', 'm...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...",database,"['database', 'instant messaging', 'big data', ..."
2,01_dubbo,dubbo-compatible,8,9,"[(0, [('invocation', 0.17978409), ('invoker', ...","[['invocation', 'invoker', 'attachment', 'argu...","[(0, [('consumer', 0.08403326), ('service', 0....","[['invocation', 'invoker', 'attachment', 'argu...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...",computer configuration,"['computer configuration', 'microservices', 'w..."
3,01_dubbo,dubbo-config,7,8,"[(0, [('application', 0.19198503), ('model', 0...","[['application', 'model', 'module', 'context',...","[(0, [('box', 0.0995611), ('service', 0.097135...","[['application', 'model', 'module', 'context',...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...",instant messaging,"['instant messaging', 'web service', 'microser..."
4,01_dubbo,dubbo-configcenter,6,7,"[(0, [('dynamic', 0.2249236), ('factory', 0.12...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[(0, [('namespace', 0.01471425), ('map', 0.014...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...",8,"[(0, [('service', 0.005111123), ('see', 0.0051...","[['service', 'see', 'issue', 'sample', 'projec...",big data,"['big data', 'instant messaging', 'user interf..."


## LLM Matching of AD to Concepts

### Using embeddings then calculate semantic similarity to match with domains

- StackOverflow w2v
- text-embedding-ada-002 (openAI + scikit-LLM)

In [8]:
code_concept_list = module_df['code_topics']
test_concept_list = module_df['test_topics']
doc_concept_list = module_df['doc_topics']
domains_list = module_df['labels']

In [9]:
proj_names = module_df[['project_name', 'module']]
proj_names

Unnamed: 0,project_name,module
0,01_dubbo,dubbo-cluster
1,01_dubbo,dubbo-common
2,01_dubbo,dubbo-compatible
3,01_dubbo,dubbo-config
4,01_dubbo,dubbo-configcenter
...,...,...
423,45_archiva-components,spring-apacheds
424,45_archiva-components,spring-cache
425,45_archiva-components,spring-quartz
426,45_archiva-components,spring-registry


In [10]:
len(code_concept_list)

428

In [11]:
len(domains_list)

428

## StackOverflow W2V

In [12]:
from gensim.models.keyedvectors import KeyedVectors

so_w2v_model = KeyedVectors.load_word2vec_format('C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin', binary=True)

In [13]:
# test
so_w2v_model.n_similarity(['test', 'case'], ['quality'])

0.1252212

In [31]:
def comp_con_domains_sow2v(so_w2v_model, con, domains):
    # list of domains should be given, now we return an array of similarity scores for all domains
    # one concept matching against all the domains for that concept
    sim_res = []

    for dom in domains:
        print(domains)
        sim_score = (dom, so_w2v_model.n_similarity(con, [dom]))
        sim_res.append(sim_score)
        print(sim_score)
        break

    sim_df = pd.DataFrame(sim_res, columns=['domain','sim_score']).sort_values(by='sim_score', ascending=False).reset_index(drop=True)
    # sim_df = sim_df[sim_df['sim_score']>0]

    return sim_df

### Match code concepts to domains

In [32]:
import ast

sow2v_code_domain_res = []

for i, con in enumerate(code_concept_list):

    c = ast.literal_eval(con)

    for con_str in c:

        # print("CONCEPT: " + str(con_str))
        # print(domains_list[i])

        df_res = comp_con_domains_sow2v(so_w2v_model, con_str, domains_list[i])

        print(df_res)

        df_res['concept'] = str(con_str)
        df_res = df_res[['concept', 'domain', 'sim_score']]
        sow2v_code_domain_res.append(df_res)
        break

    break

['big data', 'instant messaging', 'microservices', 'router', 'web service', 'cloud computing', 'web server', 'database', 'server', 'user interface', 'file system', 'client', 'data binding', 'object detection', 'distributed computing', 'data', 'data structure', 'analytics', 'web application security', 'penetration test', 'HTTP server', 'package management system', 'security', 'File Transfer Protocol', 'pattern matching', 'website', 'automation', 'regular expression', 'software testing', 'streaming media', 'natural language', 'web application', 'data compression', 'support vector machine', 'smart contract', 'command-line interface', 'machine learning', 'language model', 'World Wide Web', 'application performance management', 'Web Components', 'back end', 'test automation', 'telecommunications network', 'time series', 'graphical user interface', 'computer data storage', 'unit testing', 'static site generator', 'HTTP client', 'algorithm', 'web browser engine', 'object–relational mapping', 

In [44]:
sow2v_domain_res

[Empty DataFrame
 Columns: [concept, domain, sim_score]
 Index: []]

In [34]:
pd.set_option('display.max_colwidth', None)
sow2v_domain_match_df = pd.concat(sow2v_domain_res).reset_index(drop=True)
sow2v_domain_match_df = pd.concat([proj_names, sow2v_domain_match_df], axis=1)
sow2v_domain_match_df

Unnamed: 0,project_name,module,concept,domain,sim_score
0,01_dubbo,dubbo-cluster,,,
1,01_dubbo,dubbo-common,,,
2,01_dubbo,dubbo-compatible,,,
3,01_dubbo,dubbo-config,,,
4,01_dubbo,dubbo-configcenter,,,
...,...,...,...,...,...
423,45_archiva-components,spring-apacheds,,,
424,45_archiva-components,spring-cache,,,
425,45_archiva-components,spring-quartz,,,
426,45_archiva-components,spring-registry,,,


In [35]:
sow2v_domain_match_df.to_csv("sow2v_domain_match_df.csv", index=False)

### scikit-LLM (GPT4All all-MiniLM-L6-v2-f16)

In [26]:
# from gpt4all import Embed4All
# text = 'The quick brown fox jumps over the lazy dog'
# embedder = Embed4All()
# output = embedder.embed(text)
# print(output)

### scikit-LLM (OpenAI text-embedding-ada-002)

In [27]:
from skllm.models.gpt.vectorization import GPTVectorizer
from skllm.config import SKLLMConfig
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dotenv import load_dotenv, find_dotenv
import os

In [28]:
_ = load_dotenv(find_dotenv()) # read local .env file

In [29]:
SKLLMConfig.set_openai_key(os.getenv("API_KEY"))
SKLLMConfig.set_openai_org(os.getenv("ORG"))

In [31]:
model = GPTVectorizer()

vectors = model.fit_transform(["how old are you?", "what is your age?"])

vector_1 = np.array(vectors[0]).reshape(1, -1)
vector_2 = np.array(vectors[1]).reshape(1, -1)

Batch size: 1


100%|██████████| 2/2 [00:02<00:00,  1.40s/it]


In [32]:
vector_1

array([[ 0.01409033, -0.01473137,  0.02737622, ...,  0.01300936,
         0.00062651, -0.0186279 ]])

In [33]:
vector_2

array([[ 0.01086601, -0.01645278,  0.00522158, ...,  0.00467059,
        -0.01438978, -0.0314448 ]])

In [34]:
cosine_similarity(vector_1, vector_2)

array([[0.94791596]])

In [None]:
import time

dom_vec_list = []

for dom in dl:
    dom_vectors = model.fit_transform([dom])
    dom_vec = np.array(dom_vectors[0]).reshape(1, -1)
    dom_vec_list.append((dom, dom_vec))
    time.sleep(0.5)

In [41]:
len(dom_vec_list)

267

In [48]:
def comp_con_domains_gpt(gpt_model, con, dom_vec_list):
    # list of domains should be given, now we return an array of similarity scores for all domains
    sim_res = []

    print("Working on concept {}".format(str(con)))
    print("No. of domains to run against: {}".format(str(len(dom_vec_list))))

    str_con = " ".join(con)

    con_vectors = gpt_model.fit_transform([str_con])
    con_vec = np.array(con_vectors[0]).reshape(1, -1)

    print(dom_vec_list[0][1])
    for dom_vec in dom_vec_list:
        sim_score = cosine_similarity(con_vec, dom_vec[1])
        sim_res.append((dom_vec[0], sim_score))
        
    sim_df = pd.DataFrame(sim_res, columns=['domain','sim_score']).sort_values(by='sim_score', ascending=False).reset_index(drop=True)
    sim_df = sim_df[sim_df['sim_score']>0]

    return sim_df

In [49]:
gpt_domain_res = []
gpt_model = GPTVectorizer()

for con in cl:
    df_res = comp_con_domains_gpt(gpt_model, con, dom_vec_list)
    df_res['concept'] = str(con)
    df_res = df_res[['concept', 'domain', 'sim_score']]
    gpt_domain_res.append(df_res)
    time.sleep(0.5)

Working on concept ['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.10it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.50it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.37it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.66it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.54it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.63it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]
Working on concept ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']
No. of domains to run against: 267
Batch size: 1


100%|██████████| 1/1 [00:00<00:00,  1.45it/s]


[[-0.01526114 -0.02739326  0.00470422 ... -0.02261437 -0.00247834
   0.00118139]]


In [50]:
gpt_domain_res[0]

Unnamed: 0,concept,domain,sim_score
0,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",serialization,[[0.7871095032472926]]
1,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",computer benchmarking,[[0.7767717413135821]]
2,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network monitoring,[[0.776385586986835]]
3,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",command-line interface,[[0.7755979170428933]]
4,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network analysis,[[0.7733984086127061]]
...,...,...,...
262,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",banking industry,[[0.7011476762806811]]
263,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",reinforcement learning,[[0.6995509128730848]]
264,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",face detection,[[0.6994477931073957]]
265,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",mathematical finance,[[0.6963970813150013]]


In [51]:
pd.set_option('display.max_colwidth', None)
gpt_domain_match_df = pd.concat(gpt_domain_res).reset_index(drop=True)
gpt_domain_match_df

Unnamed: 0,concept,domain,sim_score
0,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",serialization,[[0.7871095032472926]]
1,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",computer benchmarking,[[0.7767717413135821]]
2,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network monitoring,[[0.776385586986835]]
3,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",command-line interface,[[0.7755979170428933]]
4,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network analysis,[[0.7733984086127061]]
...,...,...,...
2131,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",audio signal processing,[[0.6813674366676089]]
2132,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",banking industry,[[0.6803217919213869]]
2133,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",word embedding,[[0.6765283827069019]]
2134,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",reinforcement learning,[[0.6748938159319278]]


In [52]:
gpt_domain_match_df.to_csv("gpt_domain_match_df.csv", index=False)