## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

## Show topics

In [1]:
import pandas as pd

topics_df = pd.read_csv("topics_res_df.csv")
topics_df[20:25]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
20,03_flink,flink-clients,8,9,"[(0, [('jar', 0.10477287), ('url', 0.06690521)...","[['jar', 'url', 'entry', 'program', 'setting',...","[(0, [('cluster', 0.11344488), ('factory', 0.0...","[['jar', 'url', 'entry', 'program', 'setting',...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
21,03_flink,flink-connectors,9,9,"[(0, [('split', 0.17729564), ('source', 0.0911...","[['split', 'source', 'reader', 'hive', 'partit...","[(0, [('kafka', 0.047333054), ('partition', 0....","[['split', 'source', 'reader', 'hive', 'partit...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
22,03_flink,flink-container,5,6,"[(0, [('cluster', 0.043478318), ('application'...","[['cluster', 'application', 'entry', 'line', '...","[(0, [('application', 0.08333337), ('standalon...","[['cluster', 'application', 'entry', 'line', '...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
23,03_flink,flink-contrib,9,8,"[(0, [('event', 0.015625862), ('edit', 0.01559...","[['event', 'edit', 'diff', 'timestamp', 'chann...","[(0, [('context', 0.04166667), ('source', 0.04...","[['event', 'edit', 'diff', 'timestamp', 'chann...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
24,03_flink,flink-core,7,9,"[(0, [('key', 0.10738443), ('comparator', 0.09...","[['key', 'comparator', 'value', 'normalize', '...","[(0, [('map', 0.13233617), ('integer', 0.10897...","[['key', 'comparator', 'value', 'normalize', '...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."


## Show annotated modules

In [2]:
anno_df = pd.read_csv("module_annotation.csv")
anno_df = anno_df[["project", "module", "top", "labels"]]
anno_df.head()

Unnamed: 0,project,module,top,labels
0,dubbo,dubbo-configcenter,big data,"['big data', 'instant messaging', 'user interf..."
1,dubbo,dubbo-remoting,server,"['server', 'instant messaging', 'web service',..."
2,dubbo,dubbo-spring-boot,microservices,"['microservices', 'web service', 'instant mess..."
3,dubbo,dubbo-serialization,serialization,"['serialization', 'database', 'file system', '..."
4,dubbo,dubbo-native,web server,"['web server', 'instant messaging', 'web servi..."


In [3]:
anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   project  483 non-null    object
 1   module   483 non-null    object
 2   top      483 non-null    object
 3   labels   483 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


## Take a subset of df as test

In [4]:
# skywalking apm-protocol as an example

topics_sub_df = topics_df[topics_df["module"]=="apm-protocol"]
topics_sub_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
17,02_skywalking,apm-protocol,8,1,"[(0, [('command', 0.0175439), ('serializable',...","[['command', 'serializable', 'deserializable',...","[(0, [('command', 0.11111111), ('complete', 0....","[['command', 'serializable', 'deserializable',...",9,"[(0, [('trace', 0.0060752206), ('support', 0.0...","[['trace', 'support', 'metric', 'mail', 'nativ..."


In [5]:
anno_sub_df = anno_df[anno_df["module"]=="apm-protocol"]
anno_sub_df

Unnamed: 0,project,module,top,labels
26,skywalking,apm-protocol,server,"['server', 'plot', 'instant messaging', 'websi..."


In [6]:
# pd.set_option('display.max_colwidth', None)
module_df = topics_sub_df.merge(anno_sub_df).drop(columns=["project"])
module_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   project_name       1 non-null      object
 1   module             1 non-null      object
 2   code_num_topics    1 non-null      int64 
 3   test_num_topics    1 non-null      int64 
 4   code_shown_topics  1 non-null      object
 5   code_topics        1 non-null      object
 6   test_shown_topics  1 non-null      object
 7   test_topics        1 non-null      object
 8   doc_num_topics     1 non-null      int64 
 9   doc_shown_topics   1 non-null      object
 10  doc_topics         1 non-null      object
 11  top                1 non-null      object
 12  labels             1 non-null      object
dtypes: int64(3), object(10)
memory usage: 236.0+ bytes


## LLM Matching of AD to Concepts

### using apm-protocol module as an example

### Using embeddings then calculate semantic similarity to match with domains

- StackOverflow w2v
- all-MiniLM-L6-v2-f16 (GPT4all + scikit-LLM)
- text-embedding-ada-002 (openAI + scikit-LLM)

In [7]:
concept_list = module_df['code_topics'][0]
domains = module_df['labels'][0]

In [8]:
concept_list

"[['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network'], ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace'], ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command'], ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump'], ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize'], ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid'], ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample'], ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']]"

In [9]:
domains

"['server', 'plot', 'instant messaging', 'website', 'file system', 'web server', 'database', 'command-line interface', 'World Wide Web', 'package management system', 'application performance management', 'client', 'web service', 'File Transfer Protocol', 'shell tool', 'user interface', 'telecommunications network', 'HTTP server', 'computer configuration', 'data binding', 'big data', 'extract, transform, load', 'object detection', 'data', 'security', 'web application', 'regular expression', 'data structure', 'web application security', 'smart contract', 'statistics', 'machine translation', 'social network', 'pattern matching', 'network monitoring', 'microservices', 'network security', 'time series', 'continuous integration', 'analytics', 'automation', 'object–relational mapping', 'HTTP client', 'neural machine translation', 'password manager', 'back end', 'operating system', 'WebSocket', 'embedded system', 'game server', 'font', 'evolutionary algorithm', 'data visualization', 'face dete

In [10]:
import ast

cl = ast.literal_eval(concept_list)
dl = ast.literal_eval(domains)

### StackOverflow W2V

In [30]:
from gensim.models.keyedvectors import KeyedVectors

so_w2v_model = KeyedVectors.load_word2vec_format('C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin', binary=True)

In [31]:
# test
so_w2v_model.n_similarity(['test', 'case'], ['quality'])

0.1252212

In [32]:
def comp_con_domains_sow2v(so_w2v_model, con, domains):
    # list of domains should be given, now we return an array of similarity scores for all domains
    sim_res = []

    for dom in domains:
        sim_res.append((dom, so_w2v_model.n_similarity(con, [dom])))

    sim_df = pd.DataFrame(sim_res, columns=['domain','sim_score']).sort_values(by='sim_score', ascending=False).reset_index(drop=True)
    sim_df = sim_df[sim_df['sim_score']>0]

    return sim_df

In [33]:
sow2v_domain_res = []

for con in cl:
    df_res = comp_con_domains_sow2v(so_w2v_model, con, dl)
    df_res['concept'] = str(con)
    df_res = df_res[['concept', 'domain', 'sim_score']]
    sow2v_domain_res.append(df_res)

In [34]:
sow2v_domain_res[0]

Unnamed: 0,concept,domain,sim_score
0,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",data,0.434844
1,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",database,0.424948
2,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",server,0.420506
3,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",client,0.399335
4,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",scheduler,0.373519
...,...,...,...
58,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",routing,0.027414
59,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",visualization,0.027070
60,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",cryptography,0.025774
61,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",engineering,0.024542


In [35]:
pd.set_option('display.max_colwidth', None)
sow2v_domain_match_df = pd.concat(sow2v_domain_res).reset_index(drop=True)
sow2v_domain_match_df

Unnamed: 0,concept,domain,sim_score
0,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",data,0.434844
1,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",database,0.424948
2,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",server,0.420506
3,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",client,0.399335
4,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",scheduler,0.373519
...,...,...,...
425,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",music,0.044551
426,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",cryptography,0.040918
427,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",virtualization,0.037310
428,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",documentation,0.034948


In [36]:
sow2v_domain_match_df.to_csv("sow2v_domain_match_df.csv", index=False)

### scikit-LLM (GPT4All all-MiniLM-L6-v2-f16)

In [None]:
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All()
output = embedder.embed(text)
print(output)

### scikit-LLM (OpenAI text-embedding-ada-002)

In [17]:
from skllm.preprocessing import GPTVectorizer
from skllm.config import SKLLMConfig
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dotenv import load_dotenv, find_dotenv
import os

In [18]:
_ = load_dotenv(find_dotenv()) # read local .env file

In [19]:
SKLLMConfig.set_openai_key(os.getenv("API_KEY"))

In [20]:
model = GPTVectorizer()

vectors = model.fit_transform(["how old are you?", "what is your age?"])

vector_1 = np.array(vectors[0]).reshape(1, -1)
vector_2 = np.array(vectors[1]).reshape(1, -1)

100%|██████████| 2/2 [00:01<00:00,  1.13it/s]


In [21]:
cosine_similarity(vector_1, vector_2)

array([[0.94791596]])

In [22]:
import time

def comp_con_domains_gpt(gpt_model, con, domains):
    # list of domains should be given, now we return an array of similarity scores for all domains
    sim_res = []

    print("Working on concept {}".format(str(con)))
    print("No. of domains to run against: {}".format(str(len(domains))))

    for dom in domains:
        str_con = " ".join(con)

        vectors = gpt_model.fit_transform([str_con, dom])

        vector_1 = np.array(vectors[0]).reshape(1, -1)
        vector_2 = np.array(vectors[1]).reshape(1, -1)

        sim_res.append((dom, cosine_similarity(vector_1, vector_2)))
        time.sleep(1)

    sim_df = pd.DataFrame(sim_res, columns=['domain','sim_score']).sort_values(by='sim_score', ascending=False).reset_index(drop=True)
    sim_df = sim_df[sim_df['sim_score']>0]

    return sim_df

In [23]:
gpt_domain_res = []
gpt_model = GPTVectorizer()

for con in cl:
    df_res = comp_con_domains_gpt(gpt_model, con, dl)
    df_res['concept'] = str(con)
    df_res = df_res[['concept', 'domain', 'sim_score']]
    gpt_domain_res.append(df_res)

Working on concept ['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  4.19it/s]
100%|██████████| 2/2 [00:00<00:00,  4.55it/s]
100%|██████████| 2/2 [00:00<00:00,  4.43it/s]
100%|██████████| 2/2 [00:00<00:00,  3.08it/s]
100%|██████████| 2/2 [00:00<00:00,  4.42it/s]
100%|██████████| 2/2 [00:00<00:00,  4.25it/s]
100%|██████████| 2/2 [00:00<00:00,  3.00it/s]
100%|██████████| 2/2 [00:00<00:00,  4.55it/s]
100%|██████████| 2/2 [00:00<00:00,  3.72it/s]
100%|██████████| 2/2 [00:01<00:00,  1.48it/s]
100%|██████████| 2/2 [00:00<00:00,  4.40it/s]
100%|██████████| 2/2 [00:00<00:00,  4.15it/s]
100%|██████████| 2/2 [00:00<00:00,  3.69it/s]
100%|██████████| 2/2 [00:00<00:00,  4.34it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]
100%|██████████| 2/2 [00:00<00:00,  4.57it/s]
100%|██████████| 2/2 [00:00<00:00,  4.16it/s]
100%|██████████| 2/2 [00:00<00:00,  2.06it/s]
100%|██████████| 2/2 [00:00<00:00,  4.23it/s]
100%|██████████| 2/2 [00:00<00:00,  4.37it/s]
100%|██████████| 2/2 [00:00<00:00,  4.35it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  3.86it/s]
100%|██████████| 2/2 [00:00<00:00,  4.24it/s]
100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
100%|██████████| 2/2 [00:01<00:00,  1.72it/s]
100%|██████████| 2/2 [00:00<00:00,  3.33it/s]
100%|██████████| 2/2 [00:00<00:00,  3.77it/s]
100%|██████████| 2/2 [00:00<00:00,  3.20it/s]
100%|██████████| 2/2 [00:00<00:00,  3.07it/s]
100%|██████████| 2/2 [00:00<00:00,  4.73it/s]
100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
100%|██████████| 2/2 [00:00<00:00,  4.18it/s]
100%|██████████| 2/2 [00:00<00:00,  4.74it/s]
100%|██████████| 2/2 [00:00<00:00,  4.66it/s]
100%|██████████| 2/2 [00:00<00:00,  4.74it/s]
100%|██████████| 2/2 [00:00<00:00,  4.60it/s]
100%|██████████| 2/2 [00:05<00:00,  2.62s/it]
100%|██████████| 2/2 [00:01<00:00,  1.14it/s]
100%|██████████| 2/2 [00:00<00:00,  4.55it/s]
100%|██████████| 2/2 [00:00<00:00,  4.40it/s]
100%|██████████| 2/2 [00:00<00:00,  4.03it/s]
100%|██████████| 2/2 [00:00<00:00,  3.05it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  4.60it/s]
100%|██████████| 2/2 [00:00<00:00,  4.34it/s]
100%|██████████| 2/2 [00:00<00:00,  4.52it/s]
100%|██████████| 2/2 [00:00<00:00,  4.82it/s]
100%|██████████| 2/2 [00:00<00:00,  4.58it/s]
100%|██████████| 2/2 [00:00<00:00,  3.55it/s]
100%|██████████| 2/2 [00:00<00:00,  3.56it/s]
100%|██████████| 2/2 [00:00<00:00,  4.74it/s]
100%|██████████| 2/2 [00:00<00:00,  4.67it/s]
100%|██████████| 2/2 [00:00<00:00,  3.32it/s]
100%|██████████| 2/2 [00:00<00:00,  4.43it/s]
100%|██████████| 2/2 [00:00<00:00,  4.20it/s]
100%|██████████| 2/2 [00:00<00:00,  3.16it/s]
100%|██████████| 2/2 [00:00<00:00,  4.37it/s]
100%|██████████| 2/2 [00:00<00:00,  3.65it/s]
100%|██████████| 2/2 [00:00<00:00,  4.41it/s]
100%|██████████| 2/2 [00:01<00:00,  1.63it/s]
100%|██████████| 2/2 [00:00<00:00,  4.09it/s]
100%|██████████| 2/2 [00:00<00:00,  3.53it/s]
100%|██████████| 2/2 [00:00<00:00,  3.98it/s]
100%|██████████| 2/2 [00:00<00:00,  4.07it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump']
No. of domains to run against: 267


100%|██████████| 2/2 [00:01<00:00,  1.18it/s]
100%|██████████| 2/2 [00:00<00:00,  3.64it/s]
100%|██████████| 2/2 [00:00<00:00,  4.78it/s]
100%|██████████| 2/2 [00:00<00:00,  3.98it/s]
100%|██████████| 2/2 [00:00<00:00,  4.56it/s]
100%|██████████| 2/2 [00:00<00:00,  4.20it/s]
100%|██████████| 2/2 [00:00<00:00,  3.84it/s]
100%|██████████| 2/2 [00:00<00:00,  4.26it/s]
100%|██████████| 2/2 [00:00<00:00,  3.15it/s]
100%|██████████| 2/2 [00:00<00:00,  4.33it/s]
100%|██████████| 2/2 [00:00<00:00,  4.34it/s]
100%|██████████| 2/2 [00:00<00:00,  3.06it/s]
100%|██████████| 2/2 [00:00<00:00,  3.56it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]
100%|██████████| 2/2 [00:00<00:00,  4.24it/s]
100%|██████████| 2/2 [00:00<00:00,  4.60it/s]
100%|██████████| 2/2 [00:00<00:00,  4.58it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]
100%|██████████| 2/2 [00:00<00:00,  4.24it/s]
100%|██████████| 2/2 [00:00<00:00,  4.63it/s]
100%|██████████| 2/2 [00:00<00:00,  4.17it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  4.70it/s]
100%|██████████| 2/2 [00:00<00:00,  4.52it/s]
100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
100%|██████████| 2/2 [00:00<00:00,  2.60it/s]
100%|██████████| 2/2 [00:00<00:00,  4.59it/s]
100%|██████████| 2/2 [00:00<00:00,  3.57it/s]
100%|██████████| 2/2 [00:00<00:00,  3.35it/s]
100%|██████████| 2/2 [00:00<00:00,  2.77it/s]
100%|██████████| 2/2 [00:00<00:00,  4.00it/s]
100%|██████████| 2/2 [00:00<00:00,  2.87it/s]
100%|██████████| 2/2 [00:00<00:00,  4.31it/s]
100%|██████████| 2/2 [00:00<00:00,  4.27it/s]
100%|██████████| 2/2 [00:00<00:00,  4.20it/s]
100%|██████████| 2/2 [00:00<00:00,  4.37it/s]
100%|██████████| 2/2 [00:00<00:00,  4.58it/s]
100%|██████████| 2/2 [00:00<00:00,  4.69it/s]
100%|██████████| 2/2 [00:00<00:00,  4.22it/s]
100%|██████████| 2/2 [00:00<00:00,  4.66it/s]
100%|██████████| 2/2 [00:00<00:00,  4.62it/s]
100%|██████████| 2/2 [00:00<00:00,  4.23it/s]
100%|██████████| 2/2 [00:00<00:00,  4.60it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  3.16it/s]
100%|██████████| 2/2 [00:00<00:00,  4.05it/s]
100%|██████████| 2/2 [00:00<00:00,  3.58it/s]
100%|██████████| 2/2 [00:00<00:00,  4.11it/s]
100%|██████████| 2/2 [00:00<00:00,  4.47it/s]
100%|██████████| 2/2 [00:00<00:00,  4.34it/s]
100%|██████████| 2/2 [00:00<00:00,  4.15it/s]
100%|██████████| 2/2 [00:00<00:00,  4.21it/s]
100%|██████████| 2/2 [00:00<00:00,  3.27it/s]
100%|██████████| 2/2 [00:00<00:00,  2.96it/s]
100%|██████████| 2/2 [00:00<00:00,  4.27it/s]
100%|██████████| 2/2 [00:00<00:00,  3.72it/s]
100%|██████████| 2/2 [00:00<00:00,  4.65it/s]
100%|██████████| 2/2 [00:00<00:00,  3.70it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]
100%|██████████| 2/2 [00:00<00:00,  3.77it/s]
100%|██████████| 2/2 [00:00<00:00,  4.22it/s]
100%|██████████| 2/2 [00:00<00:00,  4.17it/s]
100%|██████████| 2/2 [00:00<00:00,  4.68it/s]
100%|██████████| 2/2 [00:00<00:00,  4.45it/s]
100%|██████████| 2/2 [00:00<00:00,  3.96it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  4.66it/s]
100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
100%|██████████| 2/2 [00:00<00:00,  4.27it/s]
100%|██████████| 2/2 [00:00<00:00,  4.25it/s]
100%|██████████| 2/2 [00:00<00:00,  2.90it/s]
100%|██████████| 2/2 [00:00<00:00,  2.70it/s]
100%|██████████| 2/2 [00:00<00:00,  2.24it/s]
100%|██████████| 2/2 [00:00<00:00,  2.20it/s]
100%|██████████| 2/2 [00:00<00:00,  3.89it/s]
100%|██████████| 2/2 [00:00<00:00,  4.59it/s]
100%|██████████| 2/2 [00:00<00:00,  4.55it/s]
100%|██████████| 2/2 [00:00<00:00,  4.38it/s]
100%|██████████| 2/2 [00:00<00:00,  4.59it/s]
100%|██████████| 2/2 [00:00<00:00,  4.65it/s]
100%|██████████| 2/2 [00:00<00:00,  3.88it/s]
100%|██████████| 2/2 [00:00<00:00,  3.43it/s]
100%|██████████| 2/2 [00:00<00:00,  4.37it/s]
100%|██████████| 2/2 [00:00<00:00,  3.10it/s]
100%|██████████| 2/2 [00:00<00:00,  3.80it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]
100%|██████████| 2/2 [00:00<00:00,  4.33it/s]
100%|██████████| 2/2 [00:00<00:00,

Working on concept ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']
No. of domains to run against: 267


100%|██████████| 2/2 [00:00<00:00,  3.20it/s]
100%|██████████| 2/2 [00:00<00:00,  3.43it/s]
100%|██████████| 2/2 [00:01<00:00,  1.53it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]
100%|██████████| 2/2 [00:00<00:00,  2.80it/s]
100%|██████████| 2/2 [00:00<00:00,  2.66it/s]
100%|██████████| 2/2 [00:01<00:00,  1.53it/s]
100%|██████████| 2/2 [00:04<00:00,  2.09s/it]
100%|██████████| 2/2 [00:00<00:00,  4.60it/s]
100%|██████████| 2/2 [00:00<00:00,  4.35it/s]
100%|██████████| 2/2 [00:00<00:00,  4.56it/s]
100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
100%|██████████| 2/2 [00:00<00:00,  4.40it/s]
100%|██████████| 2/2 [00:00<00:00,  3.59it/s]
100%|██████████| 2/2 [00:00<00:00,  4.28it/s]
100%|██████████| 2/2 [00:00<00:00,  3.61it/s]
100%|██████████| 2/2 [00:00<00:00,  4.25it/s]
100%|██████████| 2/2 [00:00<00:00,  3.74it/s]
100%|██████████| 2/2 [00:00<00:00,  4.84it/s]
100%|██████████| 2/2 [00:00<00:00,  4.40it/s]
100%|██████████| 2/2 [00:00<00:00,  4.50it/s]
100%|██████████| 2/2 [00:01<00:00,

In [26]:
gpt_domain_res[0]

Unnamed: 0,concept,domain,sim_score
0,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",serialization,[[0.7871095032472926]]
1,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",computer benchmarking,[[0.7767717413135821]]
2,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network monitoring,[[0.776385586986835]]
3,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",command-line interface,[[0.7755979170428933]]
4,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network analysis,[[0.7733984086127061]]
...,...,...,...
262,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",banking industry,[[0.7011476762806811]]
263,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",reinforcement learning,[[0.6995509128730848]]
264,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",face detection,[[0.6994477931073957]]
265,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",mathematical finance,[[0.6963970813150013]]


In [28]:
pd.set_option('display.max_colwidth', None)
gpt_domain_match_df = pd.concat(gpt_domain_res).reset_index(drop=True)
gpt_domain_match_df

Unnamed: 0,concept,domain,sim_score
0,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",serialization,[[0.7871095032472926]]
1,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",computer benchmarking,[[0.7767717413135821]]
2,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network monitoring,[[0.776385586986835]]
3,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",command-line interface,[[0.7755979170428933]]
4,"['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network']",network analysis,[[0.7733984086127061]]
...,...,...,...
2131,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",audio signal processing,[[0.6816538430905145]]
2132,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",banking industry,[[0.6804565451745261]]
2133,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",word embedding,[[0.6765824222485524]]
2134,"['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']",reinforcement learning,[[0.6748938159319278]]


In [29]:
gpt_domain_match_df.to_csv("gpt_domain_match_df.csv", index=False)