## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyse the concepts emerging from three sources: source code, documentation and tests

## Show topics

In [1]:
import pandas as pd
# pd.set_option('display.max_colwidth', None)

topics_df = pd.read_csv("topics_res_df.csv")
# topics_df = topics_df[["project_name", "module", "code_topics", "test_topics", "doc_topics"]]
topics_df[20:25]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
20,03_flink,flink-clients,8,9,"[(0, [('jar', 0.10477287), ('url', 0.06690521)...","[['jar', 'url', 'entry', 'program', 'setting',...","[(0, [('cluster', 0.11344488), ('factory', 0.0...","[['jar', 'url', 'entry', 'program', 'setting',...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
21,03_flink,flink-connectors,9,9,"[(0, [('split', 0.17729564), ('source', 0.0911...","[['split', 'source', 'reader', 'hive', 'partit...","[(0, [('kafka', 0.047333054), ('partition', 0....","[['split', 'source', 'reader', 'hive', 'partit...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
22,03_flink,flink-container,5,6,"[(0, [('cluster', 0.043478318), ('application'...","[['cluster', 'application', 'entry', 'line', '...","[(0, [('application', 0.08333337), ('standalon...","[['cluster', 'application', 'entry', 'line', '...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
23,03_flink,flink-contrib,9,8,"[(0, [('event', 0.015625862), ('edit', 0.01559...","[['event', 'edit', 'diff', 'timestamp', 'chann...","[(0, [('context', 0.04166667), ('source', 0.04...","[['event', 'edit', 'diff', 'timestamp', 'chann...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
24,03_flink,flink-core,7,9,"[(0, [('key', 0.10738443), ('comparator', 0.09...","[['key', 'comparator', 'value', 'normalize', '...","[(0, [('map', 0.13233617), ('integer', 0.10897...","[['key', 'comparator', 'value', 'normalize', '...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."


## Show annotated modules

In [2]:
anno_df = pd.read_csv("module_annotation.csv")
anno_df = anno_df[["project", "module", "top", "labels"]]
anno_df.head()

Unnamed: 0,project,module,top,labels
0,dubbo,dubbo-configcenter,big data,"['big data', 'instant messaging', 'user interf..."
1,dubbo,dubbo-remoting,server,"['server', 'instant messaging', 'web service',..."
2,dubbo,dubbo-spring-boot,microservices,"['microservices', 'web service', 'instant mess..."
3,dubbo,dubbo-serialization,serialization,"['serialization', 'database', 'file system', '..."
4,dubbo,dubbo-native,web server,"['web server', 'instant messaging', 'web servi..."


In [3]:
anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   project  483 non-null    object
 1   module   483 non-null    object
 2   top      483 non-null    object
 3   labels   483 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


## Take a subset of df as test

In [4]:
# skywalking apm-protocol as an example

topics_sub_df = topics_df[topics_df["module"]=="apm-protocol"]
topics_sub_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
17,02_skywalking,apm-protocol,8,1,"[(0, [('command', 0.0175439), ('serializable',...","[['command', 'serializable', 'deserializable',...","[(0, [('command', 0.11111111), ('complete', 0....","[['command', 'serializable', 'deserializable',...",9,"[(0, [('trace', 0.0060752206), ('support', 0.0...","[['trace', 'support', 'metric', 'mail', 'nativ..."


In [5]:
anno_sub_df = anno_df[anno_df["module"]=="apm-protocol"]
anno_sub_df

Unnamed: 0,project,module,top,labels
26,skywalking,apm-protocol,server,"['server', 'plot', 'instant messaging', 'websi..."


In [6]:
# pd.set_option('display.max_colwidth', None)
module_df = topics_sub_df.merge(anno_sub_df).drop(columns=["project"])
module_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   project_name       1 non-null      object
 1   module             1 non-null      object
 2   code_num_topics    1 non-null      int64 
 3   test_num_topics    1 non-null      int64 
 4   code_shown_topics  1 non-null      object
 5   code_topics        1 non-null      object
 6   test_shown_topics  1 non-null      object
 7   test_topics        1 non-null      object
 8   doc_num_topics     1 non-null      int64 
 9   doc_shown_topics   1 non-null      object
 10  doc_topics         1 non-null      object
 11  top                1 non-null      object
 12  labels             1 non-null      object
dtypes: int64(3), object(10)
memory usage: 236.0+ bytes


## LLM Matching of AD to Concepts

In [7]:
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.callbacks import get_openai_callback
from dotenv import load_dotenv, find_dotenv
import os

In [8]:
_ = load_dotenv(find_dotenv()) # read local .env file

In [9]:
sys_template = """You are a helpful assistant tasked to match terms (concepts) to the domain it should belong.
    There may be terms which may belong to more than one domain. Only assume they can only belong to, at most, three.
    You will be given a set of terms in lists that we shall call concepts.
    
    Example of one concept: ['high', 'tall', 'short', 'fat']
    
    This is an example of two domains you have to choose from: [emotion, adjective]
    
    You need to match the concept to the domain it belongs to, which means the most similar in terms of meaning. In this case, your answer should be: [adjective].

    You cannot make up your own domain, you have to *only choose* from the domains given.

    Another example, for the concept "['happy', 'sad']", your answer should be: [emotions, feelings]

    If there is no domain given that matches to the concept, then you should only match it to a 'None' domain. For example: [None]

    Do not output anything else, your answer format must always just be a list of domains, like this: [emotions, feelings]

    Remember to remove all the apostrophe from your response.
    """

In [10]:
concepts = "['market', 'church', 'school']"
domains = "['emotion', 'action', 'building', 'studies', 'places']"

In [11]:
human_template = f"""
    Now, this is the concept I need you to match:

    {concepts}

    And, these are the domains you can choose from:

    {domains}

    Which domains do the concept most belong to?

    Helpful Answer:"""


In [12]:
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", sys_template),
    ("human", human_template),
])

In [13]:
model = AzureChatOpenAI(
    azure_deployment="gpt-35-turbo",
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
    azure_endpoint = os.getenv("AZURE_OPENAI_API_ENDPOINT")
)

with get_openai_callback() as cb:
    response = model(chat_prompt.format_messages(text=human_template))
    print(
        f"Total Cost (USD): ${format(cb.total_cost, '.6f')}"
    ) 

Total Cost (USD): $0.000503


In [14]:
print(response.content)

['building', 'places']


In [15]:
def input_var(concepts, domains):

    human_template = f"""
        Now, this is the concept I need you to match:

        {concepts}

        And, these are the domains you can choose from:

        {domains}

        Which domains do the concept most belong to?

        Helpful Answer:"""

    return human_template

In [16]:
concept_list = module_df['code_topics'][0]
domains = module_df['labels'][0]

In [17]:
concept_list

"[['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network'], ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace'], ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command'], ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump'], ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize'], ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid'], ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample'], ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']]"

In [18]:
domains

"['server', 'plot', 'instant messaging', 'website', 'file system', 'web server', 'database', 'command-line interface', 'World Wide Web', 'package management system', 'application performance management', 'client', 'web service', 'File Transfer Protocol', 'shell tool', 'user interface', 'telecommunications network', 'HTTP server', 'computer configuration', 'data binding', 'big data', 'extract, transform, load', 'object detection', 'data', 'security', 'web application', 'regular expression', 'data structure', 'web application security', 'smart contract', 'statistics', 'machine translation', 'social network', 'pattern matching', 'network monitoring', 'microservices', 'network security', 'time series', 'continuous integration', 'analytics', 'automation', 'object–relational mapping', 'HTTP client', 'neural machine translation', 'password manager', 'back end', 'operating system', 'WebSocket', 'embedded system', 'game server', 'font', 'evolutionary algorithm', 'data visualization', 'face dete

### using apm-protocol module as an example

In [19]:
import ast

cl = ast.literal_eval(concept_list)

In [20]:
match_res = []
total_cost = 0

for con in cl:
    with get_openai_callback() as cb:
        human_template = input_var(con, domains)
        chat_prompt = ChatPromptTemplate.from_messages([
        ("system", sys_template),
        ("human", human_template),
        ])
        response = model(chat_prompt.format_messages(text=human_template))
        # print(
        #     f"Total Cost (USD): ${format(cb.total_cost, '.6f')}"
        # )
        total_cost+=cb.total_cost
        match_res.append([con, response.content])

In [21]:
print("USD$" + str(total_cost))

USD$0.0177635


In [22]:
pd.set_option('display.max_colwidth', None)
domain_res_df = pd.DataFrame(match_res, columns=['concept', 'domain'])
# domain_res_df['domain'] = domain_res_df['domain'].apply(lambda x: ast.literal_eval(x))
domain_res_df

Unnamed: 0,concept,domain
0,"[command, serializable, deserializable, task, number, builder, profile, duration, max, network]","['command-line interface', 'serialization', 'database']"
1,"[command, builder, number, runtime, unsupported, path, add, serializable, serialize, trace]","[command-line interface, file system, software development]"
2,"[trigger, ebpf, fix, gson, extension, process, update, target, task, command]","[command-line interface, software development, computer configuration]"
3,"[task, profile, command, duration, time, min, max, endpoint, count, dump]","['command-line interface', 'data structure', 'database management']"
4,"[command, discovery, uuid, number, deserializable, serializable, key, value, pair, deserialize]","['command-line interface', 'database', 'serialization']"
5,"[command, serializable, deserializable, profile, number, duration, builder, task, unsupported, uuid]","[database, command-line interface, software development]"
6,"[setting, integer, network, rule, max, size, request, require, response, sample]","['file system', 'web application', 'data structure']"
7,"[command, serializable, deserializable, number, builder, task, profile, unsupported, deserializer, max]","['command-line interface', 'database', 'file system']"
