## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyses the concepts emerging from three sources: source code, documentation, and tests.

## Show topics

In [14]:
import pandas as pd
# pd.set_option('display.max_colwidth', None)

# Load the per-module topic table and keep only the identifying columns plus
# the three topic sources (code, tests, documentation).
topic_columns = ["project_name", "module", "code_topics", "test_topics", "doc_topics"]
topics_df = pd.read_csv("topics_res_df.csv").loc[:, topic_columns]

# Preview the first 25 modules.
topics_df.head(25)

Unnamed: 0,project_name,module,code_topics,test_topics,doc_topics
0,01_dubbo,dubbo-cluster,"[['merger', 'model', 'scope', 'aware', 'end', ...","[['merger', 'model', 'scope', 'aware', 'end', ...","[['service', 'see', 'issue', 'sample', 'projec..."
1,01_dubbo,dubbo-common,"[['map', 'extension', 'loader', 'property', 'm...","[['map', 'extension', 'loader', 'property', 'm...","[['service', 'see', 'issue', 'sample', 'projec..."
2,01_dubbo,dubbo-compatible,"[['invocation', 'invoker', 'attachment', 'argu...","[['invocation', 'invoker', 'attachment', 'argu...","[['service', 'see', 'issue', 'sample', 'projec..."
3,01_dubbo,dubbo-config,"[['application', 'model', 'module', 'context',...","[['application', 'model', 'module', 'context',...","[['service', 'see', 'issue', 'sample', 'projec..."
4,01_dubbo,dubbo-configcenter,"[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[['dynamic', 'factory', 'url', 'zookeeper', 'a...","[['service', 'see', 'issue', 'sample', 'projec..."
5,01_dubbo,dubbo-container,"[['container', 'spring', 'error', 'extension',...","[['container', 'spring', 'error', 'extension',...","[['service', 'see', 'issue', 'sample', 'projec..."
6,01_dubbo,dubbo-filter,"[['method', 'parameter', 'validator', 'clazz',...","[['method', 'parameter', 'validator', 'clazz',...","[['service', 'see', 'issue', 'sample', 'projec..."
7,01_dubbo,dubbo-kubernetes,"[['service', 'map', 'instance', 'listener', 'a...","[['service', 'map', 'instance', 'listener', 'a...","[['service', 'see', 'issue', 'sample', 'projec..."
8,01_dubbo,dubbo-metadata,"[['service', 'map', 'param', 'key', 'url', 'de...","[['service', 'map', 'param', 'key', 'url', 'de...","[['service', 'see', 'issue', 'sample', 'projec..."
9,01_dubbo,dubbo-metrics,"[['metric', 'map', 'method', 'atomic', 'invoca...","[['metric', 'map', 'method', 'atomic', 'invoca...","[['service', 'see', 'issue', 'sample', 'projec..."


## Show annotated modules

In [7]:
# Read the module annotations and narrow them, in one chained expression, to
# the project/module identifiers, the top label and the full label list.
anno_df = pd.read_csv("module_annotation.csv")[
    ["project", "module", "top", "labels"]
]
anno_df.head()

Unnamed: 0,project,module,top,labels
0,dubbo,dubbo-test,software testing,"['software testing', 'unit testing', 'file sys..."
1,skywalking,apm-protocol,server,"['server', 'plot', 'instant messaging', 'websi..."
2,flink,flink-libraries,information processing,"['information processing', 'database', 'big da..."
3,rocketmq,remoting,smart contract,"['smart contract', 'penetration test', 'back e..."
4,shardingsphere,examples,database,"['database', 'instant messaging', 'data bindin..."


## LLM Prompts to do matching

In [18]:
# Worked example: the "apm-protocol" module (it belongs to skywalking, not dubbo).
# Select the topic rows whose module name matches exactly.
topics_sub_df = topics_df.loc[topics_df["module"].eq("apm-protocol")]
topics_sub_df

Unnamed: 0,project_name,module,code_topics,test_topics,doc_topics
17,02_skywalking,apm-protocol,"[['command', 'serializable', 'deserializable',...","[['command', 'serializable', 'deserializable',...","[['trace', 'support', 'metric', 'mail', 'nativ..."


In [19]:
# Pick the annotation rows for the same example module ("apm-protocol").
anno_sub_df = anno_df.loc[lambda frame: frame["module"] == "apm-protocol"]
anno_sub_df

Unnamed: 0,project,module,top,labels
1,skywalking,apm-protocol,server,"['server', 'plot', 'instant messaging', 'websi..."


In [23]:
# Show full cell contents so the complete topic and label lists are visible.
# This is a global display setting and stays in effect for later cells.
pd.options.display.max_colwidth = None

# Join topics with annotations. No key is given, so pandas joins on the columns
# the two frames share; with the column selections made when they were loaded,
# that is only "module". The annotation frame's "project" column duplicates
# "project_name", so it is dropped.
merged_df = pd.merge(topics_sub_df, anno_sub_df)
module_df = merged_df.drop(columns=["project"])
module_df

Unnamed: 0,project_name,module,code_topics,test_topics,doc_topics,top,labels
0,02_skywalking,apm-protocol,"[['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network'], ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace'], ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command'], ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump'], ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize'], ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid'], ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample'], ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']]","[['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network'], ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace'], ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command'], ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump'], ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize'], ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid'], ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample'], ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']]","[['trace', 'support', 'metric', 'mail', 'native', 'pipeline', 'telemetry', 'agent', 'list', 'distribute'], ['follow', 'support', 'log', 'list', 'telemetry', 'trace', 'pipeline', 
'native', 'metric', 'mail'], ['mail', 'pipeline', 'support', 'follow', 'native', 'metric', 'trace', 'telemetry', 'code', 'system'], ['metric', 'mail', 'support', 'follow', 'trace', 'pipeline', 'telemetry', 'native', 'agent', 'log'], ['support', 'mail', 'telemetry', 'metric', 'agent', 'trace', 'native', 'follow', 'performance', 'list'], ['mail', 'support', 'telemetry', 'native', 'metric', 'follow', 'trace', 'code', 'pipeline', 'performance'], ['mail', 'metric', 'telemetry', 'support', 'pipeline', 'log', 'distribute', 'follow', 'trace', 'performance'], ['metric', 'support', 'mail', 'pipeline', 'telemetry', 'native', 'log', 'follow', 'performance', 'distribute'], ['support', 'mail', 'metric', 'list', 'distribute', 'pipeline', 'follow', 'native', 'performance', 'log']]",server,"['server', 'plot', 'instant messaging', 'website', 'file system', 'web server', 'database', 'command-line interface', 'World Wide Web', 'package management system', 'application performance management', 'client', 'web service', 'File Transfer Protocol', 'shell tool', 'user interface', 'telecommunications network', 'HTTP server', 'computer configuration', 'data binding', 'big data', 'extract, transform, load', 'object detection', 'data', 'security', 'web application', 'regular expression', 'data structure', 'web application security', 'smart contract', 'statistics', 'machine translation', 'social network', 'pattern matching', 'network monitoring', 'microservices', 'network security', 'time series', 'continuous integration', 'analytics', 'automation', 'object–relational mapping', 'HTTP client', 'neural machine translation', 'password manager', 'back end', 'operating system', 'WebSocket', 'embedded system', 'game server', 'font', 'evolutionary algorithm', 'data visualization', 'face detection', 'data science', 'facial recognition system', 'finance', 'distributed database', 'front end', 'database management', 'functional programming', 'data compression', 'game engine', 'game', 'genetic algorithm', 
'genomics', 'geographic information system', 'data mining', 'engineering', 'encryption', 'debugger', 'distributed computing', 'digital security', 'data analysis', 'distributed multimedia communications platform', 'distributed system', 'documentation', 'digital image processing', 'digital audio', 'design', 'e-commerce', 'dependency injection', 'education', 'electronic trading platform', 'deep neural network', 'deep learning', 'email', 'debugging', 'word embedding', 'computer data storage', 'cryptography', 'automated machine learning', 'augmented reality', 'audio signal processing', 'audio player software', 'artificial neural network', 'artificial intelligence', 'anomaly detection', 'animation', 'algorithmic trading', 'algorithm', 'Web Components', 'Semantic Web', 'Reverse engineering', 'RNA sequencing', 'Q-learning', 'Parser combinator', 'Naive Bayes classifier', 'Malware Analysis', 'Kalman filter', 'Image analysis', 'IRC bot', 'DevOps', 'Containerization', 'Bidirectional recurrent neural networks', 'Bayesian inference', '3D modeling', 'authentication', 'autonomous driving', 'cryptocurrency', 'backup', 'convolutional neural network', 'continuous deployment', 'continuous delivery', 'content management system', 'computer vision', 'computer science', 'computer programming', 'computer graphics', 'geometry', 'computer benchmarking', 'computational science', 'computational biology', 'compiler', 'code generation', 'cluster analysis', 'cloud computing', 'classification', 'chatbot', 'camera', 'business dashboard', 'blockchain', 'bitcoin', 'bioinformatics', 'benchmark', 'banking industry', 'geographic information', 'information extraction', 'graph algorithm', 'simulation', 'static site generator', 'static program analysis', 'static program analysis tool', 'speech synthesis', 'speech recognition', 'sorting algorithm', 'software testing', 'software engineering', 'software development', 'software design pattern', 'software architecture', 'simulator', 'signal processing', 'stream 
processing', 'serverless computing', 'serialization', 'sequencing', 'sentiment analysis', 'semi-supervised learning', 'search', 'search engine', 'science', 'schema migration', 'scheduler', 'routing', 'router', 'stock market', 'streaming media', 'rendering', 'video game', 'web scraping', 'web development', 'web crawler', 'web browser', 'web browser engine', 'visualization', 'virtualization', 'virtual reality', 'virtual machine', 'video', 'video tracking', 'video processing', 'video game development', 'support vector machine', 'validator', 'unit testing', 'translation', 'transfer learning', 'topic modeling', 'text processing', 'text mining', 'text editor', 'text classification', 'test automation', 'syntax highlighting', 'synchronization', 'robotics', 'reinforcement learning', 'graph database', 'information retrieval', 'logistic regression model', 'logging', 'linear regression', 'linear algebra', 'lexical analysis', 'language model', 'knowledge graph', 'kernel', 'interpreter', 'internet of things', 'internet bot', 'integrated development environment', 'information processing', 'machine learning', 'image', 'image segmentation', 'image recognition', 'image editing', 'image compression', 'image classification', 'image captioning', 'hyperparameter optimization', 'home automation', 'hacking tool', 'graphical user interface', 'graph', 'long short-term memory', 'malware', 'regression analysis', 'object-oriented programming', 'recurrent neural network', 'recommender system', 'real-time computing', 'reactive programming', 'random forest', 'question answering', 'quantum computer', 'productivity', 'physics', 'penetration test', 'parsing', 'optical character recognition', 'neural network', 'mathematical finance', 'network analysis', 'natural language', 'natural language understanding', 'natural language processing', 'named-entity recognition', 'music', 'multiplayer game', 'mobile computing', 'mobile application development', 'middleware', 'mathematics', 'mathematical 
optimization', '3D computer graphics']"


In [11]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.llms import LlamaCpp
from langchain.memory import ConversationSummaryMemory
from langchain.prompts import PromptTemplate

In [12]:
import os

# Location of the local GGUF model file. Set the LLAMA_MODEL_PATH environment
# variable to point at your own copy (e.g. a codellama-13b.Q4_K_M.gguf file);
# when it is unset, the original author's path is used so behaviour is unchanged.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "C:\\Users\\biadge\\llama-2-7b.Q4_K_M.gguf",
)

# Stream generated tokens to stdout while the model is producing them.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    model_path=model_path,
    temperature=0,  # no sampling randomness requested
    n_ctx=5000,  # context size; the merged label list above is long
    n_gpu_layers=1,
    n_batch=512,
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [13]:
# Toy inputs for a quick sanity check of the matching prompt. Both are plain
# strings written in the same bracketed-list notation as the prompt's example.
concepts = "[[run, walk, jump], [happy, sad]]"  # two concepts, each a list of terms
domains = "[emotion, action]"  # candidate domains the concepts are matched against

In [18]:
from langchain.chains.question_answering import load_qa_chain

# Prompt
# This is an f-string, so `concepts` and `domains` are substituted immediately
# when this cell runs; re-run the cell after changing either variable.
# The worked example must only use labels from its own domain list
# ([emotion, description]), otherwise the model is shown that answering with
# an unlisted domain is acceptable.
template = f"""You are a helpful assistant tasked to match terms to the domain it should belong.
There may be terms which may belong to more than one domain, but that's ok, you can assume they only belong to one.
You will be given a set of terms in lists that we shall call concepts. Example of 2 concepts are shown below:

[[happy, angry, sad], [high, tall, short, fat]]

There are two concepts here.

This is an example of domains:

[emotion, description]

There are two domains here.

Now, you need to match these. Your answer should be like the following, shown in backticks:

```
[happy, angry, sad] = emotion
[high, tall, short, fat] = description
```

Don't output anything else, try to match as much as you can. Not all concepts can be matched.

Now, these are the concepts I need you to match:

{concepts}

And, these are the domains:

{domains}

Helpful Answer:"""

In [19]:
# Send the fully rendered prompt to the local model. The completion string it
# returns is the cell's displayed value.
llm(template)

Llama.generate: prefix-match hit




```
[run, walk, jump] = action
[happy, sad] = emotion
```


'\n\n```\n[run, walk, jump] = action\n[happy, sad] = emotion\n```\n'