# Semantic Code Search

### Requirements

##### pymilvus==2.1.1

In [1]:
# Connect to Milvus
import os

from pymilvus import connections, utility

# Server address. The defaults are the values this notebook was written
# against; set MILVUS_HOST / MILVUS_PORT in the environment to point the
# notebook at a different server without editing this cell.
MILVUS_HOST = os.environ.get('MILVUS_HOST', '192.168.103.57')
MILVUS_PORT = os.environ.get('MILVUS_PORT', '19530')
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

In [2]:
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType

TABLE_NAME = "text_collection"
field_name = "embedding"

# Start from a clean slate: any existing collection with this name is dropped,
# so re-running this cell discards previously inserted vectors.
if utility.has_collection(TABLE_NAME):
    utility.drop_collection(TABLE_NAME)

# Schema: an auto-generated INT64 primary key plus one 512-dimensional float vector.
# The keyword is `description`; a misspelled keyword is swallowed by **kwargs and
# is not applied as the field description.
pk = FieldSchema(name="id", dtype=DataType.INT64, description='ids', is_primary=True, auto_id=True)
field = FieldSchema(name=field_name, dtype=DataType.FLOAT_VECTOR, description='embedding vectors', dim=512)
schema = CollectionSchema(fields=[pk, field], description="code search")
collection = Collection(name=TABLE_NAME, schema=schema)

# IVF_SQ8 index scored by inner product ("IP").
# NOTE(review): IP only equals cosine similarity if the stored code embeddings
# are L2-normalized like the queries are — confirm in get_code_embed.sh.
index_param = {
    "metric_type": "IP",
    "index_type": "IVF_SQ8",
    "params": {"nlist": 1024},
}
collection.create_index(field_name=field_name, index_params=index_param)

Status(code=0, message='')

## Get code embeddings and save them to disk (dataset: codeSearchNet-py-test)

In [None]:
# Run the get_code_embed.sh script with bash.
# NOTE(review): presumably this writes the code embeddings that are loaded from disk afterwards — confirm the output path inside the script.
! bash get_code_embed.sh

#### Load code embeddings from disk

In [3]:
import os
import numpy as np
# Path of the saved code embeddings (.npy), relative to the notebook's working directory.
code_embed_path = '../../tmp_ex0/code.embeddings.npy'
# Load the full array into memory.
# NOTE(review): presumably shaped (num_snippets, 512) to match the collection's vector field — confirm.
code_embed = np.load(code_embed_path)

In [4]:
# Split the embedding array into one vector per entity. The insert payload is
# column-oriented: a list holding one column (the vectors), since the primary
# key is auto-generated by Milvus.
em = [vector for vector in code_embed]
mr = collection.insert(data=[em])

# Primary keys assigned by Milvus, in insertion order.
ids = mr.primary_keys

In [5]:
# Build a lookup from Milvus primary key to the code snippet it was created from.
# The i-th line of the code file is paired with the i-th primary key returned by
# the insert, so both sequences must have the same length.
code_path = '../../data/codeSearchNet-py/test/code.original_subtoken'
with open(code_path) as f:
    lines = f.readlines()

assert len(lines) == len(ids), (
    f'{len(lines)} code lines but {len(ids)} inserted ids; '
    'the embeddings and the code file are out of sync'
)

# A comprehension keeps the loop variables out of the notebook namespace; the
# previous loop variable was named `id`, which shadowed the builtin id() for
# every later cell.
id2code = {entity_id: line.strip() for entity_id, line in zip(ids, lines)}

In [6]:
# Build a lookup from docstring to its code snippet (the "ground truth" for a
# query that is a docstring from the dataset). Both files are line-aligned.
doc_path = '../../data/codeSearchNet-py/test/doc.original'
with open(code_path) as f1, open(doc_path) as f2:
    code_lines = f1.readlines()
    doc_lines = f2.readlines()

assert len(code_lines) == len(doc_lines)

# When two docstrings are identical after stripping, the later snippet wins.
doc2code = {
    doc_line.strip(): code_line.strip()
    for code_line, doc_line in zip(code_lines, doc_lines)
}

if len(doc_lines) > len(doc2code):
    print('There are duplicate docs, keep only one of them!')

### search

In [15]:
import towhee

# Natural-language queries to look up in the code collection.
search_text = [
    'Check for message on subscribed channels and write to xcom the message with key message',
    'Get a pandas dataframe from a sql query .',
]

# Embed each query with the CLIP text encoder, then normalize the vectors.
# No backslash continuations are needed inside the parentheses.
search_embeds = (
    towhee.dc(search_text)
    .image_text_embedding.clip(model_name='clip_vit_b32', modality='text')
    .tensor_normalize()
)

In [None]:
# scores = np.dot(code_embed, search_embeds[0])
# scores.shape

In [None]:
# sorted(zip(scores.tolist(), ids), reverse=True)[:10]

In [16]:
# Search parameters: inner-product metric, probing 10 IVF clusters per query.
search_params = {"metric_type": "IP", "params": {"nprobe": 10}}

# The collection has to be loaded before it can be searched.
collection.load()

# Convert the query embeddings to plain Python lists under a NEW name.
# Rebinding `search_embeds` to the converted lists made this cell fail when
# re-run, because list objects have no .tolist().
search_vectors = [embed.tolist() for embed in search_embeds]

# One result set per query, each with up to 8 hits.
results = collection.search(search_vectors, field_name, param=search_params, limit=8, expr=None)

In [19]:
import pandas as pd

# For every query, print the ground-truth snippet (when the query is a docstring
# from the dataset) followed by a table of the retrieved snippets and their scores.
for query, hits in zip(search_text, results):
    print('search text: ', query)
    print('-'*80)
    print('ground truth:')
    # A free-form query has no entry in doc2code; show a placeholder instead of
    # aborting the whole report with a KeyError.
    print(doc2code.get(query, '<no ground truth: query is not a docstring from the dataset>'))
    print('-'*80)
    table = {
        'search results': [id2code[hit.id] for hit in hits],
        'distance': [hit.distance for hit in hits],
    }
    df = pd.DataFrame(table)
    print(df)
    print('='*80+'\n')

search text:  Check for message on subscribed channels and write to xcom the message with key message
--------------------------------------------------------------------------------
ground truth:
def poke self context self log info 'Redis Pub Sub Sensor checking for message on channels %s' self channels message self pubsub get message self log info 'Message %s from channel %s' message self channels if message and message [ 'type' ] 'message' context [ 'ti' ] xcom push key 'message' value message self pubsub unsubscribe self channels return True return False
--------------------------------------------------------------------------------
                                      search results  distance
0  def poke self context self log info 'Redis Pub...  0.177596
1  def unserialize self msg list content True cop...  0.172991
2  def subscribe To Device Commands self type Id ...  0.170548
3  def sub channel self if self sub channel is No...  0.170207
5  def publish pyin self code parent ex