# Start local milvus
```sh
cd ops
docker compose up -d
```
[WebUI](http://localhost:9091/webui)

# download huggingface model and dataset
1. https://hf-mirror.com/Qwen/Qwen3-Embedding-0.6B
1. https://hf-mirror.com/datasets/bzb2023/Zhihu-KOL-More-Than-100-Upvotes


```sh
cd notebooks/milvus
export HF_ENDPOINT=https://hf-mirror.com
./hfd.sh Qwen/Qwen3-Embedding-0.6B
./hfd.sh bzb2023/Zhihu-KOL-More-Than-100-Upvotes --dataset
```

In [1]:
from pymilvus import connections, utility

collection_name = "example"
connections.connect(
    host="127.0.0.1",
    port="19530"
)

utility.drop_collection(collection_name=collection_name)

In [2]:
from pymilvus import FieldSchema, CollectionSchema, DataType

DIMENSION = 1024

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=2048),
    FieldSchema(name="question_embedding", dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
    FieldSchema(name="answer", dtype=DataType.VARCHAR, max_length=65535),
    FieldSchema(name="answer_embedding", dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
    FieldSchema(name="upvotes", dtype=DataType.INT64),
]

schema = CollectionSchema(fields=fields)

In [3]:
from pymilvus import Collection

collection = Collection(name=collection_name, schema=schema)

index_params = {
    "index_type": "IVF_FLAT",
    "metric_type":"L2",
    "params": {"nlist": 4}, 
}

collection.create_index(field_name="question_embedding", index_params=index_params)
collection.create_index(field_name="answer_embedding", index_params=index_params)

collection.load()

In [4]:
from sentence_transformers  import SentenceTransformer
from datasets import load_from_disk

import os

pwd = %pwd

transformerPath = os.path.join(pwd, "Qwen3-Embedding-0.6B")
datasetPath = os.path.join(pwd, "Zhihu-KOL-More-Than-100-Upvotes")

transformer = SentenceTransformer(transformerPath)
dataset = load_from_disk(datasetPath)


In [5]:
from IPython.display import display

df = dataset.to_pandas().query("`RESPONSE`.str.len() < 4096 & upvotes > 1000")
display(df.size)
display(df[:10])

417775

Unnamed: 0,INSTRUCTION,RESPONSE,SOURCE,METADATA,upvotes
6,你对青年警察有什么建议呢？,看有这么多慌张的弟弟妹妹。再多数几句。 1.遴选。我所在单位之前遴选到市局是不需要工作年限的...,Zhihu,"{""question_id"": 405252776.0, ""answer_id"": 1942...",2292
9,如果你魂穿在了戚夫人身上，时间点是刘邦死后，该如何自保，不被做成人彘？,很简单。其实戚夫人在刘邦死后要保命本来就是简单难度。 首先吕雉本来是一个贤德的大妇，对于刘邦...,Zhihu,"{""question_id"": 484727306.0, ""answer_id"": 2142...",3896
11,为什么老一辈会认为健身出来是畸形身材？,正常的健身： 打拳的 打球的 游泳的 跑步的 踢球的 畸形的健身： 老一辈觉得怎么样我不...,Zhihu,"{""question_id"": 410923579.0, ""answer_id"": 2804...",3971
15,为什么 苹果Mac 电脑不粘贴 Intel 标志?,谢邀。尽管现在好几家强势笔记本厂商没有贴Intel标志，但苹果Mac确实是第一个这么做的。这...,Zhihu,"{""question_id"": 338039138.0, ""answer_id"": 8019...",1187
17,为什么有些胖子明明瘦下来会很好看，却仍然在肥胖的道路上越走越远？,曾经，我就是一个大胖子 评论里120多斤的算啥胖啊，在我看也就微胖略丰满吧 爆个照（最胖时期...,Zhihu,"{""question_id"": 349224161.0, ""answer_id"": 8969...",4725
18,地球资源使用已经超过年预估值了，为什么人类依旧我行我素?,因为资源不足本来就是一个“伪”属性大于80%的命题。 几百年前的古人在用什么资源，现在你在用...,Zhihu,"{""question_id"": 339126201.0, ""answer_id"": 7805...",1304
25,《演员请就位》中李少红导演是不是看不上郭敬明？,​关于郭敬明，有几个「名场面」。 第一次是2005年，刚成名的他伴随抄袭的争议出现在央视...,Zhihu,"{""question_id"": 351365853.0, ""answer_id"": 8767...",8305
27,打野球的时候，哪种对手最让你绝望？,以我在各个野球场混迹十余年的经验来讲，总结了一下几种人让人绝望至极，想抱着篮球回家。 1.运...,Zhihu,"{""question_id"": 352205881.0, ""answer_id"": 8720...",1867
28,怎么样才能在二十天内减肥16斤？,20 天内瘦16斤，不难做到！！！ 自报家门先，上班族，体重最高时150斤，最低102斤，目...,Zhihu,"{""question_id"": 352940118.0, ""answer_id"": 9685...",8115
29,为什么人们宁可相信传入中国不到两百年的现代医学，却不肯相信已经在中国传承几千年的中医？,这个也是传承了几千年的博大精深,Zhihu,"{""question_id"": 353112251.0, ""answer_id"": 2035...",1166


In [None]:
milvus_input = []
for index, row in df.iterrows():
    entry = {}
    question = row["INSTRUCTION"]
    entry["question_embedding"] = transformer.encode(question)
    entry["question"] = question

    answer = row["RESPONSE"]
    entry["answer_embedding"] = transformer.encode(answer)
    entry["answer"] = answer

    entry["upvotes"] = row["upvotes"]
    
    milvus_input.append(entry)
    if len(milvus_input) == 100:
        collection.insert(milvus_input)
        collection.flush()
        milvus_input = []
        display(f"{index+1}/{df.size}")
if len(milvus_input) > 0:
    collection.insert(milvus_input)
    collection.flush()

'288/417775'

'607/417775'

'896/417775'

'1237/417775'

'1560/417775'

'1901/417775'

'2212/417775'

'2649/417775'

'3077/417775'

'3364/417775'

'3654/417775'

'3975/417775'