In [1]:
from pymilvus import MilvusClient, DataType

In [2]:
# 连接到 milvus
client = MilvusClient(uri='http://localhost:19530')

In [3]:
# 创建景点表
schema = client.create_schema()

# 主键
schema.add_field(
    field_name="id",
    is_primary=True,
    auto_id=True,
    datatype=DataType.INT64
)

schema.add_field(
    field_name="site_idx",
    datatype=DataType.INT64
)

schema.add_field(
    field_name="site_embed",
    datatype=DataType.FLOAT_VECTOR,
    dim=64
)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'site_idx', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'site_embed', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 64}}], 'enable_dynamic_field': False}

In [4]:
client.has_collection(collection_name="tb_site_embed")

False

In [8]:
client.describe_collection(collection_name="tb_site_embed")

{'collection_name': 'tb_site_embed',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'site_idx',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'site_embed',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 64}}],
 'functions': [],
 'aliases': [],
 'collection_id': 463088652533107462,
 'consistency_level': 2,
 'properties': {'timezone': 'UTC'},
 'num_partitions': 1,
 'enable_dynamic_field': False,
 'created_timestamp': 463088741075124242,
 'update_timestamp': 463088741075124242}

In [5]:
# 创建用户表
user_schema = client.create_schema()

user_schema.add_field(
    field_name="id",
    is_primary=True,
    auto_id=True,
    datatype=DataType.INT64
)

user_schema.add_field(
    field_name="user_id",
    datatype=DataType.INT64
)

user_schema.add_field(
    field_name="user_embed",
    datatype=DataType.FLOAT_VECTOR,
    dim=64
)


{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'user_id', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'user_embed', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 64}}], 'enable_dynamic_field': False}

In [28]:
if client.has_collection(collection_name="tb_site_embed"):
    client.drop_collection(collection_name="tb_site_embed")

In [12]:
if client.has_collection(collection_name="tb_user_embed"):
    client.drop_collection(collection_name="tb_user_embed")

In [6]:
print(client.has_collection(collection_name="tb_site_embed"))
print(client.has_collection(collection_name="tb_user_embed"))

False
False


In [7]:
site_index_params = client.prepare_index_params()

site_index_params.add_index(
    field_name="site_embed",
    index_name="site_embed_index",
    index_type="AUTOINDEX",
    metric_type="IP"
)

client.create_collection(
    collection_name="tb_site_embed",
    schema=schema,
    index_params=site_index_params
)

In [8]:
user_index_params = client.prepare_index_params()

user_index_params.add_index(
    field_name="user_embed",
    index_name="user_embed_index",
    index_type="AUTOINDEX",
    metric_type="IP"
)

client.create_collection(
    collection_name="tb_user_embed",
    schema=user_schema,
    index_params=user_index_params
)

In [23]:
client.describe_collection(collection_name="tb_site_embed")

{'collection_name': 'tb_site_embed',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'site_idx',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'site_embed',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 64}}],
 'functions': [],
 'aliases': [],
 'collection_id': 463071564295321901,
 'consistency_level': 2,
 'properties': {'timezone': 'UTC'},
 'num_partitions': 1,
 'enable_dynamic_field': False,
 'created_timestamp': 463072261962465298,
 'update_timestamp': 463072261962465298}

In [17]:
client.describe_collection(collection_name="tb_user_embed")

{'collection_name': 'tb_user_embed',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'user_id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'user_embed',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 64}}],
 'functions': [],
 'aliases': [],
 'collection_id': 463071564295322436,
 'consistency_level': 2,
 'properties': {'timezone': 'UTC'},
 'num_partitions': 1,
 'enable_dynamic_field': False,
 'created_timestamp': 463072287809863697,
 'update_timestamp': 463072287809863697}

In [11]:
import numpy as np

In [12]:
vec = np.random.randn(64)

res = client.search(
    collection_name="tb_site_embed",
    anns_field="site_embed",
    data=[vec],
    limit=3,
    output_fields=["site_idx"],
    search_params={"metric_type": "IP"}
)

In [13]:
res[0]

[]

In [32]:
import pandas as pd

In [33]:
tb_sites = pd.read_csv('../dataset/tb_sites.csv')
tb_sites

Unnamed: 0,id,name,score,address,hot_degree,introduce,open_time,phone,images,site_idx
0,1,故宫博物院,4.8,北京市东城区景山前街4号,10.0,故宫博物院又称紫禁城，是明、清两代的皇宫，也是古老中国的标志和象征。当你置身于气派规整的高墙...,今日已闭园；明天08:30-16:30开放（15:30停止入园）,票务咨询：4009501925,https://dimg04.c-ctrip.com/images/100k0q000000...,1
1,2,八达岭长城,4.7,北京市延庆区G6京藏高速58号出口,10.0,八达岭长城号称天下九塞之一，风光集巍峨险峻、秀丽苍翠于一体，是明长城景色中的精华。“不到长城...,今日已闭园；明天07:30-16:00开放（16:00停止入园）,票务咨询：4008180088,https://dimg04.c-ctrip.com/images/1lo5h12000nz...,2
2,3,外滩,4.8,上海市黄浦区中山东一路北至外白渡桥、南至十六铺码头,10.0,外滩位于黄浦江畔，全长约1.5公里，是上海城市象征意义的景点，风格迥异的万国建筑群和浦江夜景...,全年全天开放,,https://dimg04.c-ctrip.com/images/10040m000000...,3
3,4,秦始皇帝陵博物院(兵马俑),4.7,西安市临潼区秦陵北路,10.0,秦始皇帝陵博物院(兵马俑)又称兵马俑、秦兵马俑。它是秦始皇陵的陪葬坑，与秦始皇陵一同组成了：...,今日已闭园；明天08:30-18:00开放（16:30停止入园）,票务咨询：029-81399127,https://dimg04.c-ctrip.com/images/0104012000f6...,4
4,5,香港迪士尼乐园,4.7,香港大屿山香港迪士尼乐园度假区,10.0,香港迪士尼乐园20周年奇妙派对\n在香港迪士尼乐园，来一场奇妙的派对！一般你喜爱的迪士尼朋友...,今日已闭园；明天10:00-20:30开放,票务咨询：+852-1830830,https://dimg04.c-ctrip.com/images/1lo0l12000pi...,5
...,...,...,...,...,...,...,...,...,...,...
995,996,京杭大运河杭州景区,4.3,浙江省杭州市拱墅区环城北路226号,6.8,京杭大运河杭州景区是京杭大运河的最南端，在千百年的历史中，融汇了中国南北各地的特色物产、饮食...,全年全天开放,票务咨询：0571-85190851,https://dimg04.c-ctrip.com/images/100u1f000001...,996
996,997,金山,4.6,江苏省镇江市润州区和平路街道金山路62号,6.8,金山位于镇江市西北，海拔43.7米，距离镇江市中心仅3公里。原先是长江中心的一个岛屿，后来长...,今日已闭园；明天08:00-16:30开放（16:00停止入园）,票务咨询：0511-85512992,https://dimg04.c-ctrip.com/images/0106612000f6...,997
997,998,大同市博物馆,4.4,山西省大同市平城区文瀛湖街道太和路506号,6.8,大同市博物馆始建于1958年，是一座综合性地志博物馆2020年被评为“国家一级博物馆”。大同...,今日已闭园；明天09:00-17:00开放（16:30停止入园）,票务咨询：0352-2303518,https://dimg04.c-ctrip.com/images/10080z000000...,998
998,999,徐州汉文化景区,4.6,江苏省徐州市云龙区兵马俑路1号,6.8,徐州汉文化景区距离徐州市区很近，其中徐州汉兵马俑博物馆、水下兵马俑博物馆、汉画长廊、1995...,冬季 08:30-17:00开放（16:30停止售票），夏季 08:30-17:30开放（1...,票务咨询：0516-83661269 ; 0516-83167053,https://dimg04.c-ctrip.com/images/0105512000h4...,999


In [37]:
introduce = tb_sites['introduce']

In [38]:
introduce[introduce.isna()].index

Index([], dtype='int64')

In [36]:
tb_sites['introduce'] = tb_sites['introduce'].fillna('')

In [39]:
empty_index = tb_sites[tb_sites['introduce'] == ''].index

In [40]:
empty_index

Index([198, 220, 365, 468, 552, 606, 635, 672, 687, 710, 750, 756, 825, 925,
       943, 953, 967, 991],
      dtype='int64')

In [43]:
mask = tb_sites["name"].str.contains("演唱会", na=False, case=False)

In [44]:
singer_index = tb_sites[mask].index

In [45]:
singer_index

Index([195, 287, 299, 365, 446, 479, 490, 552, 606, 622, 627, 667, 710, 756,
       779, 879, 977, 988],
      dtype='int64')

In [48]:
idx_to_del = empty_index.tolist() + singer_index.tolist()

In [49]:
idx_to_del

[198,
 220,
 365,
 468,
 552,
 606,
 635,
 672,
 687,
 710,
 750,
 756,
 825,
 925,
 943,
 953,
 967,
 991,
 195,
 287,
 299,
 365,
 446,
 479,
 490,
 552,
 606,
 622,
 627,
 667,
 710,
 756,
 779,
 879,
 977,
 988]

In [50]:
len(idx_to_del)

36

In [51]:
idx_to_del = list(set(idx_to_del))

In [52]:
idx_to_del, len(idx_to_del)

([779,
  667,
  925,
  287,
  672,
  552,
  299,
  687,
  943,
  953,
  825,
  446,
  195,
  198,
  710,
  967,
  977,
  468,
  220,
  988,
  606,
  991,
  479,
  490,
  365,
  750,
  622,
  879,
  627,
  756,
  635],
 31)

In [53]:
site_idx_to_del = [ idx+1 for idx in idx_to_del ]
site_idx_to_del, len(site_idx_to_del)

([780,
  668,
  926,
  288,
  673,
  553,
  300,
  688,
  944,
  954,
  826,
  447,
  196,
  199,
  711,
  968,
  978,
  469,
  221,
  989,
  607,
  992,
  480,
  491,
  366,
  751,
  623,
  880,
  628,
  757,
  636],
 31)

In [54]:
del_list_str = ','.join([ str(idx) for idx in site_idx_to_del ])
del_list_str

'780,668,926,288,673,553,300,688,944,954,826,447,196,199,711,968,978,469,221,989,607,992,480,491,366,751,623,880,628,757,636'

In [55]:
del_expr = f"site_idx in [{del_list_str}]"
del_expr

'site_idx in [780,668,926,288,673,553,300,688,944,954,826,447,196,199,711,968,978,469,221,989,607,992,480,491,366,751,623,880,628,757,636]'

In [59]:
print(del_expr)

client.delete(
    collection_name="tb_site_embed",
    filter=del_expr
)

site_idx in [780,668,926,288,673,553,300,688,944,954,826,447,196,199,711,968,978,469,221,989,607,992,480,491,366,751,623,880,628,757,636]


{'delete_count': 31, 'cost': 0}

In [5]:
import numpy as np

In [6]:
data = { 'user_id': 0, 'user_embed': np.random.uniform(0, 1, 64) }

resp = client.insert(
    collection_name='tb_user_embed',
    data=data
)

In [7]:
resp['insert_count']

1

In [8]:
resp = client.query(
    collection_name='tb_user_embed',
    filter='user_id == 0',
    output_fields=['user_id', 'user_embed']
)

In [13]:
resp[0]['user_id']

0

In [34]:
resp = client.delete(
    collection_name='tb_user_embed',
    filter='user_id == 0',
)

In [16]:
resp

{'delete_count': 1, 'cost': 0}

In [14]:
a = [1, 2, 3, 4, 5, 6]
b = [2, 3, 4, 5, 6, 7]
c = zip(a, b)

In [16]:
list(c)

[(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]

In [17]:
import pandas as pd

In [18]:
site_extra_infos = pd.read_csv("../dataset/tb_site_extra.csv")
site_extra_infos

Unnamed: 0,id,site_idx,price,positive_comment_count,negative_comment_count
0,1,1,0,115456,2063
1,2,2,35,39623,1154
2,3,3,0,128468,668
3,4,4,120,83931,3747
4,5,5,535,37770,1046
...,...,...,...,...,...
995,996,996,50,749,67
996,997,997,158,3482,85
997,998,998,0,478,46
998,999,999,380,938,28


In [19]:
positive_comment_infos = site_extra_infos['positive_comment_count']
negative_comment_infos = site_extra_infos['negative_comment_count']

In [21]:
positive_comment_infos[positive_comment_infos.isna()].index

Index([], dtype='int64')

In [22]:
negative_comment_infos[negative_comment_infos.isna()].index

Index([], dtype='int64')

In [23]:
comment_infos = positive_comment_infos + negative_comment_infos 

In [25]:
comment_infos.describe()

count      1000.000000
mean       6145.629000
std       12367.421854
min           0.000000
25%        1019.000000
50%        2494.500000
75%        5739.500000
max      175822.000000
dtype: float64

In [26]:
comment_infos[comment_infos == 175822].index

Index([6], dtype='int64')

In [27]:
positive_comment_infos.describe()

count      1000.000000
mean       5911.086000
std       12031.850716
min           0.000000
25%         964.250000
50%        2339.500000
75%        5500.000000
max      170427.000000
Name: positive_comment_count, dtype: float64

In [28]:
positive_comment_infos[positive_comment_infos == 170427].index

Index([6], dtype='int64')

In [30]:
site_extra_infos.iloc[6]

id                             7
site_idx                       7
price                        399
positive_comment_count    170427
negative_comment_count      5395
Name: 6, dtype: int64

In [31]:
max([1,2,3,4,5,6])

6

In [35]:
a = [1, 2, 3, 4, 5, 6]
a = [ s / 6 for s in a ]

In [37]:
a

[0.16666666666666666,
 0.3333333333333333,
 0.5,
 0.6666666666666666,
 0.8333333333333334,
 1.0]

In [39]:
a = [1, 2, 3]
b = [2, 3, 4]
c = [3, 4, 5]

d = [ i+ j+ k for i, j, k in zip(a, b, c)]

In [40]:
d

[6, 9, 12]

In [3]:
resp = client.query(
    collection_name="tb_site_embed",
    filter="site_idx == 1",
    output_fields=["site_idx", "site_embed"]
)

In [4]:
resp

data: ["{'site_idx': 1, 'site_embed': [0.0963556319475174, 0.06427876651287079, 0.10647214949131012, -0.14588391780853271, 0.05274094641208649, -0.00748829822987318, -0.002486209850758314, 0.13815192878246307, 0.20977088809013367, 0.05330279469490051, 0.1723252534866333, 0.08137103915214539, 0.03111337125301361, 0.09734208136796951, -0.13279122114181519, 0.0890011191368103, 0.15920473635196686, -0.17387305200099945, 0.04252069070935249, 0.09736600518226624, -0.03538454696536064, -0.11707425117492676, 0.13496875762939453, 0.16400468349456787, -0.1914016306400299, 0.16022706031799316, -0.13152462244033813, -0.07476786524057388, -0.05607331171631813, 0.0937274694442749, -0.17885710299015045, -0.0655229315161705, -0.1450410783290863, 0.07949322462081909, 0.17118096351623535, -0.05645675212144852, 0.09731192886829376, -0.11332552134990692, -0.07398656755685806, 0.052107710391283035, -0.0730527713894844, 0.0347457192838192, -0.15784336626529694, -0.16664336621761322, -0.1518060863018036, 0.0

In [5]:
cur_site_idx = resp[0]['site_idx']
cur_site_embed = resp[0]['site_embed']

In [6]:
print(cur_site_idx)
print(cur_site_embed)

1
[0.0963556319475174, 0.06427876651287079, 0.10647214949131012, -0.14588391780853271, 0.05274094641208649, -0.00748829822987318, -0.002486209850758314, 0.13815192878246307, 0.20977088809013367, 0.05330279469490051, 0.1723252534866333, 0.08137103915214539, 0.03111337125301361, 0.09734208136796951, -0.13279122114181519, 0.0890011191368103, 0.15920473635196686, -0.17387305200099945, 0.04252069070935249, 0.09736600518226624, -0.03538454696536064, -0.11707425117492676, 0.13496875762939453, 0.16400468349456787, -0.1914016306400299, 0.16022706031799316, -0.13152462244033813, -0.07476786524057388, -0.05607331171631813, 0.0937274694442749, -0.17885710299015045, -0.0655229315161705, -0.1450410783290863, 0.07949322462081909, 0.17118096351623535, -0.05645675212144852, 0.09731192886829376, -0.11332552134990692, -0.07398656755685806, 0.052107710391283035, -0.0730527713894844, 0.0347457192838192, -0.15784336626529694, -0.16664336621761322, -0.1518060863018036, 0.0574537068605423, 0.18830008804798126

In [9]:
searched_resp = client.search(
    collection_name="tb_site_embed",
    anns_field="site_embed",
    data=[cur_site_embed],
    limit=100,
    output_fields=["site_idx"],
    search_params= {
        "metric_type": "IP"
    }
)

In [12]:
searched_site_idxs = [ resp['entity']['site_idx'] for resp in searched_resp[0]]

In [13]:
searched_site_idxs

[1,
 103,
 2,
 438,
 584,
 536,
 691,
 822,
 111,
 839,
 102,
 925,
 570,
 281,
 777,
 977,
 696,
 313,
 640,
 859,
 950,
 38,
 855,
 994,
 524,
 349,
 592,
 277,
 354,
 232,
 809,
 784,
 796,
 886,
 151,
 901,
 303,
 446,
 771,
 32,
 187,
 15,
 340,
 638,
 824,
 532,
 306,
 867,
 579,
 671,
 258,
 672,
 825,
 763,
 382,
 280,
 911,
 1000,
 180,
 514,
 172,
 512,
 779,
 678,
 357,
 838,
 194,
 37,
 64,
 782,
 929,
 19,
 646,
 520,
 879,
 728,
 990,
 616,
 850,
 810,
 499,
 498,
 972,
 768,
 791,
 302,
 862,
 411,
 334,
 248,
 828,
 832,
 231,
 18,
 501,
 710,
 424,
 970,
 100,
 593]

In [14]:
searched_resp

data: [[{'id': 463122217384807186, 'distance': 1.0, 'entity': {'site_idx': 1}}, {'id': 463122217384807288, 'distance': 0.9998921751976013, 'entity': {'site_idx': 103}}, {'id': 463122217384807187, 'distance': 0.9998758435249329, 'entity': {'site_idx': 2}}, {'id': 463122217384807623, 'distance': 0.9998663067817688, 'entity': {'site_idx': 438}}, {'id': 463122217384807769, 'distance': 0.999855101108551, 'entity': {'site_idx': 584}}, {'id': 463122217384807721, 'distance': 0.9998405575752258, 'entity': {'site_idx': 536}}, {'id': 463122217384807876, 'distance': 0.999837338924408, 'entity': {'site_idx': 691}}, {'id': 463122217384808007, 'distance': 0.9998338222503662, 'entity': {'site_idx': 822}}, {'id': 463122217384807296, 'distance': 0.999828040599823, 'entity': {'site_idx': 111}}, {'id': 463122217384808024, 'distance': 0.9998217821121216, 'entity': {'site_idx': 839}}] ... and 90 entities remaining]

In [15]:
distances = [ resp['distance'] for resp in searched_resp[0] ]

In [16]:
distances

[1.0,
 0.9998921751976013,
 0.9998758435249329,
 0.9998663067817688,
 0.999855101108551,
 0.9998405575752258,
 0.999837338924408,
 0.9998338222503662,
 0.999828040599823,
 0.9998217821121216,
 0.9998209476470947,
 0.9998205900192261,
 0.9998193979263306,
 0.9998190402984619,
 0.9998180866241455,
 0.9998177886009216,
 0.9998173713684082,
 0.9998140335083008,
 0.9998106956481934,
 0.9998089075088501,
 0.9998074173927307,
 0.9998069405555725,
 0.99980628490448,
 0.9998055696487427,
 0.9998054504394531,
 0.999805212020874,
 0.9998044967651367,
 0.9998007416725159,
 0.9997997283935547,
 0.9997968673706055,
 0.9997943043708801,
 0.9997937679290771,
 0.9997926950454712,
 0.9997910261154175,
 0.9997904300689697,
 0.9997904300689697,
 0.9997900724411011,
 0.9997895956039429,
 0.9997889399528503,
 0.9997887015342712,
 0.9997875094413757,
 0.9997831583023071,
 0.9997814297676086,
 0.9997802972793579,
 0.999779999256134,
 0.9997789263725281,
 0.9997786283493042,
 0.9997786283493042,
 0.99977600574