In [24]:
from pymilvus import MilvusClient, DataType

In [25]:
# 连接到 milvus
client = MilvusClient(uri='http://localhost:19530')

In [29]:
# 创建景点表
schema = client.create_schema()

# 主键
schema.add_field(
    field_name="id",
    is_primary=True,
    auto_id=True,
    datatype=DataType.INT64
)

schema.add_field(
    field_name="site_idx",
    datatype=DataType.INT64
)

schema.add_field(
    field_name="site_embed",
    datatype=DataType.FLOAT_VECTOR,
    dim=64
)


{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'site_idx', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'site_embed', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 64}}], 'enable_dynamic_field': False}

In [27]:
client.has_collection(collection_name="tb_site_embed")

True

In [8]:
client.describe_collection(collection_name="tb_site_embed")

{'collection_name': 'tb_site_embed',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'site_idx',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'site_embed',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 64}}],
 'functions': [],
 'aliases': [],
 'collection_id': 463088652533107462,
 'consistency_level': 2,
 'properties': {'timezone': 'UTC'},
 'num_partitions': 1,
 'enable_dynamic_field': False,
 'created_timestamp': 463088741075124242,
 'update_timestamp': 463088741075124242}

In [13]:
# 创建用户表
user_schema = client.create_schema()

user_schema.add_field(
    field_name="id",
    is_primary=True,
    auto_id=True,
    datatype=DataType.INT64
)

user_schema.add_field(
    field_name="user_id",
    datatype=DataType.INT64
)

user_schema.add_field(
    field_name="user_embed",
    datatype=DataType.FLOAT_VECTOR,
    dim=64
)


{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'user_id', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'user_embed', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 64}}], 'enable_dynamic_field': False}

In [28]:
if client.has_collection(collection_name="tb_site_embed"):
    client.drop_collection(collection_name="tb_site_embed")

In [12]:
if client.has_collection(collection_name="tb_user_embed"):
    client.drop_collection(collection_name="tb_user_embed")

In [30]:
print(client.has_collection(collection_name="tb_site_embed"))
print(client.has_collection(collection_name="tb_user_embed"))

False
True


In [31]:
site_index_params = client.prepare_index_params()

site_index_params.add_index(
    field_name="site_embed",
    index_name="site_embed_index",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

client.create_collection(
    collection_name="tb_site_embed",
    schema=schema,
    index_params=site_index_params
)

In [14]:
user_index_params = client.prepare_index_params()

user_index_params.add_index(
    field_name="user_embed",
    index_name="user_embed_index",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

client.create_collection(
    collection_name="tb_user_embed",
    schema=user_schema,
    index_params=user_index_params
)

In [23]:
client.describe_collection(collection_name="tb_site_embed")

{'collection_name': 'tb_site_embed',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'site_idx',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'site_embed',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 64}}],
 'functions': [],
 'aliases': [],
 'collection_id': 463071564295321901,
 'consistency_level': 2,
 'properties': {'timezone': 'UTC'},
 'num_partitions': 1,
 'enable_dynamic_field': False,
 'created_timestamp': 463072261962465298,
 'update_timestamp': 463072261962465298}

In [17]:
client.describe_collection(collection_name="tb_user_embed")

{'collection_name': 'tb_user_embed',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'user_id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'user_embed',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 64}}],
 'functions': [],
 'aliases': [],
 'collection_id': 463071564295322436,
 'consistency_level': 2,
 'properties': {'timezone': 'UTC'},
 'num_partitions': 1,
 'enable_dynamic_field': False,
 'created_timestamp': 463072287809863697,
 'update_timestamp': 463072287809863697}

In [11]:
import numpy as np

In [12]:
vec = np.random.randn(64)

res = client.search(
    collection_name="tb_site_embed",
    anns_field="site_embed",
    data=[vec],
    limit=3,
    output_fields=["site_idx"],
    search_params={"metric_type": "IP"}
)

In [13]:
res[0]

[]

In [32]:
import pandas as pd

In [33]:
tb_sites = pd.read_csv('../dataset/tb_sites.csv')
tb_sites

Unnamed: 0,id,name,score,address,hot_degree,introduce,open_time,phone,images,site_idx
0,1,故宫博物院,4.8,北京市东城区景山前街4号,10.0,故宫博物院又称紫禁城，是明、清两代的皇宫，也是古老中国的标志和象征。当你置身于气派规整的高墙...,今日已闭园；明天08:30-16:30开放（15:30停止入园）,票务咨询：4009501925,https://dimg04.c-ctrip.com/images/100k0q000000...,1
1,2,八达岭长城,4.7,北京市延庆区G6京藏高速58号出口,10.0,八达岭长城号称天下九塞之一，风光集巍峨险峻、秀丽苍翠于一体，是明长城景色中的精华。“不到长城...,今日已闭园；明天07:30-16:00开放（16:00停止入园）,票务咨询：4008180088,https://dimg04.c-ctrip.com/images/1lo5h12000nz...,2
2,3,外滩,4.8,上海市黄浦区中山东一路北至外白渡桥、南至十六铺码头,10.0,外滩位于黄浦江畔，全长约1.5公里，是上海城市象征意义的景点，风格迥异的万国建筑群和浦江夜景...,全年全天开放,,https://dimg04.c-ctrip.com/images/10040m000000...,3
3,4,秦始皇帝陵博物院(兵马俑),4.7,西安市临潼区秦陵北路,10.0,秦始皇帝陵博物院(兵马俑)又称兵马俑、秦兵马俑。它是秦始皇陵的陪葬坑，与秦始皇陵一同组成了：...,今日已闭园；明天08:30-18:00开放（16:30停止入园）,票务咨询：029-81399127,https://dimg04.c-ctrip.com/images/0104012000f6...,4
4,5,香港迪士尼乐园,4.7,香港大屿山香港迪士尼乐园度假区,10.0,香港迪士尼乐园20周年奇妙派对\n在香港迪士尼乐园，来一场奇妙的派对！一般你喜爱的迪士尼朋友...,今日已闭园；明天10:00-20:30开放,票务咨询：+852-1830830,https://dimg04.c-ctrip.com/images/1lo0l12000pi...,5
...,...,...,...,...,...,...,...,...,...,...
995,996,京杭大运河杭州景区,4.3,浙江省杭州市拱墅区环城北路226号,6.8,京杭大运河杭州景区是京杭大运河的最南端，在千百年的历史中，融汇了中国南北各地的特色物产、饮食...,全年全天开放,票务咨询：0571-85190851,https://dimg04.c-ctrip.com/images/100u1f000001...,996
996,997,金山,4.6,江苏省镇江市润州区和平路街道金山路62号,6.8,金山位于镇江市西北，海拔43.7米，距离镇江市中心仅3公里。原先是长江中心的一个岛屿，后来长...,今日已闭园；明天08:00-16:30开放（16:00停止入园）,票务咨询：0511-85512992,https://dimg04.c-ctrip.com/images/0106612000f6...,997
997,998,大同市博物馆,4.4,山西省大同市平城区文瀛湖街道太和路506号,6.8,大同市博物馆始建于1958年，是一座综合性地志博物馆2020年被评为“国家一级博物馆”。大同...,今日已闭园；明天09:00-17:00开放（16:30停止入园）,票务咨询：0352-2303518,https://dimg04.c-ctrip.com/images/10080z000000...,998
998,999,徐州汉文化景区,4.6,江苏省徐州市云龙区兵马俑路1号,6.8,徐州汉文化景区距离徐州市区很近，其中徐州汉兵马俑博物馆、水下兵马俑博物馆、汉画长廊、1995...,冬季 08:30-17:00开放（16:30停止售票），夏季 08:30-17:30开放（1...,票务咨询：0516-83661269 ; 0516-83167053,https://dimg04.c-ctrip.com/images/0105512000h4...,999


In [37]:
introduce = tb_sites['introduce']

In [38]:
introduce[introduce.isna()].index

Index([], dtype='int64')

In [36]:
tb_sites['introduce'] = tb_sites['introduce'].fillna('')

In [39]:
empty_index = tb_sites[tb_sites['introduce'] == ''].index

In [40]:
empty_index

Index([198, 220, 365, 468, 552, 606, 635, 672, 687, 710, 750, 756, 825, 925,
       943, 953, 967, 991],
      dtype='int64')

In [43]:
mask = tb_sites["name"].str.contains("演唱会", na=False, case=False)

In [44]:
singer_index = tb_sites[mask].index

In [45]:
singer_index

Index([195, 287, 299, 365, 446, 479, 490, 552, 606, 622, 627, 667, 710, 756,
       779, 879, 977, 988],
      dtype='int64')

In [48]:
idx_to_del = empty_index.tolist() + singer_index.tolist()

In [49]:
idx_to_del

[198,
 220,
 365,
 468,
 552,
 606,
 635,
 672,
 687,
 710,
 750,
 756,
 825,
 925,
 943,
 953,
 967,
 991,
 195,
 287,
 299,
 365,
 446,
 479,
 490,
 552,
 606,
 622,
 627,
 667,
 710,
 756,
 779,
 879,
 977,
 988]

In [50]:
len(idx_to_del)

36

In [51]:
idx_to_del = list(set(idx_to_del))

In [52]:
idx_to_del, len(idx_to_del)

([779,
  667,
  925,
  287,
  672,
  552,
  299,
  687,
  943,
  953,
  825,
  446,
  195,
  198,
  710,
  967,
  977,
  468,
  220,
  988,
  606,
  991,
  479,
  490,
  365,
  750,
  622,
  879,
  627,
  756,
  635],
 31)

In [53]:
site_idx_to_del = [ idx+1 for idx in idx_to_del ]
site_idx_to_del, len(site_idx_to_del)

([780,
  668,
  926,
  288,
  673,
  553,
  300,
  688,
  944,
  954,
  826,
  447,
  196,
  199,
  711,
  968,
  978,
  469,
  221,
  989,
  607,
  992,
  480,
  491,
  366,
  751,
  623,
  880,
  628,
  757,
  636],
 31)

In [54]:
del_list_str = ','.join([ str(idx) for idx in site_idx_to_del ])
del_list_str

'780,668,926,288,673,553,300,688,944,954,826,447,196,199,711,968,978,469,221,989,607,992,480,491,366,751,623,880,628,757,636'

In [55]:
del_expr = f"site_idx in [{del_list_str}]"
del_expr

'site_idx in [780,668,926,288,673,553,300,688,944,954,826,447,196,199,711,968,978,469,221,989,607,992,480,491,366,751,623,880,628,757,636]'

In [59]:
print(del_expr)

client.delete(
    collection_name="tb_site_embed",
    filter=del_expr
)

site_idx in [780,668,926,288,673,553,300,688,944,954,826,447,196,199,711,968,978,469,221,989,607,992,480,491,366,751,623,880,628,757,636]


{'delete_count': 31, 'cost': 0}