<a href="https://colab.research.google.com/github/tohnperfect/AIprototype2022/blob/main/VectorDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install milvus



In [2]:
!pip install milvus[client]



# Example codes

In [3]:
from milvus import default_server
from pymilvus import connections, utility

In [4]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

In [5]:
with default_server:
  # Milvus Lite has already started, use default_server here.
  connections.connect(host='127.0.0.1', port=default_server.listen_port)



    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.11-lite
 Process:   1122
 Started:   2023-07-24 10:42:39
 Config:    /root/.milvus.io/milvus-server/2.2.11/configs/milvus.yaml
 Logs:      /root/.milvus.io/milvus-server/2.2.11/logs

 Ctrl+C to exit ...




In [4]:
# Start your milvus server
default_server.start()

# Now you could connect with localhost and the given port
# Port is defined by default_server.listen_port
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# Check if the server is ready.
print(utility.get_server_version())



    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.11-lite
 Process:   590
 Started:   2023-07-24 16:20:17
 Config:    /root/.milvus.io/milvus-server/2.2.11/configs/milvus.yaml
 Logs:      /root/.milvus.io/milvus-server/2.2.11/logs

 Ctrl+C to exit ...
v2.2.11-lite


In [6]:
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="random", dtype=DataType.DOUBLE),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=8)
]
schema = CollectionSchema(fields, "hello_milvus is the simplest demo to introduce the APIs")
hello_milvus = Collection("hello_milvus", schema)

In [7]:
import random
entities = [
    [i for i in range(3000)],  # field pk
    [float(random.randrange(-20, -10)) for _ in range(3000)],  # field random
    [[random.random() for _ in range(8)] for _ in range(3000)],  # field embeddings
]
insert_result = hello_milvus.insert(entities)
# After final entity is inserted, it is best to call flush to have no growing segments left in memory
hello_milvus.flush()

In [10]:
len(entities)

3

In [11]:
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
hello_milvus.create_index("embeddings", index)




Status(code=0, message=)

In [22]:
hello_milvus.load()
vectors_to_search = entities[-1][-2:]
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}
result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, output_fields=["pk","random"])


In [17]:
vectors_to_search

[[0.3177493563909083,
  0.2223756061280907,
  0.04799438408919132,
  0.8485937802753468,
  0.3423304626098166,
  0.2339029737687267,
  0.310710557397019,
  0.003722010305584389],
 [0.6426312709799793,
  0.9843688019715365,
  0.6428635802644465,
  0.7012648838056548,
  0.6732065100037833,
  0.7943272205736936,
  0.2063378831729139,
  0.3722454697420817]]

In [23]:
for i,hits in enumerate(result):
    print(f'vector to search: {vectors_to_search[i]}')
    for hit in hits:
        print(f"hit: {hit}, random field: {hit.entity.get('random')}")

#print(search_latency_fmt.format(end_time - start_time))

vector to search: [0.3177493563909083, 0.2223756061280907, 0.04799438408919132, 0.8485937802753468, 0.3423304626098166, 0.2339029737687267, 0.310710557397019, 0.003722010305584389]
hit: id: 2998, distance: 0.0, entity: {'pk': 2998, 'random': -19.0}, random field: -19.0
hit: id: 1694, distance: 0.11356963962316513, entity: {'pk': 1694, 'random': -19.0}, random field: -19.0
hit: id: 2989, distance: 0.14509862661361694, entity: {'pk': 2989, 'random': -17.0}, random field: -17.0
vector to search: [0.6426312709799793, 0.9843688019715365, 0.6428635802644465, 0.7012648838056548, 0.6732065100037833, 0.7943272205736936, 0.2063378831729139, 0.3722454697420817]
hit: id: 2999, distance: 0.0, entity: {'pk': 2999, 'random': -11.0}, random field: -11.0
hit: id: 2077, distance: 0.09053687751293182, entity: {'pk': 2077, 'random': -18.0}, random field: -18.0
hit: id: 384, distance: 0.1561051309108734, entity: {'pk': 384, 'random': -11.0}, random field: -11.0


# Load AIDB

In [5]:
import sqlite3
import pandas as pd

In [6]:
conn = sqlite3.connect('nayoo_AIDB.db')
ai_df = pd.read_sql_query("SELECT * FROM aidash", conn)
conn.close()

In [20]:
ai_df = ai_df.dropna(subset=['id'])

In [21]:
ai_df.head()

Unnamed: 0,id,latitude,longitude,sell_price,number_of_bedroom,number_of_bathroom,number_of_rai,number_of_ngan,number_of_va,renovate,...,fortrain,forviz,time_stamps,central_cost,user_click,sub_dis_click,prop_click,sub_district,demand_score,combined_index
0,1,16.139321,102.631587,390000,2,2,0.0,1.0,45.0,0.0,...,1,1,2023-05-24 15:35:04,0.0,240.0,240.0,1168.0,ศรีบุญเรือง,49.503438,บ้านเดี่ยว_ศรีบุญเรือง_ชนบท
1,4,16.676156,102.917998,820000,1,1,0.0,0.0,88.0,0.0,...,1,1,2023-05-24 15:35:04,0.0,118.0,338.0,5458.0,ทรายมูล,49.935331,บ้านเดี่ยว_ทรายมูล_น้ำพอง
2,6,16.057538,102.726317,7000000,3,4,0.0,0.0,61.0,0.0,...,1,1,2023-05-24 15:35:04,0.0,219.0,3250.0,629.0,ในเมือง,51.564502,อาคารพาณิชย์/สำนักงาน_ในเมือง_บ้านไผ่
3,7,16.530401,102.594174,710000,0,0,3.0,2.0,9.0,0.0,...,1,1,2023-05-24 15:35:04,,260.0,1129.0,1865.0,หนองบัว,50.127279,ที่ดิน_หนองบัว_บ้านฝาง
4,9,15.85441,102.616296,13000000,0,0,38.0,0.0,92.6,0.0,...,1,1,2023-05-24 15:35:04,,77.0,271.0,1341.0,เก่างิ้ว,50.104252,ที่ดิน_เก่างิ้ว_พล


In [22]:
ai_df.columns

Index(['id', 'latitude', 'longitude', 'sell_price', 'number_of_bedroom',
       'number_of_bathroom', 'number_of_rai', 'number_of_ngan', 'number_of_va',
       'renovate', 'project_name', 'property_type', 'zone', 'usable_area',
       'ai_price', 'ai_score', 'number_of_parking', 'furniture', 'direction',
       'district', 'floor_number', 'number_of_story', 'email', 'name_th',
       'contact_name', 'updated_at', 'mobile1', 'created_at', 'province',
       'web_path', 'status', 'listing_type', 'motorway', 'trunk', 'primary',
       'secondary', 'tertiary', 'residential', 'fortrain', 'forviz',
       'time_stamps', 'central_cost', 'user_click', 'sub_dis_click',
       'prop_click', 'sub_district', 'demand_score', 'combined_index'],
      dtype='object')

In [23]:
ai_embed = ai_df[['latitude', 'longitude', 'sell_price', 'number_of_bedroom',
       'number_of_bathroom', 'number_of_rai', 'number_of_ngan', 'number_of_va',
       'renovate', 'property_type', 'zone', 'usable_area','number_of_parking',
       'district', 'sub_district', 'floor_number', 'number_of_story', 'motorway', 'trunk', 'primary',
       'secondary', 'tertiary', 'residential']]

# Build Embedding

In [24]:
from sklearn import preprocessing

In [31]:
import numpy as np

## one-hot

In [33]:
embed = pd.get_dummies(ai_embed,columns=['property_type','zone','district','sub_district'])

In [34]:
embed

Unnamed: 0,latitude,longitude,sell_price,number_of_bedroom,number_of_bathroom,number_of_rai,number_of_ngan,number_of_va,renovate,usable_area,...,sub_district_โนนหินแห่,sub_district_โนนอุดม,sub_district_โนนแดง,sub_district_โพธิ์ชัย,sub_district_โพธิ์ไชย,sub_district_โพนเพ็ก,sub_district_โสกนกเต็น,sub_district_ในเมือง,sub_district_ใหม่นาเพียง,sub_district_ไชยสอ
0,16.139321,102.631587,390000,2,2,0.0,1.0,45.0,0.0,244.0,...,0,0,0,0,0,0,0,0,0,0
1,16.676156,102.917998,820000,1,1,0.0,0.0,88.0,0.0,164.0,...,0,0,0,0,0,0,0,0,0,0
2,16.057538,102.726317,7000000,3,4,0.0,0.0,61.0,0.0,344.0,...,0,0,0,0,0,0,0,1,0,0
3,16.530401,102.594174,710000,0,0,3.0,2.0,9.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,15.854410,102.616296,13000000,0,0,38.0,0.0,92.6,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7563,16.434660,102.808615,4200000,3,2,0.0,0.0,58.7,0.5,0.0,...,0,0,0,0,0,0,0,0,0,0
7564,16.416775,102.818807,8000000,4,3,1.0,1.0,1.0,0.5,200.0,...,0,0,0,0,0,0,0,0,0,0
7565,16.396894,102.857463,900000,0,0,0.0,0.0,90.0,0.5,0.0,...,0,0,0,0,0,0,0,0,0,0
7566,16.669156,102.782942,200000,0,0,10.0,0.0,0.0,0.5,0.0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
embed = embed.fillna(0)

## normalized

In [36]:
min_max_scaler = preprocessing.MinMaxScaler()
embed_scaled = min_max_scaler.fit_transform(embed.values)

In [37]:
embed_scaled.shape

(7568, 299)

In [38]:
np.argwhere(np.isnan(embed_scaled))

array([], shape=(0, 2), dtype=int64)

## start DB

In [15]:
# Start your milvus server
default_server.start()

# Now you could connect with localhost and the given port
# Port is defined by default_server.listen_port
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# Check if the server is ready.
print(utility.get_server_version())

fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="data", dtype=DataType.VARCHAR,max_length=512,),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=embed_scaled.shape[1])
]
schema = CollectionSchema(fields, "nayoo VDB")
hello_milvus = Collection("nayooVDB", schema)





    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.11-lite
 Process:   26840
 Started:   2023-07-25 03:42:21
 Config:    /root/.milvus.io/milvus-server/2.2.11/configs/milvus.yaml
 Logs:      /root/.milvus.io/milvus-server/2.2.11/logs

 Ctrl+C to exit ...




v2.2.11-lite


## insert data

In [39]:
entities = [
    [i for i in range(ai_df.shape[0])],  # field pk
    ai_df['id'].values,  # field pk
    embed_scaled,  # field embeddings
]


insert_result = hello_milvus.insert(entities)
# After final entity is inserted, it is best to call flush to have no growing segments left in memory
hello_milvus.flush()

## indexing

In [40]:
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
hello_milvus.create_index("embeddings", index)



Status(code=0, message=)

## search

In [43]:
vectors_to_search = embed_scaled[10,:]

In [44]:
vectors_to_search

array([9.86806738e-06, 1.00645217e-05, 2.00000000e-05, 1.17647059e-02,
       1.17647059e-02, 0.00000000e+00, 0.00000000e+00, 3.19417476e-02,
       0.00000000e+00, 7.24788430e-05, 3.47222222e-03, 0.00000000e+00,
       4.44444444e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [50]:
hello_milvus.load()

search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}
result = hello_milvus.search([vectors_to_search], "embeddings", search_params, limit=5, output_fields=["data"])

In [51]:
result

<pymilvus.orm.search.SearchResult at 0x7db9107bd090>

In [52]:
for i,hits in enumerate(result):
    print(f'vector to search: {vectors_to_search[i]}')
    for hit in hits:
        print(f"hit: {hit}, data field: {hit.entity.get('data')}")

vector to search: 9.868067383233532e-06
hit: id: 10, distance: 0.0, entity: {'data': '20'}, data field: 20
hit: id: 1133, distance: 0.002382750390097499, entity: {'data': '6178'}, data field: 6178
hit: id: 1035, distance: 0.002725713886320591, entity: {'data': '6052'}, data field: 6052
hit: id: 1225, distance: 0.002854237100109458, entity: {'data': '6377'}, data field: 6377
hit: id: 1337, distance: 1.000012993812561, entity: {'data': '1060P'}, data field: 1060P


In [53]:
default_server.stop()