In [2]:
# ---------------- Notebook configuration ---------------- #
# Milvus target: the collection to write to and the partition inside it.
collection_name = 'trello_finder_v2'
partition_name = 'import'

# Relative weight of each per-field vector when the fields are blended into
# a single search vector.
weight_vector_title, weight_vector_content = 3, 1
weight_vector_tag, weight_vector_tf = 2, 2
# -------------------------------------------------------- #

In [6]:
# 引入套件
import pandas as pd
import numpy as np

In [9]:
# Load the JSON backup that already contains the per-field vectors. The cells
# below read its 'vector_title', 'vector_content', 'vector_tag', 'vector_tf'
# and 'title' columns.
df = pd.read_json("../backup/vector_all.json")

In [None]:
# Convert 'vector_content', 'vector_title', 'vector_tag', 'vector_tf' to NumPy ndarray
def convert_to_np_array(lst, dim=768):
    """Return ``lst`` as a NumPy array, or a zero vector when it is missing.

    Args:
        lst: A list, tuple or ``np.ndarray`` of numbers. Any other value
            (``None``, a ``NaN`` placeholder, a string, ...) is treated as a
            missing vector.
        dim: Length of the zero vector returned for a missing or empty
            input. Defaults to 768, the vector length used in this notebook.

    Returns:
        np.ndarray: The values of ``lst`` when it is a non-empty sequence,
        otherwise ``np.zeros(dim)``.
    """
    if isinstance(lst, (list, tuple, np.ndarray)):
        arr = np.asarray(lst)
        # An empty sequence carries no vector; fall through to the zero
        # vector so every result has a usable length.
        if arr.size > 0:
            return arr
    return np.zeros(dim)


# For each raw vector column, add a companion column holding the same vector
# as a NumPy ndarray. convert_to_np_array substitutes a zero array of
# length 768 when a vector is None.
converted_columns = {
    "np_vector_content": "vector_content",
    "np_vector_title": "vector_title",
    "np_vector_tag": "vector_tag",
    "np_vector_tf": "vector_tf",
}
for target_column, source_column in converted_columns.items():
    df[target_column] = df[source_column].apply(convert_to_np_array)

In [None]:
# Display the frame to check the np_vector_* columns added in the previous cell.
df

In [2]:
def calc_array_mean(set, len_array):
    """Return the weighted mean of the non-empty vectors in ``set``.

    Args:
        set: Iterable of dicts, each with
            - ``"weight"`` (float): weight applied to the vector, and
            - ``"array"`` (array-like or None): the vector itself.
            The name shadows the ``set`` builtin; it is kept because callers
            pass this argument by keyword.
        len_array: Length (int) of every vector and of the result.

    Returns:
        np.ndarray: ``sum(weight * array) / sum(weight)`` taken over the
        entries whose vector is present and has at least one non-zero
        component. When no entry qualifies, a zero vector of length
        ``len_array`` is returned instead of dividing by zero.
    """
    sum_weight = 0
    sum_array = np.zeros(len_array)
    for item in set:
        raw = item["array"]
        if raw is None:
            # Missing vector: contributes neither weight nor values.
            continue
        # Accept plain lists as well as ndarrays.
        vector = np.asarray(raw, dtype=float)
        # An all-zero (or empty) vector marks a missing field. Test the
        # components rather than their sum: a real vector such as [1, -1]
        # also sums to zero and must not be dropped.
        if not vector.any():
            continue
        sum_weight += item["weight"]
        sum_array += vector * item["weight"]

    if sum_weight == 0:
        # Nothing to average; avoid 0/0, which would yield a NaN vector.
        return np.zeros(len_array)
    return sum_array / sum_weight

In [33]:
def search_array(arr_title, arr_content, arr_tag, arr_tf):
    """Blend the four field vectors into one search vector.

    Each vector is paired with its module-level weight (title, content, tag,
    tf) and the pairs are passed to ``calc_array_mean`` with a vector length
    of 768.
    """
    weighted_pairs = (
        (weight_vector_title, arr_title),
        (weight_vector_content, arr_content),
        (weight_vector_tag, arr_tag),
        (weight_vector_tf, arr_tf),
    )
    entries = [
        {"weight": weight, "array": array}
        for weight, array in weighted_pairs
    ]
    return calc_array_mean(set=entries, len_array=768)


def apply_search_array(row):
    """Compute the combined search vector for one DataFrame row.

    The ndarray columns ``np_vector_title/content/tag/tf`` are used when the
    row has them. Otherwise the raw ``vector_*`` columns are used, which
    covers frames that store ndarrays in those columns directly.

    Args:
        row: A DataFrame row (``pd.Series``) as passed by ``df.apply(axis=1)``.

    Returns:
        The value returned by ``search_array`` for the row's four vectors.
    """
    def pick(field):
        converted = "np_vector_" + field
        if converted in row.index:
            return row[converted]
        return row["vector_" + field]

    return search_array(pick("title"), pick("content"), pick("tag"), pick("tf"))


# Compute the search vector of every row and store it in "vector_search".
# The assignment creates the column, so no placeholder is needed beforehand.
df['vector_search'] = df.apply(apply_search_array, axis=1)

In [3]:
from pymilvus import DataType, FieldSchema, CollectionSchema, Collection, connections
# Register the Milvus connection under the "default" alias. No host or port is
# passed here, so the client's defaults apply (presumably a local server —
# confirm).
connections.connect(alias="default")
# Collection object for the name set in the configuration cell (collection_name).
milvus_collection = Collection(collection_name)

In [4]:
# Index labels of the rows whose Milvus insert failed; filled by the insert loop below.
errorlist = []

In [None]:
# Insert every row of df into Milvus, one entity per row. The index label of
# each row that cannot be inserted is appended to errorlist.
for idx, row in df.iterrows():
    # Step 1: prepare the payload. A row whose title cannot be measured or
    # whose search vector is absent is recorded as failed and skipped.
    try:
        insert_milvus_title = row["title"]
        # Cap the title at 500 characters, ending in "..." when truncated.
        # NOTE(review): presumably matches the title field's maximum length in
        # the collection schema — confirm.
        if len(insert_milvus_title) > 500:
            insert_milvus_title = insert_milvus_title[:497] + "..."
        insert_milvus_vector = row["vector_search"]
    except Exception as exp:
        print("Milvus Insert Fail")
        print(exp)
        errorlist.append(idx)
        continue

    # Step 2: insert into the configured partition.
    try:
        status = milvus_collection.insert({
            "id": str(idx),
            "track_id": str(idx),
            "value": insert_milvus_vector,
            "title": insert_milvus_title,
        }, partition_name=partition_name)

        if status.insert_count != 1:
            print("Milvus Insert Fail")
            print(status)
            errorlist.append(idx)

    except Exception as exp:
        # Diagnostics for the failed insert: id, vector length and title.
        # The length is taken from the vector that was actually sent, not from
        # a separate "vector" column that no visible cell creates; a failed
        # lookup here would hide the real insert error.
        print("--------------------")
        print(str(idx), "|", len(str(idx)))
        print("Vector |", len(insert_milvus_vector))
        print(insert_milvus_title, "|", len(insert_milvus_title))
        print("--------------------")
        print("Milvus Insert Fail")
        print(exp)
        errorlist.append(idx)

In [None]:
# Show the index labels collected for the rows that failed to insert.
errorlist

---


In [None]:
# Review the rows whose insert failed. errorlist holds index *labels* (it is
# filled from df.iterrows()), so the rows are selected with .loc. Using .iloc
# would treat the labels as positions and pick the wrong rows, or raise,
# whenever the index is not exactly 0..n-1.
for idx, row in df.loc[errorlist].iterrows():
    print(idx)
    insert_milvus_title = row["title"]
    # Preview the title length after a tighter cap: titles longer than 75
    # characters are cut to 71 characters plus "...".
    if len(insert_milvus_title) > 75:
        insert_milvus_title = insert_milvus_title[:71] + "..."
    print(len(insert_milvus_title))
    print(row["title"])

In [None]:
# Delete the entities whose "id" field is one of the listed values.
# NOTE(review): destructive and hard-coded to two ids; re-running the cell
# issues the delete again.
expr = 'id in ["21846","8591"]'
milvus_collection.delete(expr)

In [5]:
# Delete the entity whose "id" field is "8370" (the recorded output below
# reports a delete count of 1).
# NOTE(review): destructive and hard-coded to a single id.
expr = 'id in ["8370"]'
milvus_collection.delete(expr)

(insert count: 0, delete count: 1, upsert count: 0, timestamp: 443367844300718081, success count: 0, err count: 0)

---


In [28]:
# Build a one-row test frame: every vector column holds a constant vector of
# length vector_length (content = 3, title = 2, tag = 1, tf = all zeros).
vector_length = 768
constant_vectors = {
    'vector_content': np.full(vector_length, 3),
    'vector_title': np.full(vector_length, 2),
    'vector_tag': np.full(vector_length, 1),
    'vector_tf': np.zeros(vector_length),
}
# Start from an empty single-row frame, then place each vector in its cell.
df = pd.DataFrame(index=[0], columns=list(constant_vectors))
for column_name, vector in constant_vectors.items():
    df.at[0, column_name] = vector

In [34]:
# Display the test frame; the recorded output includes the computed
# vector_search column (11/6 ≈ 1.8333 for the weights 3/1/2 with tf skipped).
df

Unnamed: 0,vector_content,vector_title,vector_tag,vector_tf,vector_search
0,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.8333333333333333, 1.8333333333333333, 1.833..."


In [9]:
# Import modules of Milvus
from pymilvus import connections, Collection, utility
# Setup environment value
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
load_dotenv()
# os.getenv returns None for any variable that is not set, and a str otherwise
# (so milvus_port is a string here, not an int).
milvus_path = os.getenv("milvus_path")
milvus_port = os.getenv("milvus_port")
milvus_db_name = os.getenv("milvus_db_name")
# NOTE(review): the two settings below are read but not used by any cell
# visible in this notebook.
article_vector_index_type = os.getenv("article_vector_index_type")
article_vector_metric_type = os.getenv("article_vector_metric_type")


# Connect to milvus server (connector)
# The connection is registered under the "default" alias.
# NOTE(review): `conn` is not referenced by any visible cell — confirm it is needed.
conn = connections.connect(
    alias="default",
    host=milvus_path,
    port=milvus_port,
    db_name=milvus_db_name
)

# The collection name is hard-coded below rather than taken from a setting.

# Collection object for "trello_finder_keyword".
# NOTE(review): no schema is passed, so the collection presumably must
# already exist on the server — confirm.
collection = Collection("trello_finder_keyword")

In [10]:
# Reset the failure list.
# NOTE(review): none of the upload cells visible below append to it.
errorlist = []

In [16]:
import pandas as pd
# Load the TF dictionary with vectors. The recorded output below shows the
# columns word, word_id and vector. This rebinds `df`, replacing the earlier frame.
df = pd.read_json("../backup/5. TF 字典（含向量）.json")

In [17]:
# Display the loaded dictionary (pandas shows the first and last rows).
df

Unnamed: 0,word,word_id,vector
0,方法,0,"[-0.018200440300000002, -0.0207575597, -0.0205..."
1,因果,1,"[-0.0245203413, 0.047072701200000004, -0.02090..."
2,關係,2,"[0.0035511809, 0.1148635149, -0.0184334479, 0...."
3,資料,3,"[-0.1050320342, 0.10554046930000001, -0.018963..."
4,AI,4,"[-0.1786929816, 0.10562170300000001, -0.010878..."
...,...,...,...
1595823,黨團,1595823,"[-0.0738442689, 0.0248856582, -0.0183375981, 0..."
1595831,備而不用,1595831,"[0.025757161900000002, 0.1049960852, -0.021495..."
1595850,格架,1595850,"[-0.0103145884, 0.0129827634, -0.0215662960000..."
1595881,Goods,1595881,"[0.0315635279, 0.034908812500000004, -0.012700..."


In [20]:
# Number of rows to upload. The recorded value (101525) is smaller than the
# largest index label shown above, so the index is not contiguous.
len(df)

101525

In [26]:
def uploadToMilvus(index, vector, word, partition_name="V3"):
    """Insert one keyword entity into ``collection`` and report success.

    Args:
        index: Identifier of the keyword; stored as a string in both the
            ``keyword_id`` and ``link_id`` fields.
        vector: Value stored in the ``vector`` field.
        word: Text stored in the ``preview_str`` field.
        partition_name: Partition to insert into. Defaults to ``"V3"``, the
            partition this function previously hard-coded.

    Returns:
        bool: True when the insert result reports exactly one inserted
        entity; otherwise False, after printing ``"Insert failed!"``.

    Exceptions raised by ``collection.insert`` are not caught here.
    """
    status = collection.insert({
        "keyword_id": str(index),
        "link_id": str(index),
        "vector": vector,
        "preview_str": word,
    }, partition_name=partition_name)

    inserted = status.insert_count == 1
    if not inserted:
        print("Insert failed!")
    return inserted

In [27]:
# Upload every row with one insert call per row and store each call's
# True/False result in the "milvusUpload" column.
df["milvusUpload"] = df.apply(lambda row: uploadToMilvus(row["word_id"], row["vector"],row["word"]), axis=1 )

In [31]:
# Show only the rows whose upload returned True.
df.loc[df["milvusUpload"]==True]

Unnamed: 0,word,word_id,vector,milvusUpload
0,方法,0,"[-0.018200440300000002, -0.0207575597, -0.0205...",True
1,因果,1,"[-0.0245203413, 0.047072701200000004, -0.02090...",True
2,關係,2,"[0.0035511809, 0.1148635149, -0.0184334479, 0....",True
3,資料,3,"[-0.1050320342, 0.10554046930000001, -0.018963...",True
4,AI,4,"[-0.1786929816, 0.10562170300000001, -0.010878...",True
...,...,...,...,...
1595823,黨團,1595823,"[-0.0738442689, 0.0248856582, -0.0183375981, 0...",True
1595831,備而不用,1595831,"[0.025757161900000002, 0.1049960852, -0.021495...",True
1595850,格架,1595850,"[-0.0103145884, 0.0129827634, -0.0215662960000...",True
1595881,Goods,1595881,"[0.0315635279, 0.034908812500000004, -0.012700...",True
