### Basic Function

In [121]:
# 引入套件
import pandas as pd
import numpy as np

In [123]:
# Import modules
from sentence_transformers import SentenceTransformer, util
# Select model by transformer
# about model: https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2
sbert_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
print('sbert_model loaded')

def embedding_sentence(sentence):
    try:
        return {
            "state" : True,
            "value" : sbert_model.encode(sentence).tolist(), # return a vector list
        }
    except Exception as exp:
        return {
            "state" : False,
            "value" : np.zeros(768)
        }

def convert_to_np_array(lst):
    # 檢查 lst 是否為 None 或為具有值的列表
    if lst is None:
        return np.zeros(768)
    elif isinstance(lst, list):
        return np.array(lst)
    else:
        return np.zeros(768)

def calc_array_mean(input_set, len_array):
    # Define weight that needs to sum
    sum_weight = 0
    # Define sum_array to sum all arrays
    sum_array = np.zeros(len_array)
    for item in input_set:
        item['array'] = np.array(item['array'])  # Convert to NumPy array
        # Check if the array is empty
        if item['array'].sum() == 0:
            # Don't calculate this weight
            sum_weight += 0
        else:
            # Calculate this weight
            sum_weight += item['weight']
            # Add to sum_array
            sum_array += item['array'] * item['weight']

    return sum_array / sum_weight

# Import modules of MongoDB
from pymongo import MongoClient
from pymongo.errors import InvalidOperation
# Import modules of datetime(For Log)
from datetime import datetime,timezone,timedelta
# Setup environment value
import os
from dotenv import load_dotenv
load_dotenv()
mongodb_path = os.getenv("mongodb_path")
mongodb_username = os.getenv("mongodb_username")  # 替換為你的用戶名
mongodb_password = os.getenv("mongodb_password")  # 替換為你的密碼

# 建立 MongoDB 連線
client = MongoClient(mongodb_path, username=mongodb_username, password=mongodb_password)
db = client.nthu_trello_helper
mongo_article_collection = db.article
mongo_trello_log_collection = db.trello_log
mongo_word_injection_collection = db.injection_list
mongo_keyword_collection = db.keyword
mongo_keyword_record_collection = db.keyword_record

#article_df = []
#milvus_df = []
#kr_df = []

import re
import monpa
monpa.load_userdict("../../setting/MONPA_斷詞字典.txt")
print('monpa_dict loaded')

# 讀取文字檔並轉換成串列
def txt_to_list(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.readlines()
            content = [line.strip() for line in content]
        return content
    except FileNotFoundError:
        print("找不到指定的檔案。請檢查檔案路徑是否正確。", file_path)
        return []
    except Exception as e:
        print("讀取檔案時發生錯誤：", e)
        return []


# 指定文字檔路徑
stop_word_list = txt_to_list("../../setting/stopwords_chinese.txt")
print('stop_word_list loaded')

def cutSentence(sentence):
    seg = []
    for item in sentence.split("，"):
        if item != "\n":
            seg.extend(monpa.cut(str(item+"，")))

    word_list = seg[:-1]

    word_list = [word for word in word_list if word not in stop_word_list]
    # 将词列表拼接成句子
    sentence = ' '.join(word_list)
    return sentence
import re
from collections import Counter
def calculate_word_frequency(row, text):
    df_array = []
    words = re.findall(r'\b\w+\b', text)  # 利用正則表達式分割文本成單詞
    word_freq = Counter(words)  # 計算字詞的字頻
    total_words = len(words)  # 總字詞數

    # 計算每個字的tf
    word_tf = {word: word_freq[word] / total_words for word in word_freq}

    # 將結果轉換成指定的格式
    result = [{"article": row, "word": word, "count": word_freq[word],
               "tf": word_tf[word]} for word in word_freq if word_freq[word] > 1]
    df_array.extend(result)
    return df_array
def get_new_article_id():
    # 执行聚合查询以找到linked_id字段的最大值
    # 使用聚合查询将文本字段转换为数字并找到最大值
    pipeline = [
        {
            "$project": {
                "link_id_numeric": {
                    "$convert": {
                        "input": "$link_id",
                        "to": "int",
                        "onError": 0  # 如果转换失败，将使用0作为默认值
                    }
                }
            }
        },
        {
            "$group": {
                "_id": None,
                "maxLinkIdNumeric": { "$max": "$link_id_numeric" }
            }
        }
    ]

    try:
        result = list(mongo_article_collection.aggregate(pipeline))
        if result:
            max_linked_id_numeric = result[0]["maxLinkIdNumeric"]
            #print(f"最大的link_id数值是: {max_linked_id_numeric}")
            return (max_linked_id_numeric+1)
        else:
            print("没有找到可转换为数值的linked_id值。")
            return 0
    except InvalidOperation as e:
        print(f"无法进行转换或聚合操作: {e}")
        return 0
def get_new_keyword_id():
    # 执行聚合查询以找到linked_id字段的最大值
    # 使用聚合查询将文本字段转换为数字并找到最大值
    pipeline = [
        {
            "$project": {
                "link_id_numeric": {
                    "$convert": {
                        "input": "$link_id",
                        "to": "int",
                        "onError": 0  # 如果转换失败，将使用0作为默认值
                    }
                }
            }
        },
        {
            "$group": {
                "_id": None,
                "maxLinkIdNumeric": { "$max": "$link_id_numeric" }
            }
        }
    ]

    try:
        result = list(mongo_keyword_collection.aggregate(pipeline))
        if result:
            max_linked_id_numeric = result[0]["maxLinkIdNumeric"]
            #print(f"最大的link_id数值是: {max_linked_id_numeric}")
            return (max_linked_id_numeric+1)
        else:
            print("没有找到可转换为数值的linked_id值。")
            return -1
    except InvalidOperation as e:
        print(f"无法进行转换或聚合操作: {e}")
        return -1
def get_article_id(article_title):
    # 搜尋關鍵字 ID
    result = mongo_article_collection.find_one({
        "title": article_title
    })
    if result is not None:
        return result["link_id"]
def get_kw_id(kw_string):
    # 搜尋關鍵字 ID
    result = mongo_keyword_collection.find_one({
        "preview_str": kw_string
    })
    if result is not None:
        return result["link_id"]
    else:
        # Import modules of Milvus
        from pymilvus import connections, Collection, utility
        # Setup environment value
        import os
        from dotenv import load_dotenv
        load_dotenv()
        milvus_path = os.getenv("milvus_path")
        milvus_port = os.getenv("milvus_port")
        milvus_db_name = os.getenv("milvus_db_name")

        # Connect to milvus server (connector)
        conn = connections.connect(
            alias="default",
            host=milvus_path,
            port=milvus_port,
            db_name=milvus_db_name
        )

        # 設定 Milvus collection 名稱
        # 建立 collection
        collection = Collection("trello_finder_keyword")
        #======================== env done ==========================#

        new_kw_id = str(get_new_keyword_id())
        kw_vector = embedding_sentence(kw_string)["value"]
        mongo_keyword_collection.insert_one({
            "link_id": new_kw_id,
            "vector": kw_vector,
            "preview_str": kw_string,
        })

        status = collection.insert({
            "keyword_id": new_kw_id,
            "link_id": new_kw_id,
            "vector": kw_vector,
            "preview_str": kw_string,
        }, partition_name = MILVUS_KEYWORD_PART)

        if (status.insert_count != 1):
            print("Insert failed!")
            return False
        else:
            return new_kw_id

from pymilvus import DataType, FieldSchema, CollectionSchema, Collection, connections

MONGO_FROM_TEXT = "_data"
MILVUS_ARTICLE_PART = 'import'
MILVUS_KEYWORD_PART = "import"


11/04/2023 13:15:38 - INFO - sentence_transformers.SentenceTransformer - Load pretrained SentenceTransformer: paraphrase-multilingual-mpnet-base-v2


11/04/2023 13:15:40 - INFO - sentence_transformers.SentenceTransformer - Use pytorch device: cpu


sbert_model loaded
monpa_dict loaded
stop_word_list loaded


In [179]:
def insert_to_mongo(article_id, title, url, content, vector):
    try:
        monpa_str = (title + ' ')*10 + content
        # 提取字頻和tf
        corpus = cutSentence(monpa_str)
        # 插入 MongoDB
        mongo_data = {
            "link_id": article_id,
            "title": title,
            "url": url,
            "content": content,
            "values": vector,
            "cuted" : corpus,
            "form" : MONGO_FROM_TEXT
        }
        #global article_df  # 声明使用外部的 article_df
        #article_df.append(mongo_data)
        mongo_article_collection.insert_one(mongo_data)
        return True
    except Exception as exp:
        print("MongoDB Insert Fail")
        print(exp)
        return False

def insert_to_milvus(article_id, article_title, vectors_to_search):
    # 設定 Milvus 的連線資訊
    # +------------------------------+ #
    # 設定 Milvus collection 名稱和維度
    collection_name = 'trello_finder_v2'
    # +------------------------------+ #
    connections.connect(alias="default")
    milvus_collection = Collection(collection_name)
    
    try:
        insert_milvus_title = article_title
        if (len(insert_milvus_title) > 50):
            insert_milvus_title = insert_milvus_title[:47] + "..."

        try:
            milvus_data = {
                "id": article_id,
                "track_id": article_id,
                "value": vectors_to_search,
                "title": insert_milvus_title,
            }
            #global milvus_df  # 声明使用外部的 article_df
            #milvus_df.append(milvus_data)


            # 插入 Milvus
            status = milvus_collection.insert(milvus_data, partition_name=MILVUS_ARTICLE_PART)

            if (status.insert_count != 1):
                print("Milvus Insert Fail")
                print(status)
                return False
            return True

        except Exception as exp:
            print("--------------------")
            #print(article_id, "|", len(article_id))
            print("Vector |", len(vectors_to_search))
            print(insert_milvus_title, "|", len(insert_milvus_title))
            print("--------------------")
            print("Milvus Insert Fail")
            print(exp)
            return False

    except Exception as exp:
        print("Milvus Insert Fail")
        print(exp)
        return False

def insert_to_kr(article_id, article_title, article_content):
    try:
        monpa_str = (article_title + ' ')*10 + article_content
        
        # 提取字頻和tf
        corpus = cutSentence(monpa_str)
        kw_list = calculate_word_frequency(article_id, corpus)

        for item in kw_list:
            kr_data = {
                "article_id" : article_id,
                "keyword_id" : get_kw_id(item["word"]),
                "tf_value" : item["tf"],
                "count" : item["count"],
                "score" : item["tf"] * item["count"]
            }
            #global kr_df  # 声明使用外部的 article_df
            #kr_df.append(kr_data)
            
            mongo_keyword_record_collection.insert_one(kr_data)
        
        return True
    except Exception as e:
        print(e)
        return False


In [152]:
# 將文章的資料引入
def add_article(article_content,article_title,article_url,article_tag="", content_vector=[], title_vector=[]):
    
    if article_content != "":
        if len(content_vector) == 0:
            vector_content = embedding_sentence(article_content)["value"]
        else:
            vector_content = content_vector
    else:
        return {
            "state" : False,
            "msg" : "article_content is null"
        }
    if article_title != "":
        if len(title_vector) == 0:
            vector_title = embedding_sentence(article_title)["value"]
        else:
            vector_title = title_vector
    else:
        return {
            "state" : False,
            "msg" : "article_title is null"
        }
    if article_url == "":
        return {
            "state" : False,
            "msg" : "article_url is null"
        }
    if article_tag != "":
        vector_tag = embedding_sentence(article_tag)["value"]
    else:
        noTag = True


    weights = [3, 1]  # 根据你的要求定义权重
    vectors_to_calc = [np.array(vector_title), np.array(vector_content)]
    
    sum_weight = sum(weights)
    weighted_sum = sum(w * v for w, v in zip(weights, vectors_to_calc))
    vectors_to_search = weighted_sum / sum_weight

    new_article_id = str(get_new_article_id())

    # Insert to mongodb
    toMongo = insert_to_mongo(new_article_id, article_title, article_url, article_content, vectors_to_search.tolist())
    if toMongo is False:
        return {
            "state" : False,
            "msg" : "tomongo error"
        }

    # Insert to Milvus
    toMilvus = insert_to_milvus(new_article_id, article_title, vectors_to_search.tolist())
    if toMilvus is False:
        return {
            "state" : False,
            "msg" : "to milvus error"
        }

    toKr = insert_to_kr(new_article_id,article_title,article_content)
    if toKr is False:
        return {
            "state" : False,
            "msg" : "to keyrecord error"
        }
    
    return {
        "state" : True,
        "msg" : "Yes"
    }

### Read Article
require : `title` , `content` , `url` , `vector_title[768]` , `vector_content[768]`

In [145]:
df = pd.read_json("../backup/vector_all.json")

In [147]:
df.iloc[0]

class                                                            數學
title             微積分初階－歷史發展的眼光（9）萊布尼茲從差和分連續化得到微積分（First Course ...
tag                                 積分 無窮小量 微積分 微分 和分 數學 差分 微積分根本定理
TF                                  無窮 dx 微積分 積分 Delta displaystyle
url               https://highscope.ch.ntu.edu.tw/wordpress/?p=1...
content           微積分初階－歷史發展的眼光（9）萊布尼茲從差和分連續化得到微積分（First Course ...
vector_content    [-0.1001347452, 0.06442421670000001, -0.009284...
train_word        微積分初階－歷史發展的眼光（9）萊布尼茲從差和分連續化得到微積分（First Course ...
vector            [-0.1342700869, 0.038052819700000004, -0.00888...
vector_title      [-0.1260712594, 0.0393261835, -0.0068134414, 0...
vector_tag        [-0.0099172266, -0.0680422038, -0.016843661700...
vector_tf         [-0.1165701821, -0.1580965668, -0.0129927788, ...
Name: 2562, dtype: object

Setting Save tag for mongodb and milvus partition_name

In [176]:
MONGO_FROM_TEXT = "Pansci_data"
MILVUS_ARTICLE_PART = 'import_pansci'
MILVUS_KEYWORD_PART = "import_pansci"

In [154]:
df_size = len(df)
for index, row in df.iterrows():
    step = add_article(
        row["content"],
        row["title"],
        row["url"],
        content_vector= row["vector_content"],
        title_vector= row["vector_title"],
    )
    if step["state"] is False:
        print(index,"/",df_size)
        break

没有找到可转换为数值的linked_id值。


Batches: 100%|██████████| 1/1 [00:00<00:00, 41.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.49it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.62it/s]
Batches: 1

### Delete Data by Index

In [160]:
# 設定 Milvus collection 名稱和維度
collection_name = 'trello_finder_v2'
# +------------------------------+ #
connections.connect(alias="default")
milvus_collection = Collection(collection_name)

for i in range(25228,30988,1):
    expr = f'id in ["{i}"]'
    #print(expr)
    milvus_collection.delete(expr)

In [173]:
# Delete Keyword Record
for i in range(25228,30988,1):
    result = mongo_keyword_record_collection.delete_many({"article_id": f"{i}"})
    if result.deleted_count > 0:
        print(f"{result.deleted_count} documents deleted successfully.")

491 documents deleted successfully.
135 documents deleted successfully.
190 documents deleted successfully.
88 documents deleted successfully.
413 documents deleted successfully.
353 documents deleted successfully.
236 documents deleted successfully.
87 documents deleted successfully.
53 documents deleted successfully.
199 documents deleted successfully.
434 documents deleted successfully.
172 documents deleted successfully.
391 documents deleted successfully.
596 documents deleted successfully.
51 documents deleted successfully.
1071 documents deleted successfully.
579 documents deleted successfully.
108 documents deleted successfully.
450 documents deleted successfully.
63 documents deleted successfully.
192 documents deleted successfully.
558 documents deleted successfully.
43 documents deleted successfully.
41 documents deleted successfully.
118 documents deleted successfully.
117 documents deleted successfully.
529 documents deleted successfully.
618 documents deleted successfully