In [4]:
from attr.validators import instance_of
#创建集合单元
from sentence_transformers import SentenceTransformer
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient

from 爬虫.新闻动态 import title

# Milvus client handle for this cell; create_collection() and the list_indexes
# call further down both use this module-level object.
# NOTE(review): the server address is hard-coded — confirm it is reachable from
# wherever this notebook is run.
client = MilvusClient(uri="http://192.168.200.130:19530")

def create_collection(collection_name="cumt_gpt_chuxin", dim=1024):
    """Drop (if present) and recreate the document collection with an HNSW index.

    The collection holds one row per document: an auto-generated INT64 primary
    key, the source URL, the raw text, the text embedding and the publish date
    stored as an integer.

    WARNING: this is destructive. An existing collection with the same name is
    dropped together with all of its data before the new one is created.

    Args:
        collection_name: Name of the Milvus collection to (re)create.
        dim: Dimension of the ``content_vector`` field. It must match the
            output size of the embedding model used at insert/search time.

    Returns:
        None. Works through the module-level ``client``.
    """
    # Field definitions. "title" / "title_vector" fields were tried earlier and
    # are currently disabled; re-add them here (and a matching index below) if
    # title search is needed again.
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, auto_id=True, is_primary=True),
        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        FieldSchema(name="publish_time", dtype=DataType.INT64),
    ]
    schema = CollectionSchema(
        fields=fields,
        description="CUMT_GPT的Chuxin数据集"
    )

    # Start from a clean slate so the schema above always takes effect.
    if client.has_collection(collection_name):
        client.drop_collection(collection_name)

    client.create_collection(collection_name=collection_name, schema=schema)

    # HNSW index with cosine similarity on the embedding field.
    # (An IVF_FLAT index with nlist=128 was the earlier alternative.)
    index_params = client.prepare_index_params()
    index_params.add_index(
        field_name="content_vector",
        index_type="HNSW",
        metric_type="COSINE",
        params={"M": 32, "efConstruction": 250}
    )

    # sync=False is passed through unchanged from the original code.
    # NOTE(review): presumably this returns before the index build has
    # finished — confirm against the pymilvus version in use.
    # NOTE(review): the collection is not explicitly loaded here; confirm it is
    # loaded somewhere before searching.
    client.create_index(
        collection_name=collection_name,
        index_params=index_params,
        sync=False)


# Rebuild the collection (this drops any existing data), then list its indexes
# as the cell's displayed value to confirm the content_vector index exists.
create_collection()
client.list_indexes(collection_name="cumt_gpt_chuxin")

['content_vector']

In [11]:
# 插入数据单元
import json
from datetime import datetime
from pymilvus import MilvusClient
from langchain_community.embeddings import OllamaEmbeddings
from FlagEmbedding import FlagModel
# from sonyflake import SonyFlake

# Milvus client handle for the insert cell; re-created here so the cell does
# not depend on the collection-creation cell having been run in this kernel.
client = MilvusClient(uri="http://192.168.200.130:19530")

def emb_text(texts):
    """Embed a batch of texts with the ``chuxin-llm/Chuxin-Embedding`` model.

    Args:
        texts: List of strings to embed.

    Returns:
        Whatever ``FlagModel.encode_queries`` returns for ``texts`` — presumably
        a 2-D NumPy array of shape ``(len(texts), 1024)``; the caller reads
        ``.shape`` on it. TODO confirm against the installed FlagEmbedding.
    """
    # Building the FlagModel on every call is wasted work, so the instance is
    # kept on the function object and reused by subsequent calls.
    # (An earlier experiment used SentenceTransformer with
    # 'maidalun1020/bce-embedding-base_v1' instead.)
    embedding_model = getattr(emb_text, "_model", None)
    if embedding_model is None:
        embedding_model = FlagModel('chuxin-llm/Chuxin-Embedding', use_fp16=True)
        emb_text._model = embedding_model

    # NOTE(review): documents are embedded through encode_queries here, the same
    # entry point the search cell uses for questions — keep both sides identical.
    embedding_vectors = embedding_model.encode_queries(texts, batch_size=256)

    return embedding_vectors

# 文档列表
# docs = [
#     "Artificial intelligence was founded as an academic discipline in 1956.",
#     "Alan Turing was the first person to conduct substantial research in AI.",
#     "Born in Maida Vale, London, Turing was raised in southern England.",
# ]
# doc = "矿小助是一款由FlyingStudio（中国矿业大学翔工作室）开发维护的校园软件。提供课表、考试、成绩、校车校历、图书馆藏、校卡余额、宿舍电量等查询功能；同时具有课表导入日历、加权成绩计算、校园网自动登录、个性化主题背景等实用功能。"

# Input dump to ingest. Alternative inputs that were previously toggled here:
#   '原始数据/articles.json', '原始数据/kxz.json', '原始数据/矿大新闻网.json'
filename = '原始数据/news.json'

# Only the read needs the file handle; everything else works on the parsed data.
with open(filename, 'r', encoding='utf-8') as f:
    python_data = json.load(f)

# Each entry under "data" is read below for its "content", "title", "url" and
# "date" keys.
records = python_data['data']
contents = [record.get("content") for record in records]
titles = [record.get("title") for record in records]

vectors = emb_text(contents)

# Vectors are paired with records by position, so a count mismatch would attach
# embeddings to the wrong documents. Fail loudly before anything is inserted.
assert len(vectors) == len(records), (
    f"expected {len(records)} vectors, got {len(vectors)}"
)

# Rows to insert. No "id" key is supplied: the collection's primary key is
# declared with auto_id=True, so Milvus generates it.
datas = [
    {
        "url": record.get("url"),
        "content": record.get("content"),
        "content_vector": vector,
        # "YYYY-MM-DD" string -> integer YYYYMMDD (e.g. "2024-08-31" -> 20240831)
        "publish_time": int(
            datetime.strptime(record.get("date"), "%Y-%m-%d").strftime("%Y%m%d")
        ),
    }
    for record, vector in zip(records, vectors)
]

# Report the shape of the embedding matrix.
print("向量维度: ", vectors.shape)

# Insert all rows in a single request and show the server's response.
res = client.insert(collection_name="cumt_gpt_chuxin", data=datas)
print(res)


Inference Embeddings: 100%|██████████| 3/3 [06:31<00:00, 130.59s/it]


向量数量:  680
向量维度:  (680, 1024)
{'insert_count': 680, 'ids': [453289129978532540, 453289129978532541, 453289129978532542, 453289129978532543, 453289129978532544, 453289129978532545, 453289129978532546, 453289129978532547, 453289129978532548, 453289129978532549, 453289129978532550, 453289129978532551, 453289129978532552, 453289129978532553, 453289129978532554, 453289129978532555, 453289129978532556, 453289129978532557, 453289129978532558, 453289129978532559, 453289129978532560, 453289129978532561, 453289129978532562, 453289129978532563, 453289129978532564, 453289129978532565, 453289129978532566, 453289129978532567, 453289129978532568, 453289129978532569, 453289129978532570, 453289129978532571, 453289129978532572, 453289129978532573, 453289129978532574, 453289129978532575, 453289129978532576, 453289129978532577, 453289129978532578, 453289129978532579, 453289129978532580, 453289129978532581, 453289129978532582, 453289129978532583, 453289129978532584, 453289129978532585, 453289129978532586, 

In [13]:
# 测试向量搜索单元
# from langchain_community.embeddings import OllamaEmbeddings
# from sentence_transformers import SentenceTransformer
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient
from FlagEmbedding import FlagModel

# Milvus client handle for the search test cell; re-created here so the cell
# can be run without the earlier cells having been executed in this kernel.
client = MilvusClient(uri="http://192.168.200.130:19530")

def emb_text(texts):
    """Embed a batch of query texts with ``chuxin-llm/Chuxin-Embedding``.

    Args:
        texts: List of strings to embed.

    Returns:
        Whatever ``FlagModel.encode_queries`` returns for ``texts`` — presumably
        a 2-D NumPy array with one 1024-dim row per input text. TODO confirm
        against the installed FlagEmbedding version.
    """
    # Building the FlagModel on every call is wasted work, so the instance is
    # kept on the function object and reused by subsequent calls.
    # (An earlier experiment used SentenceTransformer with
    # 'maidalun1020/bce-embedding-base_v1' and batch_size=1024 instead.)
    embedding_model = getattr(emb_text, "_model", None)
    if embedding_model is None:
        embedding_model = FlagModel('chuxin-llm/Chuxin-Embedding', use_fp16=True)
        emb_text._model = embedding_model

    embedding_vectors = embedding_model.encode_queries(texts, batch_size=256)

    return embedding_vectors

# Test question to look up in the collection (a one-element batch).
doc = ["孙杨什么时候来矿大"]
doc_vector = emb_text(doc)

# Parameters left over from the earlier IVF_FLAT experiment (which searched the
# "cumt_gpt_qwen" collection). The HNSW search below does not pass this dict.
search_params = {"metric_type": "COSINE", "params": {}}

# HNSW search settings: "ef" and "radius" are forwarded as search params.
# NOTE(review): with COSINE, radius=0.3 presumably acts as a minimum-similarity
# cut-off — confirm against the Milvus range-search documentation.
hnsw_search_params = {
    "metric_type": "COSINE",
    "params": {"ef": 150, "radius": 0.3},
}

# Entity fields returned alongside each hit.
returned_fields = ["id", "url", "content", "publish_time"]

res = client.search(
    collection_name="cumt_gpt_chuxin",
    data=doc_vector,
    search_params=hnsw_search_params,
    limit=10,  # maximum number of hits to return
    output_fields=returned_fields,
    consistency_level="Bounded",
)

print(res)

data: ['[{\'id\': 453289129978532544, \'distance\': 0.43711838126182556, \'entity\': {\'id\': 453289129978532544, \'url\': \'https://youth.cumt.edu.cn/../info/1010/5011.htm\', \'content\': \'迎新特辑 | 不偏不倚，就在等你！\\n作者：\\n骄阳似火，热情如潮。8月31日开始，中国矿业大学迎来了2024级新生，开学季的迎新与学生返校工作在热浪中有序进行。来自全国各地的学子们汇聚于此，他们怀揣着梦想与希望，踏入了矿大的校园，开启了他们与矿大共同成长的新篇章。\\n\\n准备进行时\\n\\n\\n金风送爽，\\n精心策划的迎新小站，\\n满载心意的欢迎礼包，\\n身着马甲、热情洋溢的志愿者们，\\n全校同心，\\n只为共赴这场秋天的相逢。\\n\\n\\n\\n\\n接站进行时\\n\\n\\n满怀期待，\\n步入学府殿堂，\\n热情迎接和细致引导，\\n为你点亮前行之路；\\n\\n\\n\\n\\n艳阳高照，\\n各处报道点蓄势待发，\\n热情迎接，\\n温暖初到校园的你；\\n\\n\\n笑容明媚，\\n洋溢着青春的活力，\\n热情答疑，\\n指引着未来的方向，\\n我们诚挚地邀请你，\\n加入矿大这个温暖的大家庭。\\n\\n\\n\\n\\n迎新进行时\\n\\n\\n\\n\\n排列井然，\\n布置雅致，\\n满目皆是温馨与期待，\\n各学院迎新站点熠熠生辉，\\n流淌着满满的热情与关怀，\\n邀你共赴这场与矿大的双向奔赴，\\n开启属于你们的璀璨篇章。\\n\\n打卡进行时\\n\\n\\n捕捉微笑瞬间，\\n定格坚定目光，\\n满心欢喜地在矿大校园穿梭，\\n沉浸在这份专为你准备的仪式感之中，\\n让每一刻都成为难忘的记忆。\\n\\n\\n\\n\\n\\n\\n对话矿大新生代\\n\\n金秋送爽时，\\n2024级新学子们怀揣着梦想，\\n掀开了与矿大相遇的序幕，\\n丰富多彩的大学生活画卷正缓缓铺展，\\n心中满是对未来的无限期待与想象，\\n在这特别的迎新之日，\\n让我们一同聆听他们的心声吧!\\n\\n“矿大校园很大也很美，宿舍环境也很好，希望在大学的