In [1]:
from attr.validators import instance_of
#创建集合单元
from sentence_transformers import SentenceTransformer
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient

# Milvus client used by create_collection() and the index listing in this cell.
# NOTE(review): the server URI is hard-coded; consider moving it to a config constant.
client = MilvusClient(uri="http://192.168.200.130:19530")

def create_collection(collection_name="cumt_gpt_qwen", dim=3584):
    """(Re)create a Milvus collection and request an HNSW index on its vector field.

    WARNING: if a collection called ``collection_name`` already exists it is
    dropped first, so all rows previously stored in it are lost.

    Args:
        collection_name: Name of the collection to (re)create. Defaults to
            ``"cumt_gpt_qwen"``, the name used by the rest of this notebook.
        dim: Dimension of the ``content_vector`` field. The default (3584)
            matches the vector length printed by the insert cell of this
            notebook; change it together with the embedding model.

    Returns:
        None. The function works purely through side effects on the
        module-level ``client``.
    """
    # Field schemas. "id" is the auto-generated INT64 primary key, so inserted
    # rows do not supply it.
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, auto_id=True, is_primary=True),
        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        # publish_time is a plain INT64; the insert cell stores dates as YYYYMMDD.
        FieldSchema(name="publish_time", dtype=DataType.INT64),
    ]
    schema = CollectionSchema(fields=fields, description="CUMT_GPT的Qwen数据集")

    # Start from a clean slate: drop any existing collection with this name.
    if client.has_collection(collection_name):
        client.drop_collection(collection_name)

    client.create_collection(collection_name=collection_name, schema=schema)

    # HNSW index with cosine similarity on the vector field.
    # (An IVF_FLAT index with nlist=128 was the earlier alternative.)
    index_params = client.prepare_index_params()
    index_params.add_index(
        field_name="content_vector",
        index_type="HNSW",
        metric_type="COSINE",
        params={"M": 64, "efConstruction": 250},
    )

    # sync=False is passed through unchanged; presumably the call then returns
    # without waiting for the index build to finish -- confirm in pymilvus docs.
    client.create_index(
        collection_name=collection_name,
        index_params=index_params,
        sync=False,
    )


# Rebuild the collection; any existing "cumt_gpt_qwen" collection is dropped first.
create_collection()
# Last expression of the cell: shows the collection's indexes as the cell output.
client.list_indexes(collection_name="cumt_gpt_qwen")

  from tqdm.autonotebook import tqdm, trange


['content_vector']

In [8]:
# 插入数据单元
import json
from datetime import datetime
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient
from langchain_community.embeddings import OllamaEmbeddings
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
# from sonyflake import SonyFlake

# Milvus client used by the insert at the end of this cell.
# NOTE(review): the server URI is hard-coded and this client is re-created in several cells.
client = MilvusClient(uri="http://192.168.200.130:19530")

def emb_text(text):
    """Embed one string or an iterable of strings with the Ollama ``qwen2.5:7b`` model.

    A lone ``str`` is treated as a one-element batch. Every text is embedded
    with its own ``embed_query`` call and the vectors are returned as a list
    in input order (so a single string yields a list holding one vector).
    """
    batch = [text] if isinstance(text, str) else text
    embedder = OllamaEmbeddings(model="qwen2.5:7b")
    # Earlier experiments (a SentenceTransformer model, L2 normalisation of the
    # vectors) are intentionally not applied here.
    return [embedder.embed_query(item) for item in batch]

# 文档列表
# docs = [
#     "Artificial intelligence was founded as an academic discipline in 1956.",
#     "Alan Turing was the first person to conduct substantial research in AI.",
#     "Born in Maida Vale, London, Turing was raised in southern England.",
# ]
# doc = "矿小助是一款由FlyingStudio（中国矿业大学翔工作室）开发维护的校园软件。提供课表、考试、成绩、校车校历、图书馆藏、校卡余额、宿舍电量等查询功能；同时具有课表导入日历、加权成绩计算、校园网自动登录、个性化主题背景等实用功能。"

# Load the scraped articles. Every record under "data" is read for its
# "url", "content" and "date" keys below.
# Other dumps used previously: '原始数据/articles.json', '原始数据/kxz.json', '原始数据/news.json'
filename = '原始数据/矿大新闻网.json'
with open(filename, 'r', encoding='utf-8') as f:
    python_data = json.load(f)

records = python_data['data']
contents = [record.get("content") for record in records]

# One embedding per article, in the same order as `records`.
vectors = emb_text(contents)

# Rows to insert. No "id" is supplied: the collection created in this notebook
# declares its primary key with auto_id=True.
datas = []
for record, vector in zip(records, vectors):
    # "date" is parsed as "YYYY-MM-DD" and stored as the integer YYYYMMDD.
    date_obj = datetime.strptime(record.get("date"), "%Y-%m-%d")
    date_int = int(date_obj.strftime("%Y%m%d"))

    datas.append({
        "url": record.get("url"),
        "content": record.get("content"),
        "content_vector": vector,
        "publish_time": date_int,
    })

# Report how many vectors were produced and their dimension
# (0 when the input file contained no records, instead of an IndexError).
print("向量数量: ", len(vectors))
print("向量维度: ", len(vectors[0]) if vectors else 0)

# Insert the rows and show the server's response.
res = client.insert(collection_name="cumt_gpt_qwen", data=datas)
print(res)


向量数量:  14
向量维度:  3584
{'insert_count': 14, 'ids': [453316957169519715, 453316957169519716, 453316957169519717, 453316957169519718, 453316957169519719, 453316957169519720, 453316957169519721, 453316957169519722, 453316957169519723, 453316957169519724, 453316957169519725, 453316957169519726, 453316957169519727, 453316957169519728]}


In [12]:
# 测试向量搜索单元
from langchain_community.embeddings import OllamaEmbeddings
from sentence_transformers import SentenceTransformer
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient
from sklearn.preprocessing import normalize

# Milvus client used by the search at the end of this cell.
# NOTE(review): the server URI is hard-coded and this client is re-created in several cells.
client = MilvusClient(uri="http://192.168.200.130:19530")

def emb_text(text):
    """Embed one string or an iterable of strings with the Ollama ``qwen2.5:7b`` model.

    A lone ``str`` is treated as a one-element batch. Every text is embedded
    with its own ``embed_query`` call and the vectors are returned as a list
    in input order (so a single string yields a list holding one vector).
    """
    batch = [text] if isinstance(text, str) else text
    embedder = OllamaEmbeddings(model="qwen2.5:7b")
    # Earlier experiments (a SentenceTransformer model, L2 normalisation of the
    # vectors) are intentionally not applied here.
    return [embedder.embed_query(item) for item in batch]

# Query text for the similarity search.
doc = "孙杨"

# emb_text wraps a single string into a one-element batch, so this is a list
# holding one query vector -- the shape passed as `data` below.
doc_vector = emb_text(doc)

# Search parameters actually sent to Milvus (previously a separate, unused
# `search_params` dict with empty params was defined alongside the inline one).
# "ef" and "radius" are forwarded unchanged; presumably ef is the HNSW
# search-time candidate size and radius a bound on the similarity score --
# confirm against the Milvus documentation.
search_params = {
    "metric_type": "COSINE",
    "params": {"ef": 150, "radius": 0},
}

res = client.search(
    collection_name="cumt_gpt_qwen",
    data=doc_vector,
    search_params=search_params,
    limit=10,  # maximum number of hits to return
    output_fields=["id", "url", "content", "publish_time"],
    consistency_level="Bounded",
)

print(res)

data: ["[{'id': 453316957169518620, 'distance': 0.5855851173400879, 'entity': {'publish_time': 20151102, 'id': 453316957169518620, 'url': 'https://youth.cumt.edu.cn/info/1010/1381.htm', 'content': '我校计算机学院学子荣获2015中国大学生程序设计竞赛铜奖\\n作者：校团委\\n10月18日，2015年中国大学生程序设计竞赛在南阳理工学院举行，计算机学院由信安12-1班李嘉鑫、计科13-1班李梦天、计科13-2班黄锦静等三名同学组成的“CUMTCS”队参加比赛，经过激烈角逐获得铜奖。\\n为迎接本次大赛，计算机学院的杨文嘉老师、毕方明老师在暑期开展了程序设计集训工作，并在学期初进行了校内选拔。本次比赛共吸引了来自清华大学、北京大学、复旦大学、上海交通大学、浙江大学、中山大学等130多所学校的240多支队伍参赛。\\n中国大学生程序设计竞赛（China?Collegiate?Programming?Contest,简称CCPC）是由中国大学生程序设计竞赛协会主办的面向世界大学生的年度赛事，旨在激励当代大学生运用计算机编程技术和技术来解决实际问题，激发其学习算法和程序设计的兴趣，培养其团队合作意识、创新能力和挑战精神。比赛以3人组队形式参赛，每队只能使用1台电脑，在5个小时内用C、C++、Java、Python和Scala中的任意一种程序语言编写程序，解决10余个问题。竞赛全部英语命题，涉及知识面广泛，包括数据结构、图论、动态规划、计算几何、搜索等。考验了选手的问题分析能力，算法实现能力，限时和高压力下编写程序的能力，以及团队协作精神。\\n\\n\\n我校代表队'}}, {'id': 453316957169518479, 'distance': 0.5710771083831787, 'entity': {'publish_time': 20170513, 'id': 453316957169518479, 'url': 'https://youth.cumt.edu.cn/info/1010/1247.htm', 'content': '我校战队在徐州市爱国卫生知识竞赛中