## 1. connect to Milvus

In [2]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

In [3]:
# Open the connection to the Milvus server at localhost:19530.
# NOTE(review): "default" is presumably the connection alias used by later calls — confirm.
connections.connect("default", host="localhost", port="19530")

In [4]:
# Ask the server whether a collection named "hello_rag_v1" exists and report the answer.
has = utility.has_collection("hello_rag_v1")
print(f"Does collection hello_rag_v1 exist in Milvus: {has}")

Does collection hello_rag_v1 exist in Milvus: True


## 2. create collection
We're going to create a collection with 3 fields.

|   | field name   | field type  | other attributes                             | field description           |
|---|--------------|-------------|----------------------------------------------|-----------------------------|
| 1 | "pk"         | VarChar     | is_primary=True auto_id=False max_length=100 | "primary field"             |
| 2 | "text"       | VarChar     | max_length=5000                              | "original text"             |
| 3 | "embeddings" | FloatVector | dim=384                                      | "float vector with dim 384" |


In [18]:
# Vector width of the "embeddings" field (384, the dimension noted for the embedding model).
dim = 384

# Three columns: a caller-supplied string key, the chunk text, and the chunk's vector.
fields = [
    FieldSchema(
        name="pk",
        dtype=DataType.VARCHAR,
        is_primary=True,
        auto_id=False,
        max_length=100,
    ),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        description="this is the original text field",
        max_length=5000,
    ),
    FieldSchema(
        name="embeddings",
        dtype=DataType.FLOAT_VECTOR,
        dim=dim,
    ),
]

schema = CollectionSchema(
    fields,
    description="hello_rag_v1 is the demo to introduce the usage of RAG",
)
hello_rag_v1 = Collection(
    "hello_rag_v1",
    schema,
    consistency_level="Strong",
)


In [6]:
# Get a handle to the "hello_rag_v1" collection by name only (no schema is passed here).
hello_rag_v1 = Collection("hello_rag_v1")

## 3. insert data
We are going to insert the text chunks extracted from the grammar book, together with their embeddings, into `hello_rag_v1`.
Data to be inserted must be organized in fields.

The insert() method returns:
- either automatically generated primary keys by Milvus if auto_id=True in the schema;
- or the existing primary key field from the entities if auto_id=False in the schema.

### 3.1 generate entities from grammar book

In [10]:
from sentence_transformers import SentenceTransformer
# Path of the sentence-transformer model directory on the server; the model is read from disk.
embedding_model_name = "/data/remote_dev/lin/all-MiniLM-L6-v2-sentence-transformer-model"  # dim = 384, loaded from local on the server GPU1
# Instantiate the encoder that the later cells use for both the chunks and the user query.
embedding_model = SentenceTransformer(embedding_model_name)

# sentences = ["This is an example sentence", "Each sentence is converted"]
# embeddings = embedding_model.encode(sentences)
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import os
from os.path import dirname, join
# Resolve the grammar-book PDF relative to the notebook's working directory.
current_dir = os.getcwd()
file_path = os.path.join(current_dir, "grammar_book/虚拟语气在从句中的用法.pdf")
print("Current directory:", current_dir)
print("File path:", file_path)

Current directory: /data/remote_dev/lin/rag
File path: /data/remote_dev/lin/rag/grammar_book/虚拟语气在从句中的用法.pdf


In [20]:
from nltk.tokenize import sent_tokenize
import json

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

def extract_text_from_pdf(filename, page_numbers=None, min_line_length=1):
    '''Extract text from a PDF file and regroup it into paragraphs.

    Args:
        filename: Path of the PDF handed to ``extract_pages``.
        page_numbers: Optional container of 0-based page indexes to keep;
            ``None`` keeps every page.
        min_line_length: Lines shorter than this many characters are not
            kept; they act as paragraph separators instead.

    Returns:
        A list of paragraph strings. Each paragraph is the concatenation of
        consecutive kept lines joined by a single space, and (as before)
        starts with one leading space. A line ending in ``-`` is treated as a
        word broken across lines: the trailing hyphen is dropped and the next
        line is glued on without a space.
    '''
    # Collect the raw text of every text container on the selected pages.
    pieces = []
    for page_index, page_layout in enumerate(extract_pages(filename)):
        # Skip pages outside the requested set, if one was given.
        if page_numbers is not None and page_index not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                pieces.append(element.get_text() + '\n')
    full_text = ''.join(pieces)

    # Regroup the lines into paragraphs; a too-short (e.g. empty) line closes
    # the paragraph being built.
    paragraphs = []
    buffer = ''
    continues_word = False  # True when the previous kept line ended with '-'
    for line in full_text.split('\n'):
        if len(line) >= min_line_length:
            # Separate lines with a space unless we are completing a word that
            # was hyphenated at the end of the previous line.
            if not continues_word:
                buffer += ' '
            if line.endswith('-'):
                # Drop only the trailing hyphen(s); leading ones (e.g. list
                # bullets) are part of the text and are preserved.
                buffer += line.rstrip('-')
                continues_word = True
            else:
                buffer += line
                continues_word = False
        else:
            if buffer:
                paragraphs.append(buffer)
                buffer = ''
            continues_word = False
    if buffer:
        paragraphs.append(buffer)
    return paragraphs

import os
from os.path import dirname, join
# Locate the source PDF under the current working directory.
current_dir = os.getcwd()
file_path = os.path.join(current_dir, "grammar_book/虚拟语气在从句中的用法.pdf")

# Lines shorter than 10 characters are not kept; they end the current paragraph.
paragraphs = extract_text_from_pdf(filename=file_path, min_line_length=10)

def split_text(paragraphs, chunk_size=300, overlap_size=100):
    '''Cut paragraphs into sentence-aligned chunks that overlap with their predecessor.

    Every paragraph is sentence-tokenized and the stripped sentences are
    packed greedily: a chunk starts with as many preceding sentences as fit
    within ``overlap_size`` characters, then the current sentence, then
    following sentences while the running length stays within ``chunk_size``.

    Args:
        paragraphs: Iterable of paragraph strings.
        chunk_size: Character budget checked before appending each following sentence.
        overlap_size: Character budget for the sentences repeated from before the chunk.

    Returns:
        List of chunk strings, sentences joined by single spaces.
    '''
    sentences = []
    for paragraph in paragraphs:
        for sentence in sent_tokenize(paragraph):
            sentences.append(sentence.strip())

    total = len(sentences)
    chunks = []
    start = 0
    while start < total:
        # Walk backwards to gather the overlap that precedes this chunk.
        overlap = ''
        back = start - 1
        while back >= 0 and len(sentences[back]) + len(overlap) <= overlap_size:
            overlap = f"{sentences[back]} {overlap}"
            back -= 1

        # Walk forwards, extending the chunk while the next sentence still fits.
        chunk = overlap + sentences[start]
        cursor = start + 1
        while cursor < total and len(sentences[cursor]) + len(chunk) <= chunk_size:
            chunk = f"{chunk} {sentences[cursor]}"
            cursor += 1

        chunks.append(chunk)
        start = cursor
    return chunks

# Chunk the paragraphs with a 300-character chunk budget and a 100-character overlap budget.
chunks = split_text(paragraphs, chunk_size=300, overlap_size=100)

print(chunks)
print(len(chunks))

["虚拟语⽓在从句中的⽤法.md 2024-01-31 虚拟语⽓在从句中的⽤法 1. 主语从句中的⽤法 (1) 在以it为形式主语的复合句中，虚拟语⽓在主语从句中表示建议、要求、命令等，谓语动词⽤should be型或be型虚拟式，在 美国英语中常⽤be型虚拟式。如： It's natural that she should do so. 她这么做是很⾃然的事。 It is essential that we should tell her the news. 我们有必要告诉她这个消息。", '她这么做是很⾃然的事。 It is essential that we should tell her the news. 我们有必要告诉她这个消息。 It is important that we should make full use of our mineral resources. 对我们来说，充分利⽤我国的矿产资源是重要的。 It is strange that the result of the experiment should be unsatisfactory. 奇怪的是实验结果竟然如此令⼈不满意。', 'It is strange that the result of the experiment should be unsatisfactory. 奇怪的是实验结果竟然如此令⼈不满意。 It is desired that we should get everything ready by tonight. 希望⼀切在今晚准备好。 It was arranged that Mr Sam should go and help Bettie. 他们作了安排，由萨姆先⽣去帮助⻉蒂。', '希望⼀切在今晚准备好。 It was arranged that Mr Sam should go and help Bettie. 他们作了安排，由萨姆先⽣去帮助⻉蒂。 (2) 主句的谓语为某些动词的被动语态，常⽤在It is (was) desired (suggested，settled，proposed， recommended， requested，decided，etc. ) that. . . 句型中。 It is suggested that the ques

In [21]:
# Encode every chunk with the sentence-transformer loaded earlier.
# NOTE(review): presumably yields one 384-dim vector per chunk — confirm against the model.
embeddings = embedding_model.encode(chunks)

In [None]:
# Print each chunk next to its embedding, with an empty line after every pair.
for chunk, embedding in zip(chunks, embeddings):
    print("chunk:", chunk)
    print("Embedding:", embedding)
    print("")

### 3.2 insert entities

In [24]:
# Column-oriented payload, listed in schema field order: pk, text, embeddings.
entities = [
    list(map(str, range(len(embeddings)))),  # pk: "0", "1", ... supplied here because `auto_id` is False
    chunks,                                  # text: the chunk strings
    embeddings,                              # embeddings: numpy.ndarray and list are both supported
]

In [25]:
# Send the three columns to the collection; the returned result object is kept for inspection.
insert_result = hello_rag_v1.insert(entities)

In [None]:
# Flush before counting: rows that were just inserted may not be included in
# `num_entities` until they have been flushed, so an un-flushed read can
# under-report the collection size.
hello_rag_v1.flush()
print(f"Number of entities in Milvus: {hello_rag_v1.num_entities}")  # check the num_entities

### 3.3 Index

We are going to create a GPU_IVF_FLAT index (the GPU variant of IVF_FLAT) on the `embeddings` field of the hello_rag_v1 collection.

create_index() can only be applied to `FloatVector` and `BinaryVector` fields.

In [7]:
# Build-time settings for the vector index: GPU_IVF_FLAT, COSINE metric, nlist=128.
index = dict(
    index_type="GPU_IVF_FLAT",
    metric_type="COSINE",
    params=dict(nlist=128),
)

# Left as the cell's last expression so the returned status is displayed.
hello_rag_v1.create_index("embeddings", index)

Status(code=0, message=)

## 4. 基于向量检索的RAG

In [8]:
# Load the collection before running the search in the following cells.
hello_rag_v1.load()

In [9]:
# Search-time settings for the IVF index on "embeddings".
# `nlist` is an index-build parameter (already set when the index was created);
# the knob that applies at search time is `nprobe`, the number of clusters
# probed per query. Passing `nlist` here did not configure the search at all.
search_params = {
    "metric_type": "COSINE",   # same metric the index was built with
    "offset": 0,               # no hits skipped
    "ignore_growing": False,   # growing segments are searched too
    "params": {"nprobe": 10},
}


In [11]:
# The question to answer, embedded with the same model that produced the chunk embeddings.
user_query = "虚拟语气应该如何在从句中使用？"
# encode() is given a one-element list, so the search receives a batch containing a single query.
vectors_to_search = embedding_model.encode([user_query])

In [18]:

# Retrieve the 3 closest chunks for the query vector and return their stored "text" field.
result = hello_rag_v1.search(
    vectors_to_search,
    "embeddings",
    search_params,
    limit=3,
    expr=None,
    output_fields=["text"],
    partition_names=None,
)


In [20]:
# Primary keys of the hits returned for the first (and only) query vector.
result[0].ids

['3', '17', '13']

In [21]:
# Scores of those hits, in the same order as the ids (COSINE was requested in search_params).
result[0].distances

[0.43105536699295044, 0.4235227704048157, 0.4143679738044739]

In [27]:
# Stored text of the first hit — this is the chunk later used as context for the LLM.
result[0][0].entity.get('text')

'希望⼀切在今晚准备好。 It was arranged that Mr Sam should go and help Bettie. 他们作了安排，由萨姆先⽣去帮助⻉蒂。 (2) 主句的谓语为某些动词的被动语态，常⽤在It is (was) desired (suggested，settled，proposed， recommended， requested，decided，etc. ) that. . . 句型中。 It is suggested that the question should be discussed at the next meeting.'

In [29]:
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

# Chat model client pointed at a custom endpoint on the local network.
# NOTE(review): "EMPTY" looks like a placeholder key for a server that does not check it — confirm.
base_url = "http://192.168.100.252:8000/v1/"
llm = ChatOpenAI(
    temperature=0,
    api_key="EMPTY",
    base_url=base_url,
)

In [30]:
# Prompt for the retrieval-augmented run. It has two placeholders:
#   {INFO}  - the retrieved context the model must base its answer on
#   {QUERY} - the user's question
# The (Chinese) instructions tell the model to answer only from the given
# information, to reply with a fixed refusal sentence when that information is
# insufficient, and to answer in Chinese.
rag_template = PromptTemplate(
    input_variables=["INFO", "QUERY"],
    template="""
你是一个问答机器人。
你的任务是根据下述给定的已知信息回答用户问题。
确保你的回复完全依据下述已知信息。不要编造答案。
如果下述已知信息不足以回答用户的问题，请直接回复"我无法回答您的问题"。

已知信息:
{INFO}

用户问：
{QUERY}

请用中文回答用户问题。
""",
)

In [31]:
# Wire the retrieval-augmented prompt to the chat model.
rag_chain = LLMChain(prompt=rag_template, llm=llm)

In [32]:
user_input = "虚拟语气应该如何在从句中使用？"
# Only the first search hit is passed as context, although the search asked for three.
output = rag_chain.invoke(
    {
        "QUERY": user_input,
        "INFO": result[0][0].entity.get('text'),
    }
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
# Show everything the chain returned: the inputs plus the generated answer.
print(output)

{'QUERY': '虚拟语气应该如何在从句中使用？', 'INFO': '希望⼀切在今晚准备好。 It was arranged that Mr Sam should go and help Bettie. 他们作了安排，由萨姆先⽣去帮助⻉蒂。 (2) 主句的谓语为某些动词的被动语态，常⽤在It is (was) desired (suggested，settled，proposed， recommended， requested，decided，etc. ) that. . . 句型中。 It is suggested that the question should be discussed at the next meeting.', 'text': '虚拟语气在从句中的使用方式如下：\n1. 在从句中，如果主句是虚拟语气，那么从句通常会采用一种表示建议、要求或命令的语气。例如：It is suggested that we should visit Japan next year.（我们建议明年去日本。）\n2. 如果主句不是虚拟语气，那么从句可能会采用一种表示事实或事实的建议语气。例如：I think it is time for us to move out.（我认为我们应该搬走了。）\n3. 在从句中，虚拟语气常常使用should + 动词原形的结构。例如：You should always follow the rules.（你应该始终遵守规则。）'}


In [34]:
# Confirm the container type of the chain's return value.
print(type(output))

<class 'dict'>


In [35]:
# Print only the generated answer, stored under the 'text' key.
print(output['text'])

虚拟语气在从句中的使用方式如下：
1. 在从句中，如果主句是虚拟语气，那么从句通常会采用一种表示建议、要求或命令的语气。例如：It is suggested that we should visit Japan next year.（我们建议明年去日本。）
2. 如果主句不是虚拟语气，那么从句可能会采用一种表示事实或事实的建议语气。例如：I think it is time for us to move out.（我认为我们应该搬走了。）
3. 在从句中，虚拟语气常常使用should + 动词原形的结构。例如：You should always follow the rules.（你应该始终遵守规则。）


In [38]:
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

# base_url = "http://192.168.100.252:8000/v1/"
# llm = ChatOpenAI(temperature=0, api_key="EMPTY", base_url=base_url)

# Baseline prompt without retrieval: the only placeholder is {QUERY}, so the
# model answers (in Chinese) with no supplied context.
simple_template = PromptTemplate(
    input_variables=["QUERY"],
    template="""
你是一个问答机器人。
你的任务是回答用户问题。

用户问：
{QUERY}

请用中文回答用户问题。
""",
)

# Wire the context-free prompt to the same chat model used by the RAG chain.
simple_chain = LLMChain(prompt=simple_template, llm=llm)

# Ask the same question as in the RAG run, this time with no retrieved context, and print the answer.
user_input = "虚拟语气应该如何在从句中使用？"
output = simple_chain.invoke({"QUERY": user_input})
print(output['text'])

虚拟语气是在句子中表达一种与事实相反或者假设的情况，通常用于表示说话者对某种情况的真实性、可能性或期望的语气。在从句中使用虚拟语气时，需要根据具体情况进行调整。

以下是一些在从句中使用虚拟语气的例子：

1. 如果条件成立，我就会接受这个邀请。（如果条件成立，这里表示一种假设情况）
2. 如果你觉得这个决定很明智，那么我就同意了。（如果你觉得这个决定很明智，这里表示一种依赖于他人意见的情况）
3. 她告诉我她已经准备好参加这场比赛了。（她告诉我她已经准备好参加这场比赛了，这里表示一种说话者对他人行动的期望或建议）

总之，在使用虚拟语气时，需要注意时态、情态动词和语气词的使用，以及根据具体情况调整语气。
