# LLM generate Test

In [None]:
from sparkai.llm.llm import ChatSparkLLM, ChunkPrintHandler
from sparkai.core.messages import ChatMessage

# WebSocket URL of the iFlytek Spark Max model; for the URLs of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_URL = 'wss://spark-api.xf-yun.com/v3.5/chat'
# Spark API credentials; look them up in the iFlytek Open Platform console (https://console.xfyun.cn/services/bm35)
# They are empty placeholders here — fill in your own values before running.
SPARKAI_APP_ID = ''
SPARKAI_API_SECRET = ''
SPARKAI_API_KEY = ''
# `domain` value of the Spark Max model; for the domain values of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_DOMAIN = 'generalv3.5'

# Smoke test: send one hard-coded user question straight to the LLM (no retrieval).
if __name__ == '__main__':
    spark = ChatSparkLLM(
        spark_api_url=SPARKAI_URL,
        spark_app_id=SPARKAI_APP_ID,
        spark_api_key=SPARKAI_API_KEY,
        spark_api_secret=SPARKAI_API_SECRET,
        spark_llm_domain=SPARKAI_DOMAIN,
        streaming=False,
    )
    # A conversation consisting of a single user message.
    messages = [ChatMessage(
        role="user",
        content='我最近一直流鼻涕，打喷嚏，头还痛，浑身乏力，这是怎么回事，怎么办啊？'
    )]
    handler = ChunkPrintHandler()
    # `messages` is wrapped in another list — presumably generate() takes a batch of conversations.
    a = spark.generate([messages], callbacks=[handler])
    # Print the text of the first generation returned for the first conversation.
    print(a.generations[0][0].text)

# RAG based on es search

## text chunk

In [None]:
import csv

# Load the knowledge base: every CSV row becomes one dict keyed by the header row.
csv_file = './data.csv'
documents = []

# newline='' hands newline handling to the csv module, so quoted fields that
# contain line breaks are parsed correctly (as the csv module docs require).
with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
    csv_reader = csv.DictReader(file)
    documents.extend(csv_reader)
print(documents)

## es

In [None]:
from elasticsearch import Elasticsearch,helpers

# Connect to the Elasticsearch node at localhost:9200 and index every entry of
# `documents` as its own document, using its list position as the document id.
es = Elasticsearch("http://localhost:9200")
index_name = "my_index"
for i, doc in enumerate(documents):
    # NOTE(review): newer elasticsearch-py releases prefer `document=` over `body=` for index() — confirm against the installed client version.
    rs = es.index(index=index_name, id=i, body=doc)
    # Echo the response returned for each indexing request.
    print(rs)


In [None]:
# Match query of the user's question against the `query` field of the index.
search_query = {"query": {"match": {"query": "我最近一直流鼻涕，打喷嚏，头还痛，浑身乏力，这是怎么回事，怎么办啊？"}}}

response = es.search(index=index_name, body=search_query)
print(response)
print("搜索结果：")
# Show the stored question/answer pair of every hit.
for hit in response['hits']['hits']:
    source = hit['_source']
    print(f"Query: {source['query']}")
    print(f"Answer: {source['answer']}\n")


## prompt combine

In [None]:
# One context string per hit: the stored question, a newline, then its answer.
docs = [
    '\n'.join((hit['_source']['query'], hit['_source']['answer']))
    for hit in response['hits']['hits']
]
print(docs)


# The user question is the text that was sent to the search.
query = search_query['query']['match']['query']
print(query)

In [None]:
# Build the LLM prompt: retrieved documents as context, followed by the question.
# The documents are joined into plain text first; interpolating the list itself
# would put its Python repr (brackets, quotes, escaped "\n") into the prompt.
docs_text = '\n'.join(docs)
prompt = f"""
请根据我下面的文档内容，回答我提出的问题：
文档内容：{docs_text}
问题：{query}
一律用中文回答
"""

print(prompt)

## LLM generate

In [None]:
from sparkai.llm.llm import ChatSparkLLM, ChunkPrintHandler
from sparkai.core.messages import ChatMessage

# WebSocket URL of the iFlytek Spark Max model; for the URLs of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_URL = 'wss://spark-api.xf-yun.com/v3.5/chat'
# Spark API credentials; look them up in the iFlytek Open Platform console (https://console.xfyun.cn/services/bm35)
# They are empty placeholders here — fill in your own values before running.


SPARKAI_APP_ID = ''
SPARKAI_API_SECRET = ''
SPARKAI_API_KEY = ''


# `domain` value of the Spark Max model; for the domain values of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_DOMAIN = 'generalv3.5'

# Send the global `prompt` to the LLM as a single user message.
if __name__ == '__main__':
    spark = ChatSparkLLM(
        spark_api_url=SPARKAI_URL,
        spark_app_id=SPARKAI_APP_ID,
        spark_api_key=SPARKAI_API_KEY,
        spark_api_secret=SPARKAI_API_SECRET,
        spark_llm_domain=SPARKAI_DOMAIN,
        streaming=False,
    )
    # A conversation consisting of a single user message.
    messages = [ChatMessage(
        role="user",
        content= prompt
    )]
    handler = ChunkPrintHandler()
    # `messages` is wrapped in another list — presumably generate() takes a batch of conversations.
    a = spark.generate([messages], callbacks=[handler])
    # Print the text of the first generation returned for the first conversation.
    print(a.generations[0][0].text)

# RAG based on text2vec & cos_sim

In [None]:
from text2vec import SentenceModel
from tqdm import tqdm
import numpy as np
# Sentence-embedding model used by the helpers below; the argument is a relative
# path (presumably a locally downloaded checkpoint directory — confirm).
model = SentenceModel('./text2vec-base-multilingual')

In [None]:
def sentense2vec(sentense):
    """Return the embedding the global `model` produces for `sentense`."""
    return model.encode(sentense)

def documents_vec(file_path):
    """Embed every data line of a text file with `sentense2vec`.

    The first line of the file is treated as a header and is not embedded.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A tuple ``(lines, head, documents)``: the raw data lines (line endings
        included), the header line, and a list with one embedding per data
        line, in file order.

    Raises:
        IndexError: If the file is empty, i.e. has no header line.
    """
    # Explicit UTF-8, matching how the CSV cell of this notebook opens the same
    # data file; the platform default encoding is not guaranteed to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    head = lines[0]
    lines = lines[1:]

    print('documents to vec')
    # tqdm only wraps the iteration to show a progress bar.
    documents = [sentense2vec(line) for line in tqdm(lines)]
    return lines,head,documents

# Embed the data file once. `original_docs` keeps the raw lines so that a
# position in `documents` can be mapped back to its text.
original_docs,head,documents = documents_vec("./data.csv")
print(head)
print(documents)

## docs generation based on cos_sim

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

def generate_docs(query,documents,top_k):
    """Return the raw documents whose embeddings are most similar to `query`.

    The query is embedded with the global `model`; similarity is the cosine
    similarity between the query embedding and each document embedding.

    Args:
        query: Query text.
        documents: Sequence of document embeddings, positioned like the
            entries of the global `original_docs`.
        top_k: Maximum number of documents to return.

    Returns:
        A list of entries of `original_docs`, most similar first. Empty when
        `documents` is empty.
    """
    if len(documents) == 0:
        return []

    query_vec = np.asarray(model.encode(query)).reshape(1, -1)
    # One row per document, so every similarity comes out of a single call
    # instead of one cosine_similarity() call per document.
    doc_matrix = np.asarray(documents).reshape(len(documents), -1)
    scores = cosine_similarity(query_vec, doc_matrix)[0]

    # Stable sort on the negated scores: highest similarity first, and equal
    # scores keep their document order.
    ranked = np.argsort(-scores, kind='stable')[:top_k]

    # Debug output: the selected {index: similarity} pairs, best first.
    print(np.array([{int(i): scores[i]} for i in ranked]))
    return [original_docs[i] for i in ranked]

## prompt combine

In [None]:
query = '我最近一直流鼻涕，打喷嚏，头还痛，浑身乏力，这是怎么回事，怎么办啊？'
# Retrieve the 3 most similar raw lines of the data file.
docs = generate_docs(query,documents,3)

for doc in docs:
    print(doc)

# Join the retrieved lines into plain text (surrounding whitespace removed);
# interpolating the list itself would put its Python repr (brackets, quotes,
# escaped "\n") into the prompt.
docs_text = '\n'.join(doc.strip() for doc in docs)
prompt = f"""
请根据我下面的文档内容，回答我提出的问题：
文档内容：{docs_text}
问题：{query}
一律用中文回答
"""

## LLM generate

In [None]:
from sparkai.llm.llm import ChatSparkLLM, ChunkPrintHandler
from sparkai.core.messages import ChatMessage

# WebSocket URL of the iFlytek Spark Max model; for the URLs of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_URL = 'wss://spark-api.xf-yun.com/v3.5/chat'
# Spark API credentials; look them up in the iFlytek Open Platform console (https://console.xfyun.cn/services/bm35)
# They are empty placeholders here — fill in your own values before running.
SPARKAI_APP_ID = ''
SPARKAI_API_SECRET = ''
SPARKAI_API_KEY = ''
# `domain` value of the Spark Max model; for the domain values of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_DOMAIN = 'generalv3.5'

# Send the global `prompt` to the LLM as a single user message.
if __name__ == '__main__':
    spark = ChatSparkLLM(
        spark_api_url=SPARKAI_URL,
        spark_app_id=SPARKAI_APP_ID,
        spark_api_key=SPARKAI_API_KEY,
        spark_api_secret=SPARKAI_API_SECRET,
        spark_llm_domain=SPARKAI_DOMAIN,
        streaming=False,
    )
    # A conversation consisting of a single user message.
    messages = [ChatMessage(
        role="user",
        content=prompt
    )]
    handler = ChunkPrintHandler()
    # `messages` is wrapped in another list — presumably generate() takes a batch of conversations.
    a = spark.generate([messages], callbacks=[handler])
    # Print the text of the first generation returned for the first conversation.
    print(a.generations[0][0].text)

# RAG based on embedding

In [None]:
# Relative path handed to FlagModel (presumably a locally downloaded BGE checkpoint — confirm).
model_dir = './bge-large-zh-v1.5/'
from tqdm import tqdm
from FlagEmbedding import FlagModel
import numpy as np
# NOTE: this rebinds the notebook-global `model` that the text2vec section
# assigned earlier; helpers that read `model` will use this one from here on.
# The instruction string means roughly: "Generate a representation of this
# sentence for retrieving related articles:".
model = FlagModel(model_dir, 
                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
# similarity = embeddings_1 @ embeddings_2.T
# print(similarity)

# for s2p(short query to long passage) retrieval task, suggest to use encode_queries() which will automatically add the instruction to each query
# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction
# queries = ['query_1', 'query_2']
# passages = ["样例文档-1", "样例文档-2"]
# q_embeddings = model.encode_queries(queries)
# p_embeddings = model.encode(passages)
# scores = q_embeddings @ p_embeddings.T

In [None]:

def embedding_sentence(sentense):
    """Encode `sentense` with the global `model` and return the embedding."""
    return model.encode(sentense)
def documents_embedding(file_path):
    """Embed every data line of a text file with `embedding_sentence`.

    The first line of the file is treated as a header and is not embedded.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A tuple ``(vec_documents, lines, head)``: a NumPy array built from the
        per-line embeddings (in file order), the raw data lines (line endings
        included), and the header line.

    Raises:
        IndexError: If the file is empty, i.e. has no header line.
    """
    # Explicit UTF-8, matching how the CSV cell of this notebook opens the same
    # data file; the platform default encoding is not guaranteed to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    head = lines[0]
    lines = lines[1:]
    # tqdm only wraps the iteration to show a progress bar.
    vec_documents = [embedding_sentence(line) for line in tqdm(lines)]
    return np.array(vec_documents),lines,head

# Embed the data file once. `original_docs` keeps the raw lines so that a row
# of `vec_documents` can be mapped back to its text.
vec_documents,original_docs,head = documents_embedding('./data.csv')
# Shape of a single document embedding.
print(vec_documents[0].shape)
print(vec_documents)

## load faiss

In [None]:
import faiss

# Take the vector size from the embeddings instead of hard-coding it, so the
# index still matches if a model with another output size is used.
dimension = vec_documents.shape[1]
# Flat (exhaustive) index using L2 distance.
db = faiss.IndexFlatL2(dimension)
# faiss works on C-contiguous float32 matrices; convert explicitly in case the
# embeddings come back in another dtype (the model is loaded with use_fp16=True,
# so they may be half precision — confirm).
db.add(np.ascontiguousarray(vec_documents, dtype=np.float32))


## retrieve

In [None]:
def docs_generate(query,documents,top_k):
    """Return the entries of `documents` that are nearest to `query`.

    The query is encoded with the global `model` (`encode_queries`) and looked
    up in the global faiss index `db`.

    Args:
        query: Query text.
        documents: Sequence of raw documents, positioned like the vectors that
            were added to `db`.
        top_k: Number of neighbours requested from the index.

    Returns:
        The matching documents in the order the index returns them. The list
        is shorter than `top_k` when the index cannot supply that many
        neighbours.
    """
    # faiss expects a float32 matrix with one row per query.
    query_vector = np.asarray(
        model.encode_queries(query), dtype=np.float32
    ).reshape(1,-1)
    distances, indices = db.search(query_vector, top_k)
    print(f"{indices[0]}: {distances[0]}")
    # faiss fills missing neighbours with -1; without the filter documents[-1]
    # would silently return the last document.
    return [documents[index] for index in indices[0] if index >= 0]

## prompt combine

In [None]:
top_k = 2
query = '我最近一直流鼻涕，打喷嚏，头还痛，浑身乏力，这是怎么回事，怎么办啊？'
# Retrieve the raw lines of the data file nearest to the question.
docs = docs_generate(query,original_docs,top_k)
print()

for doc in docs:
    print(doc,end='')

# Join the retrieved lines into plain text (surrounding whitespace removed);
# interpolating the list itself would put its Python repr (brackets, quotes,
# escaped "\n") into the prompt.
docs_text = '\n'.join(doc.strip() for doc in docs)
prompt = f"""
请根据我下面的文档内容，回答我提出的问题：
文档内容：{docs_text}
问题：{query}
一律用中文回答
"""

## LLM generate

In [None]:
from sparkai.llm.llm import ChatSparkLLM, ChunkPrintHandler
from sparkai.core.messages import ChatMessage

# WebSocket URL of the iFlytek Spark Max model; for the URLs of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_URL = 'wss://spark-api.xf-yun.com/v3.5/chat'
# Spark API credentials; look them up in the iFlytek Open Platform console (https://console.xfyun.cn/services/bm35)
# They are empty placeholders here — fill in your own values before running.
SPARKAI_APP_ID = ''
SPARKAI_API_SECRET = ''
SPARKAI_API_KEY = ''
# `domain` value of the Spark Max model; for the domain values of other model versions see the docs (https://www.xfyun.cn/doc/spark/Web.html)
SPARKAI_DOMAIN = 'generalv3.5'

# Send the global `prompt` to the LLM as a single user message.
if __name__ == '__main__':
    spark = ChatSparkLLM(
        spark_api_url=SPARKAI_URL,
        spark_app_id=SPARKAI_APP_ID,
        spark_api_key=SPARKAI_API_KEY,
        spark_api_secret=SPARKAI_API_SECRET,
        spark_llm_domain=SPARKAI_DOMAIN,
        streaming=False,
    )
    # A conversation consisting of a single user message.
    messages = [ChatMessage(
        role="user",
        content=prompt
    )]
    handler = ChunkPrintHandler()
    # `messages` is wrapped in another list — presumably generate() takes a batch of conversations.
    a = spark.generate([messages], callbacks=[handler])
    # Print the text of the first generation returned for the first conversation.
    print(a.generations[0][0].text)