# 数据入库

## 连接 OpenSearch 并加载 LLM 模型

In [15]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from sagemaker.huggingface.model import HuggingFacePredictor
# import boto3
import os
from dotenv import load_dotenv

# Pull AOS_USER / AOS_PWD from a local .env file into the process environment.
load_dotenv()

host = 'search-aiml-bot-rzeucj66zz7ortkojghfgzh2c4.us-east-1.es.amazonaws.com'
port = 443
region = 'us-east-1' # e.g. us-west-1

# Alternative (disabled): sign requests with IAM credentials instead of
# using the domain's basic-auth user.
# credentials = boto3.Session().get_credentials()
# auth = AWSV4SignerAuth(credentials, region)

# Fail fast when the credentials are absent. Otherwise the client would be
# built with (None, None) and the problem would only surface later as an
# opaque authentication error on the first request.
_missing = [name for name in ("AOS_USER", "AOS_PWD") if not os.environ.get(name)]
if _missing:
    raise RuntimeError(
        "Missing OpenSearch credentials; set {} in the environment or .env file".format(
            ", ".join(_missing)
        )
    )
auth = (os.environ["AOS_USER"], os.environ["AOS_PWD"])

# HTTPS client for the OpenSearch domain, with certificate verification on.
client = OpenSearch(
    hosts = [f'{host}:{port}'],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)


# Handle to the SageMaker endpoint named 'llm-models'. The cells below call
# predict() on it with "type": 2 to get a vector for a text and with
# "type": 8 to get a generated answer for a prompt.
predictor = HuggingFacePredictor(endpoint_name='llm-models')

# Insert one document
def update_data(id, data):
    """Index ``data`` as a document in ``index_name`` and return the response.

    Args:
        id: Document id forwarded to the client as-is. The ingestion cells
            pass ``None``.
        data: Document body (a dict) to store.

    Returns:
        Whatever ``client.index`` returns for the request.
    """
    return client.index(index=index_name, id=id, body=data)


# Name of the OpenSearch index used by update_data(), search() and the
# create/delete index cells.
index_name = 'doc_embeddings'

## 创建索引

当前向量模型输出的向量长度为 768，因此索引中 `content_vec` 字段的维度必须与之一致，即 `"dimension": 768`。如果更换了向量模型，需要同步修改该值并重建索引。

In [None]:
# Index definition: k-NN enabled, with one vector field named "content_vec".
index_body = {
    "settings": {"index": {"knn": True}},
    "mappings": {
        "properties": {
            "content_vec": {
                "type": "knn_vector",
                "index": True,
                # Must equal the length of the vectors stored in this field.
                "dimension": 768,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {"ef_construction": 128, "m": 24},
                },
            },
        },
    },
}
# Create the index; `index` is passed by keyword, like the delete cell does.
client.indices.create(index=index_name, body=index_body)

In [None]:
# Delete the index. Destructive: run this cell only when the index should be
# dropped (for example before re-creating it with a different mapping).
client.indices.delete(index=index_name)

## 分拆文档并插入 AOS

### 数据插入 - 大学

读取文本文件，按行切分；对每个非空行调用模型生成向量，再连同原文和来源文件名一起写入 AOS（Amazon OpenSearch Service）。

In [None]:

file_name = './docs/985-uni.txt'

# Read every line up front and close the file right away, so the handle is
# not left open while the (slow) embedding and indexing calls run.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open(file_name, 'r', encoding='utf-8') as university_file:
    lines = university_file.readlines()

# One document per non-empty line: the text, its source file and its vector.
for line in lines:
    line = line.strip()
    if not line:
        continue
    print("Inserting: {}...".format(line[:20]))
    update_data(None, {
        "text": line,
        "origion": file_name,
        "content_vec": predictor.predict({"text": line,"type": 2}),
    })

print("Done")

### 数据插入：解决方案

In [None]:
import os

sol_dir = './docs_solutions/'

# Sorted so that repeated runs ingest the files in the same order.
file_list = sorted(os.listdir(sol_dir))
for file_path in file_list:
    file_name = sol_dir + file_path
    # os.listdir() also returns sub-directories and hidden entries (for
    # example Jupyter's .ipynb_checkpoints); opening those would crash or
    # ingest junk, so only regular, non-hidden files are processed.
    if file_path.startswith('.') or not os.path.isfile(file_name):
        continue

    # Close each file as soon as its lines are read.
    # NOTE(review): assumes the documents are UTF-8 encoded — confirm.
    with open(file_name, 'r', encoding='utf-8') as sol_file:
        lines = sol_file.readlines()

    # One document per non-empty line: the text, its source file and its vector.
    for line in lines:
        line = line.strip()
        if not line:
            continue
        print("Inserting: {}... {}".format(line[:20], file_name))
        update_data(None, {
            "text": line,
            "origion": file_name,
            "content_vec": predictor.predict({"text": line,"type": 2}),
        })

print("Done")

### 数据插入：商品

In [None]:
import os

sol_dir = './docs_ec/'

# Sorted so that repeated runs ingest the files in the same order.
file_list = sorted(os.listdir(sol_dir))
for file_path in file_list:
    file_name = sol_dir + file_path
    # os.listdir() also returns sub-directories and hidden entries (for
    # example Jupyter's .ipynb_checkpoints); only regular, non-hidden files
    # are product descriptions.
    if file_path.startswith('.') or not os.path.isfile(file_name):
        continue

    # Unlike the line-based cells, each product file is stored as ONE
    # document, so the whole file content is read and embedded at once.
    # NOTE(review): assumes the documents are UTF-8 encoded — confirm.
    with open(file_name, 'r', encoding='utf-8') as sol_file:
        line = sol_file.read()
    line = line.strip()
    if not line:
        continue
    print("Inserting: {}... {}".format(line[:20], file_name))
    update_data(None, {
        "text": line,
        "origion": file_name,
        "content_vec": predictor.predict({"text": line,"type": 2}),
    })

print("Done")

## 查询

In [16]:
def search(sentense, size=3):
    """Return the stored documents closest to ``sentense`` in vector space.

    The query text is turned into a vector by the model endpoint and used in
    a k-NN query against the ``content_vec`` field of ``index_name``.

    Args:
        sentense: Query text.
        size: Maximum number of hits to return (default 3, the previously
            hard-coded value).

    Returns:
        A list of dicts as produced by ``map_res`` (id, score, text, origion).
    """
    sentence_vec = predictor.predict({"text": sentense, "type": 2})
    query_body = {
        "size": size,
        "query": {
            "knn": {
                "content_vec": {
                    "vector": sentence_vec,
                    # k is the number of neighbours fetched per shard/segment.
                    # It was fixed at 1 while `size` was 3, so each segment
                    # could contribute only its single best match and the
                    # returned hits were not guaranteed to be the true top
                    # `size`. Keep k in step with size instead.
                    "k": size
                }
            }
        }
    }

    response = client.search(index=index_name, body=query_body)
    return [map_res(hit) for hit in response["hits"]["hits"]]

def map_res(obj):
    """Flatten one raw search hit into the fields the notebook uses.

    Args:
        obj: A hit dict with ``_id``, ``_score`` and a ``_source`` dict that
            holds ``text`` and ``origion``.

    Returns:
        A dict with the keys ``id``, ``score``, ``text`` and ``origion``.
    """
    flat = {"id": obj["_id"], "score": obj["_score"]}
    source = obj["_source"]
    flat["text"] = source["text"]
    flat["origion"] = source["origion"]
    return flat

# search("What are the universities of foreign languages")
# search("国防科技大学的王牌专业是哪个？")


In [18]:
# Sample product queries in several languages; swap which line is
# uncommented to try another one.
# search("折叠电动自行车")  # folding electric bicycle
search("插座电涌保护器")  # socket surge protector
# search("carbon pen box")
# search("Coussin d’assise étanche") # waterproof seat cushion
# search("จักรยานพับไฟฟ้า") # folding electric bicycle

[{'id': '1dW0KokBRau1ZRpy6sXT',
  'score': 0.21213564,
  'text': 'GE 6-Outlet Surge Protector, 15 Ft Extension Cord, Power Strip, 800 Joules, Flat Plug, Twist-to-Close Safety Covers, Protected Indicator Light, UL Listed, Black, 50767\nAbout this item\nExpand Your Power – 6 grounded and protected 3-prong outlets provide power from a distance — perfect for creating a centralized hub for your electronics at your home, office, workshop or garage.\nVersatile Cord – Extra-long 15ft. power cord comprised of 14-gauge SJT cable housed in a durable PVC insulation and jacket with a space-saving flat plug to allow unit to fit closely to the wall. This heavy-duty cord is strong enough to last a lifetime and pliable enough to easily fully extend.\nProtection – 800 Joules protection rating safeguards your electronics from harmful voltage spikes. Integrated circuit breaker and automatic shutdown technology cut power to your surge protector should it experience overvoltage or overheating.\nFunctionalit

## 对话化

In [19]:
def ask(q):
    """Answer ``q`` with the model, using the search hits as context.

    The texts of the hits returned by ``search(q)`` are concatenated into a
    context block, embedded in a fixed prompt together with the question,
    and sent to the model endpoint.

    Args:
        q: The user's question.

    Returns:
        A tuple ``(answer, references)``: ``answer`` is whatever the endpoint
        returns for the prompt, and ``references`` maps each hit's
        ``origion`` value to the number of hits that came from it.
    """
    hits = search(q)

    # Every hit text is followed by a blank line, exactly as in the prompt.
    context = "".join(hit['text'] + "\n\n" for hit in hits)

    # Count how many hits each source contributed.
    references = {}
    for hit in hits:
        source = hit['origion']
        references[source] = references.get(source, 0) + 1

    # The prompt is spelled out piece by piece so that its whitespace
    # (including the whitespace-only line and the trailing two spaces) is
    # visible and stays exactly as the model has been receiving it.
    ori_prompt = (
        "\n"
        "Use the following pieces of context to answer the question at the end. "
        "If you don't know the answer, just say that you don't know, "
        "don't try to make up an answer.\n"
        "\n"
        f"{context}\n"
        "\n"
        f"Question: {q}\n"
        "            \n"
        "Helpful Answer:\n"
        "\n"
        "  "
    )

    print("问题: ", q)
    reply = predictor.predict({"text": ori_prompt,"type": 8})

    return reply, references

In [20]:
# Query (English): "G-force German new-national-standard folding electric
# bicycle: features and price".
answer01 = ask("G-force德国新国标折叠电动自行车的特性,价格")

# ask() returns (answer, references): print the reply, then the per-source hit counts.
print("回答：", answer01[0])
print("引用：", answer01[1])

问题:  G-force德国新国标折叠电动自行车的特性,价格
回答： G-force德国新国标折叠电动自行车的特性包括：可折叠、轮圈尺寸为14英寸、电压为48V、电池容量为10AH及以下、可拆卸电池、额定功率为251-400W、净重为21-30kg、理论时速为25km/h、车架材质为高碳钢、制动方式为前后碟刹、标准认证为CCC认证、变速档位为3速、理论续航为26-40km。该车的价格为人民币999元。
引用： {'./docs_ec/g6.txt': 1, './docs_ec/g5.txt': 1, './docs_ec/g4.txt': 1}


In [21]:
# English query against the same index.
answer01 = ask("Features and price of carbon fiber pen holder")  # carbon fiber pen holder

# ask() returns (answer, references): print the reply, then the per-source hit counts.
print("回答：", answer01[0])
print("引用：", answer01[1])

问题:  Features and price of carbon fiber pen holder
回答： The carbon fiber pen holder is made of hand-crafted linear carbon fiber and non-铅 yellow copper. It has a length of 180mm, an outer diameter of 16mm, and an inner diameter of 13mm. The pen holder weighs 30g and is produced in Italy. It comes with the original factory packaging, a manual, and the pen筒。

价格：￥728
引用： {'./docs_ec/g1.txt': 1, './docs_ec/g5.txt': 1, './docs_ec/g6.txt': 1}


In [None]:
# Query (English): "What is the basic farmland protection plan?"
answer01 = ask("基本农田保护方案是什么？")

# ask() returns (answer, references): print the reply, then the per-source hit counts.
print("回答：", answer01[0])
print("引用：", answer01[1])


In [None]:
# Query (English): "Do you have a city parking operations solution, and how
# exactly does it work?"
answer01 = ask("你有城市停车运营解决方案吗，具体是怎么运作的？")

# ask() returns (answer, references): print the reply, then the per-source hit counts.
print("回答：", answer01[0])
print("引用：", answer01[1])

In [None]:
# English query against the same index.
answer01 = ask("How to manage the hospital, do you have a specific plan?")

# ask() returns (answer, references): print the reply, then the per-source hit counts.
print("回答：", answer01[0])
print("引用：", answer01[1])


