In [None]:
import ray
from typing import Dict, List
import numpy as np

# Load the ETL output directory (JSON files) as a Ray Dataset.
ds = ray.data.read_json(
    '../etl_out/partition_by_date',
)
# For quick experiments, uncomment to keep only the first 10 rows:
# ds = ds.limit(10)

In [None]:
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import AutoModel
from llama_index.core.node_parser import  SentenceSplitter #SimpleNodeParser
from llama_index.core.schema import Node
from llama_index.core.schema import Document
import pandas as pd
import pyarrow as pa
import warnings
import pandas as pd
import copy
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo

# Suppress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

def convert_documents_into_nodes(data: Dict[str, np.ndarray]) -> List[Dict[str, Node]]:
    """Split one record into nodes using ``SentenceSplitter``.

    The record's ``title`` and ``content`` are concatenated (no separator) to
    form the document text; every other field is attached as document metadata.

    Args:
        data: A mapping that contains at least ``title`` and ``content``.
            Presumably a single row handed over by a row-wise Ray transform
            (e.g. ``flat_map``) -- verify against the caller.

    Returns:
        A list of ``{"node": node}`` dicts, one per node produced by the splitter.

    Raises:
        KeyError: If ``title`` or ``content`` is missing from ``data``.
    """
    all_text = data['title'] + data['content']

    # Build the metadata from a filtered copy instead of deleting keys from
    # `data`, so the caller's record is not mutated as a side effect.
    metadata = {
        key: value
        for key, value in data.items()
        if key not in ('title', 'content')
    }

    doc = Document(text=all_text)
    doc.metadata = metadata

    splitter = SentenceSplitter()
    nodes = splitter.get_nodes_from_documents(documents=[doc])
    return [{"node": node} for node in nodes]

class EmbedNodes:
    """Callable batch transform that builds ``TextNode`` objects with embeddings.

    Intended to be passed as a class to ``Dataset.map_batches`` so that the
    embedding model is loaded once in ``__init__`` and reused for every batch
    processed by that worker.
    """

    def __init__(self, model_name: str = '/data/models/bge-large-zh'):
        """Load the embedding model.

        Args:
            model_name: Model name or local path forwarded to
                ``HuggingFaceEmbeddings``. Defaults to the local
                bge-large-zh checkpoint this notebook was written against.
        """
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    def __call__(self, summary_batch: Dict[str, np.ndarray]) -> Dict[str, List[Node]]:
        """Embed one column-oriented batch.

        Args:
            summary_batch: Columnar batch (column name -> array of values)
                that must contain ``title`` and ``content`` columns.

        Returns:
            ``{"embedded_nodes": nodes}`` where each node's text is
            ``title + content``, its id is the title, and its ``embedding``
            is filled in by the model.
        """
        nodes = []
        summaries = []
        # Concatenate per row rather than with a whole-array `+`: array
        # addition is not defined for every string dtype (e.g. fixed-width
        # unicode arrays on older NumPy) nor for plain Python lists.
        for title, content in zip(summary_batch['title'], summary_batch['content']):
            text = title + content
            nodes.append(TextNode(text=text, id_=title))
            summaries.append(text)

        if not nodes:
            # Empty batch: nothing to embed, so skip the model call entirely.
            return {"embedded_nodes": nodes}

        embeddings = self.embedding_model.embed_documents(summaries)
        assert len(nodes) == len(embeddings), (
            f"expected {len(nodes)} embeddings, got {len(embeddings)}"
        )

        for node, embedding in zip(nodes, embeddings):
            node.embedding = embedding
        return {"embedded_nodes": nodes}

In [None]:
# Run the embedding transform over the dataset in batches of 2000 rows,
# requesting 4 concurrent EmbedNodes workers with 1 GPU each.
embeds = ds.map_batches(
    EmbedNodes,
    batch_size=2000,
    num_gpus=1,
    concurrency=4,
)


In [None]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Global LlamaIndex defaults: the Ollama "qwen:7b" model as the LLM and the
# local bge-large-zh checkpoint as the embedding model.
Settings.llm = Ollama(
    model="qwen:7b",
    request_timeout=120.0,
)
Settings.embed_model = HuggingFaceEmbedding(model_name="/data/models/bge-large-zh")

In [None]:
# Collect the embedded nodes, keeping only the first node seen for each id
# (the id is the title assigned in EmbedNodes).
stock_docs_nodes = []
title_dict = {}
for row in embeds.iter_rows():
    node:Node = row["embedded_nodes"]
    if node.id_ in title_dict:
        # Duplicate id: already collected, skip it.
        continue
    title_dict[node.id_] = True
    # Every node that is kept must carry an embedding.
    assert node.embedding is not None
    stock_docs_nodes.append(node)

In [None]:
# Sanity check: number of nodes kept after de-duplicating by node id.
len(stock_docs_nodes)

In [None]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from chromadb.errors import DuplicateIDError
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Global LlamaIndex defaults, set again here so this cell does not depend on
# the earlier configuration cell having been run.
Settings.llm = Ollama(
    model="qwen:7b",
    request_timeout=120.0,
)
Settings.embed_model = HuggingFaceEmbedding(model_name="/data/models/bge-large-zh")

# Open (or create) the on-disk Chroma database under ./stock_db.
db = chromadb.PersistentClient(path="./stock_db")

# Fetch the collection, creating it on first use.
chroma_collection = db.get_or_create_collection("stock_summary")

# Expose the Chroma collection to LlamaIndex as a vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if chroma_collection.count() == 0:
    # Fresh database: write the pre-embedded nodes into Chroma so the notebook
    # works from a clean checkout instead of silently querying an empty index.
    # Requires `stock_docs_nodes` from the de-duplication cell.
    index = VectorStoreIndex(
        nodes=stock_docs_nodes, storage_context=storage_context
    )
else:
    # Collection already populated by a previous run: just load it.
    index = VectorStoreIndex.from_vector_store(
        vector_store
    )

In [None]:
# Query the Chroma-backed index, retrieving the 10 most similar nodes per question.
query_engine = index.as_query_engine(similarity_top_k=10)
# Question (zh): BYD car sales situation.
response = query_engine.query(
    "比亚迪汽车的销售情况"
)

In [None]:
# Question (zh): compare WuXi Biologics and Samsung Biologics in 2024.
response = query_engine.query("2024年药明生物和三星生物做对比")

In [None]:
from IPython.display import Markdown, display

In [None]:
# Render the latest response in bold via Markdown.
display(Markdown(f"<b>{response}</b>"))

In [None]:
# Show the raw response object (the notebook displays the cell's last expression).
response

In [None]:
# `docs` is not defined anywhere in this notebook and raised NameError on a
# fresh kernel. Show the nodes retrieved for the last query instead.
# NOTE(review): assumed intent -- confirm this is what `docs` was meant to show.
response.source_nodes

In [None]:
from llama_index.core import VectorStoreIndex
# Build a vector index directly from the pre-embedded nodes (default storage,
# not Chroma) and persist it under ./tmp/index.
ray_docs_index = VectorStoreIndex(
    nodes=stock_docs_nodes,
)
ray_docs_index.storage_context.persist(
    persist_dir="./tmp/index",
)

In [None]:
# Rebind `query_engine` to the index built from `stock_docs_nodes`, using the
# default retrieval settings.
query_engine = ray_docs_index.as_query_engine()
# Question (zh): details of the leading companies in the automotive industry.
response = query_engine.query("汽车行业的领先地位的公司的具体信息")
# The answer is rendered in the next cell.

In [None]:
from IPython.display import Markdown, display
# Render the latest response in bold via Markdown.
display(Markdown(f"<b>{response}</b>"))

In [None]:
# Show the raw response object (the notebook displays the cell's last expression).
response