In [None]:
import pandas as pd 
import os 
import json 
import openai
import llama_index
from openai import AzureOpenAI
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.bridge.pydantic import PrivateAttr
from typing import Any, List
from llama_index.llms.openai_like import OpenAILike
from llama_index.core import Settings
from llama_index.core import Document, VectorStoreIndex




In [None]:
# Route the OpenAI tooling to the endpoint at 10.156.254.10:8000.
# NOTE(review): the API key is hardcoded in this notebook; consider loading it
# from the environment or a secrets store before sharing.
os.environ.update(
    {
        "no_proxy": "10.156.254.10",  # presumably keeps calls to this host off the proxy
        "OPENAI_API_KEY": "dtnumds",
        "OPENAI_API_BASE": "http://10.156.254.10:8000/v1",
    }
)
# Mirror the same key and base URL onto the `openai` module attributes.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

In [None]:
# Client used for model listing and embedding requests against the endpoint.
client = AzureOpenAI(
    api_key="dtnumds",
    azure_endpoint="http://10.156.254.10:8000/v1",
    api_version="2023-07-01-preview",
)

In [None]:
# Print the id of every model returned by the endpoint's model listing,
# one per line.
for model in client.models.list().data :
    print(model.id)

In [None]:

class DGFIPEmbeddings(BaseEmbedding):
    """Embedding model that delegates to an OpenAI-style client.

    Every embedding is obtained through ``openai_client.embeddings.create``,
    called with the model name supplied at construction time.
    """

    _model_name: str = PrivateAttr()
    _openai_client: Any = PrivateAttr()

    def __init__(
        self,
        openai_client,
        model_name: str = "dgfip-e5-large",
        **kwargs: Any,
    ) -> None:
        """Create the embedding model.

        Args:
            openai_client: Client object exposing
                ``embeddings.create(input=..., model=...)``.
            model_name: Name sent as ``model`` with every embedding request.
            **kwargs: Forwarded unchanged to ``BaseEmbedding.__init__``.
        """
        # Initialise the pydantic base model *before* touching private
        # attributes: under pydantic v2 the private-attribute storage only
        # exists once ``__init__`` has run, so assigning earlier raises
        # AttributeError. ``model_name`` is also handed to the base class so
        # its public ``model_name`` field reflects the model actually used.
        super().__init__(model_name=model_name, **kwargs)
        self._model_name = model_name
        self._openai_client = openai_client

    @classmethod
    def class_name(cls) -> str:
        """Return the identifier string for this embedding class."""
        return "DGFIPEmbedding"

    def _request_embeddings(self, payload) -> List[List[float]]:
        """Call the embeddings endpoint and return one vector per response item.

        Args:
            payload: A single string or a list of strings, passed as ``input``.
        """
        response = self._openai_client.embeddings.create(
            input=payload,
            model=self._model_name,  # model / deployment name set at construction
        )
        return [item.embedding for item in response.data]

    def _get_query_embedding(self, query: str) -> List[float]:
        """Return the embedding vector for a single query string."""
        return self._request_embeddings(query)[0]

    def _get_text_embedding(self, text: str) -> List[float]:
        """Return the embedding vector for a single text."""
        return self._request_embeddings(text)[0]

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Return embedding vectors for a batch of texts in one request."""
        # Nothing to embed: skip the remote call instead of sending an empty
        # input list to the endpoint.
        if not texts:
            return []
        return self._request_embeddings(texts)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async variant; runs the synchronous query-embedding call."""
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async variant; runs the synchronous text-embedding call."""
        return self._get_text_embedding(text)


In [None]:
# Register the embedding model and the LLM on the global LlamaIndex settings.
Settings.embed_model = DGFIPEmbeddings(
    openai_client=client,
)
Settings.llm = OpenAILike(
    model="mixtral-instruct",
    max_tokens=2048,
    timeout=600,
)

In [None]:
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)

In [None]:
# Semantic splitter driven by a dedicated DGFIPEmbeddings instance.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=50,
    embed_model=DGFIPEmbeddings(openai_client=client),
)

In [None]:
# Read the CSV and add a whitespace-delimited word count for each row's text.
df = pd.read_csv("../../data/test_html.csv")
df["n_words"] = df["text"].apply(lambda content: len(content.split()))

In [None]:
# Largest word count found in the loaded texts.
df["n_words"].max()

In [None]:
# Preview only the first few texts: displaying the whole column would flood
# the cell output (the full list is built in a later cell anyway).
df["text"].head().to_list()

In [None]:
# Pull the columns needed to build documents out of the frame as plain lists.
contents = df["text"].tolist()
filenames = df["filename"].tolist()
n_words = df["n_words"].tolist()

In [None]:
# One Document per row: the text becomes the body, and the filename and word
# count are attached as metadata.
documents = [
    Document(
        text=body,
        metadata={"filename": source_name, "n_words": word_count},
    )
    for body, source_name, word_count in zip(contents, filenames, n_words)
]

In [None]:
# Run the semantic splitter over all documents and keep the resulting nodes.
nodes = splitter.get_nodes_from_documents(documents)

In [None]:
# Display the second node (index 1) as a spot check of the splitter output.
nodes[1]