In [None]:
import os
from uuid import uuid4
import boto3
from pathlib import Path

from langchain_community.chat_models import ChatLiteLLM
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_elasticsearch import ApproxRetrievalStrategy, ElasticsearchStore
from elasticsearch import Elasticsearch

from redbox.models import Settings
from redbox.storage import ElasticsearchStorageHandler
from core_api.ingest_runnables import make_worker_ingest_runnable, LocalFile

In [None]:
# Load settings from the local .env file, then override the Elasticsearch and
# MinIO hosts so every client below talks to localhost.
# NOTE(review): presumably because the notebook runs outside the compose
# network — confirm.
env = Settings(_env_file=".env")
env.elastic.host = "localhost"
env.minio_host = "localhost"

# Sentence-transformer embeddings; model files are cached under ../models/.
embedding_model = SentenceTransformerEmbeddings(model_name=env.embedding_model, cache_folder="../models/")

# Elasticsearch client. The host is read back from the settings object (set to
# "localhost" above) so the override lives in exactly one place.
es = Elasticsearch(
    hosts=[
        {
            "host": env.elastic.host,
            "port": env.elastic.port,
            "scheme": env.elastic.scheme,
        }
    ],
    basic_auth=(env.elastic.user, env.elastic.password),
)

# Choose the retrieval strategy from the configured subscription level:
# "basic" disables hybrid retrieval, "platinum"/"enterprise" enable it.
if env.elastic.subscription_level == "basic":
    strategy = ApproxRetrievalStrategy(hybrid=False)
elif env.elastic.subscription_level in ["platinum", "enterprise"]:
    strategy = ApproxRetrievalStrategy(hybrid=True)
else:
    # Without this branch `strategy` would be unbound and the ElasticsearchStore
    # call below would fail with an opaque NameError.
    raise ValueError(
        f"Unsupported Elasticsearch subscription level: {env.elastic.subscription_level!r}"
    )

# Vector store over the chunk index; vectors are queried on the "embedding" field.
vector_store = ElasticsearchStore(
    es_connection=es,
    index_name="summarisation-chunk",
    embedding=embedding_model,
    strategy=strategy,
    vector_query_field="embedding",
)

# Raises KeyError if AZURE_API_VERSION is not exported. Note that the
# api_version passed to ChatLiteLLM below is hard-coded, not read from this
# variable.
print(os.environ["AZURE_API_VERSION"])
llm = ChatLiteLLM(
    model="azure/gpt-35-turbo",
    api_base="https://oai-i-dot-ai-playground-sweden.openai.azure.com/",
    api_version="2024-02-01",
    max_tokens=1024,
)

# Storage handler rooted at the "summarisation" index prefix.
storage_handler = ElasticsearchStorageHandler(es_client=es, root_index="summarisation")

# S3 client pointed at the MinIO endpoint, built from the loaded settings.
s3_client = boto3.client(
    "s3",
    endpoint_url=f"http://{env.minio_host}:{env.minio_port}",
    aws_access_key_id=env.aws_access_key,
    aws_secret_access_key=env.aws_secret_key,
)

In [None]:
# Build the S3 client for the ingest worker first, against the MinIO endpoint
# described by the loaded settings.
minio_endpoint = f"http://{env.minio_host}:{env.minio_port}"
ingest_s3_client = boto3.client(
    "s3",
    endpoint_url=minio_endpoint,
    aws_access_key_id=env.aws_access_key,
    aws_secret_access_key=env.aws_secret_key,
)

# Assemble the ingest runnable from the storage handler, the S3 client and the
# settings, with a chunk size of 1024.
ingest = make_worker_ingest_runnable(
    storage_handler=storage_handler,
    s3_client=ingest_s3_client,
    env=env,
    chunk_size=1024,
)

In [None]:
# Drop both summarisation indices so each run starts from a clean slate.
# 400/404 responses are ignored so a missing index does not raise.
# Transport options go through `es.options(...)`: passing `ignore=` directly to
# the API method is the deprecated form in the 8.x Elasticsearch client.
for index_name in ("summarisation-file", "summarisation-chunk"):
    es.options(ignore_status=[400, 404]).indices.delete(index=index_name)

# Ingest a single local PDF on behalf of a freshly generated user UUID, then
# print that UUID.
file = LocalFile(filepath=Path("../data/Conservative-Manifesto-GE2024.pdf"), creator_user_uuid=uuid4())
ingest.invoke(file)
print(file.creator_user_uuid)