# RAG using local models



https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa

In [None]:
# Install the notebook's dependencies. eland (with its pytorch extra) is pinned
# to 8.10.1; `--user` makes pip install into the per-user site-packages.
!pip install bs4 tiktoken langchain elasticsearch eland[pytorch]==8.10.1 --user


In [None]:
from getpass import getpass
from elasticsearch import Elasticsearch

# Collect the connection details interactively. The password is read with
# getpass so it is not echoed into the notebook output.
ES_URL = input('Elasticsearch URL(ex:https://127.0.0.1:9200): ')
ES_USER = "elastic"
ES_USER_PASSWORD = getpass('elastic user PW: ')
CERT_PATH = input('Elasticsearch pem path: ')

# Keyword options for the client: basic auth credentials, the CA certificate
# file entered above, and a 60 second request timeout.
es_connection_options = {
    "basic_auth": (ES_USER, ES_USER_PASSWORD),
    "ca_certs": CERT_PATH,
    "request_timeout": 60,
}

# Shared Elasticsearch client used by every later cell.
client = Elasticsearch(ES_URL, **es_connection_options)


In [None]:
import os
# Remember the notebook's working directory; later cells build model paths
# relative to it.
cwd = os.getcwd()

# Ensure "<cwd>/models" exists. exist_ok=True tolerates a directory left over
# from a previous run, while real failures (e.g. permission denied) are raised
# here instead of being silently swallowed and resurfacing in a later cell.
os.makedirs(cwd + "/models", exist_ok=True)

In [None]:
# Download the multilingual-e5-base embedding model from Hugging Face into
# "<cwd>/models" unless a checkout is already present.
#
# os.system() never raises when the command fails, so a try/except around it
# cannot detect an existing checkout. The target directory is therefore checked
# explicitly, and git runs with cwd= so the notebook's working directory is
# never changed.
import subprocess

models_dir = os.path.join(cwd, "models")
local_model_repo = os.path.join(models_dir, "multilingual-e5-base")

if os.path.isdir(local_model_repo):
    # NOTE: a partially cloned directory also counts as "already present";
    # delete it manually to force a fresh clone.
    print('이미 모델이 존재합니다.')
else:
    try:
        clone_result = subprocess.run(
            ["git", "clone", "https://huggingface.co/intfloat/multilingual-e5-base"],
            cwd=models_dir,
            check=False,
        )
    except OSError as err:
        # Raised when git is not installed or models_dir does not exist.
        print(f"git clone could not be started: {err}")
    else:
        if clone_result.returncode != 0:
            print(f"git clone failed with exit code {clone_result.returncode}")

# Model id under which the embedding model is stored in Elasticsearch.
es_model_id = "intfloat_multilingual_efive_base"

In [None]:
import os
import shutil
from pathlib import Path
from eland.ml import MLModel
from eland.ml.pytorch import PyTorchModel
from eland.common import es_version
from eland.ml.pytorch.transformers import TransformerModel

# Upload the locally cloned Hugging Face model to Elasticsearch and start it,
# but only when no model with this id exists in the cluster yet.
es_model = MLModel(client, es_model_id)

if not es_model.exists_model():
    # Resolve the local checkout relative to the current working directory.
    cwd = os.getcwd()
    local_model_path = cwd + '/models/multilingual-e5-base'

    print(local_model_path)

    # Wrap the checkout as a text_embedding model, targeting the version
    # reported by the connected cluster.
    es_cluster_version = es_version(client)
    tm = TransformerModel(
        model_id=local_model_path,
        task_type="text_embedding",
        es_version=es_cluster_version
    )

    # The converted model files are staged in a scratch directory.
    tmp_path = "tmp_models/" + es_model_id
    Path(tmp_path).mkdir(parents=True, exist_ok=True)
    try:
        model_path, config, vocab_path = tm.save(tmp_path)

        print(tmp_path)

        # Import the staged files under es_model_id and start the model.
        ptm = PyTorchModel(client, es_model_id)
        ptm.import_model(
            model_path=model_path,
            config_path=None,
            vocab_path=vocab_path,
            config=config
        )
        ptm.start()
    finally:
        # Remove the scratch directory even when saving, importing or
        # starting fails, so a failed run does not leave model files behind.
        shutil.rmtree(Path("tmp_models"), ignore_errors=True)


In [None]:
from langchain.document_loaders import WebBaseLoader

from langchain.text_splitter import CharacterTextSplitter

# Pages to ingest: the May 2023 launch announcement from Search Labs and the
# copy under the /kr/ path (presumably the Korean-language edition).
announcement_urls = [
    "https://www.elastic.co/search-labs/blog/articles/may-2023-launch-announcement",
    "https://www.elastic.co/kr/blog/may-2023-launch-announcement"
]

loader = WebBaseLoader(announcement_urls)
# NOTE(review): verify=False is handed to the HTTP requests and turns off TLS
# certificate verification; acceptable for a demo, not for production use.
loader.requests_kwargs = {'verify':False}

data = loader.load()

# Split on sentence boundaries (". ") into chunks of up to 512 characters,
# measured with len(), with no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator=". ",
    chunk_size=512,
    chunk_overlap=0,
    length_function=len,
)
docs = text_splitter.split_documents(data)

In [None]:
from langchain.vectorstores import ElasticsearchStore
from langchain.embeddings.elasticsearch import ElasticsearchEmbeddings

# Embedding wrapper that uses the model identified by es_model_id through the
# existing Elasticsearch connection.
embeddings = ElasticsearchEmbeddings.from_es_connection(
    model_id=es_model_id,
    es_connection=client,
)

# Field layout of the store: raw text lives in "text_field", the vector in
# "vector_query_field.predicted_value".
vectorstore_options = {
    "es_connection": client,
    "embedding": embeddings,
    "query_field": "text_field",
    "vector_query_field": "vector_query_field.predicted_value",
    "index_name": "workplace_index",
}

vectorstore = ElasticsearchStore(**vectorstore_options)


In [None]:
# Id of the ingest pipeline registered below.
PIPELINE_ID = "vectorize_workplace"

# Single inference processor: runs the model identified by es_model_id and
# writes its output under "vector_query_field".
inference_processor = {
    "inference": {
        "model_id": es_model_id,
        "field_map": {
            "query_field": "text_field"
        },
        "target_field": "vector_query_field",
    }
}

vectorstore.client.ingest.put_pipeline(
    id=PIPELINE_ID,
    processors=[inference_processor],
)

In [None]:
# (Re)create the index that stores the document chunks and their vectors.
INDEX_NAME = "workplace_index"

# Index mapping: the raw chunk text plus the object written by the ingest
# pipeline's inference processor (truncation flag and the embedding itself).
INDEX_MAPPING = {
    "properties": {
        "text_field": {
            "type": "text"
        },
        "vector_query_field": {
            "properties": {
                "is_truncated": {
                    "type": "boolean"
                },
                "predicted_value": {
                    "type": "dense_vector",
                    # Presumably the output size of the deployed embedding
                    # model; must be changed together with the model.
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine"
                }
            }
        }
    }
}

# Every document indexed without an explicit pipeline goes through PIPELINE_ID.
INDEX_SETTINGS = {"index": {"default_pipeline": PIPELINE_ID}}

if vectorstore.client.indices.exists(index=INDEX_NAME):
    print(f"Deleting existing {INDEX_NAME}")
    # Tolerate only "not found" (the index may vanish between the check and the
    # delete). options(ignore_status=...) replaces the deprecated per-call
    # `ignore=` keyword of elasticsearch-py 8.x.
    vectorstore.client.options(ignore_status=404).indices.delete(index=INDEX_NAME)

print(f"Creating index {INDEX_NAME}")
# Errors are deliberately NOT ignored here: a rejected mapping or settings body
# must fail loudly instead of leaving the notebook without the expected index.
vectorstore.client.indices.create(
    index=INDEX_NAME,
    mappings=INDEX_MAPPING,
    settings=INDEX_SETTINGS,
)

In [None]:
# Approximate retrieval with hybrid=True; query text is embedded by the model
# identified by es_model_id.
retrieval_strategy = ElasticsearchStore.ApproxRetrievalStrategy(
    hybrid=True,
    query_model_id=es_model_id,
)

# Bulk indexing options: 10 documents per request, at most 200000000 bytes
# per request.
bulk_options = {
    "chunk_size": 10,
    "max_chunk_bytes": 200000000,
}

# Index the split documents into "workplace_index" and keep the resulting
# store for the searches below.
db = ElasticsearchStore.from_documents(
    docs,
    es_connection=client,
    index_name="workplace_index",
    query_field="text_field",
    vector_query_field="vector_query_field.predicted_value",
    distance_strategy="COSINE",
    strategy=retrieval_strategy,
    bulk_kwargs=bulk_options,
)

In [None]:
def showResults(output):
    """Print the number of results followed by each result on its own line.

    Args:
        output: A sized iterable of results, e.g. the list returned by a
            similarity search.
    """
    print("Total results: ", len(output))
    # Iterate the results directly instead of indexing via range(len(...)).
    for hit in output:
        print(hit)

In [None]:
# Japanese query ("semantic search RRF"); fetch the two best matches and
# print them.
japanese_hits = db.similarity_search("セマンティック検索 RRF", k=2)
showResults(japanese_hits)

In [None]:
# Korean query asking about the role of the LLM; search with the store's
# default number of results and print them.
query = "LLM의 역할을 알려줘?"
results = db.similarity_search(query)
showResults(results)


In [None]:
# Install llama-cpp-python, needed by the LlamaCpp LLM wrapper imported in the
# next cell.
!pip install llama-cpp-python

In [None]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [None]:
# Callback manager with a handler that streams output to stdout.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
llama_model_file = cwd + "/models/Llama-2-ko-7B-chat-gguf-q8_0.bin"

# n_gpu_layers / n_batch are intentionally left at their defaults; pass them
# here if they should be tuned.
llm = LlamaCpp(
    model_path=llama_model_file,
    n_ctx=2048,
    max_tokens=512,
    # Sampling parameters, see
    # https://www.reddit.com/r/LocalLLaMA/comments/1343bgz/what_model_parameters_is_everyone_using/
    temperature=0.7,
    top_k=2,
    top_p=0.1,
    # Kept at True as the original author requires: reported to cause
    # problems after a couple of calls otherwise.
    f16_kv=True,
    verbose=True,
    callback_manager=callback_manager,
)

In [None]:
# Smoke test of the bare LLM without retrieval: a Korean prompt asking it to
# explain Large Language Models.
smoke_test_prompt = "Large Language Model에 대해 설명해줘"
llm(smoke_test_prompt)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt: a single {docs} placeholder that receives the retrieved documents.
summary_template = "Summarize the main themes in these retrieved docs: {docs}"
prompt = PromptTemplate.from_template(summary_template)

# Chain: feeds the formatted prompt to the local LLM.
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Run: retrieve documents for a Korean question about applying LLMs to
# Elasticsearch, then summarise them.
# NOTE(review): this rebinds `docs`, which earlier held the split documents;
# re-running the indexing cell after this one would index the search hits.
question = "Elasticsearch에 LLM을 적용하는 방법을 알려줘"
docs = vectorstore.similarity_search(question)
result = llm_chain(docs)

# Output
result

In [None]:
from langchain.chains import RetrievalQA

# Llama-2 chat style prompt: the system section sets the persona, {context}
# receives the retrieved chunks and {question} the user question.
# The persona previously read "cryptocurrency expert", a leftover unrelated to
# this notebook's corpus (Elastic blog posts) and question; it now matches the
# Elasticsearch subject matter.
template = """
<s>[INST] <<SYS>>
Act as an Elasticsearch expert. Use the following information to answer the question at the end.
<</SYS>>
 
{context}
 
{question} [/INST]
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# "stuff" chain: the two best-matching chunks are inserted into the prompt.
# Source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

# Reuses `question` defined in the summarisation cell.
result = qa_chain(
    question
)
print(result["result"].strip())