# Import

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Langchain
from langchain.chains.question_answering import load_qa_chain

from langchain_google_vertexai.model_garden import ChatAnthropicVertex

from langchain.prompts import PromptTemplate

In [None]:
import key_param

# Copy the OpenAI key from the local key_param module into the process
# environment (presumably where the OpenAI client created later looks it up;
# confirm against the openai client's configuration).
os.environ["OPENAI_API_KEY"] = key_param.OPENAI_API_KEY

In [None]:
from openai import OpenAI
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` produced by ``model``.

    ``text`` is converted with ``str()`` and every newline is replaced by a
    single space before the request is sent through the module-level
    ``client``.

    Args:
        text: Value to embed; anything accepted by ``str()``.
        model: Name of the embedding model passed to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    single_line = " ".join(str(text).split("\n"))

    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Merge Vector Store

In [None]:
import pandas as pd

def concatenate_csv_columns(file_list):
    """
    Merge the PDF and CSV vector-store files into a single DataFrame.

    Only the ``Content`` and ``ada_embedding`` columns are read from each
    file. The embeddings, stored as text in the CSV, are parsed and converted
    to NumPy arrays.

    Args:
        file_list: Iterable of file names. Each name is prefixed with ``"./"``,
            so it is resolved relative to the current working directory.

    Returns:
        A DataFrame with a fresh 0..N-1 index containing the rows of every
        file, in ``file_list`` order. If ``file_list`` is empty, an empty
        DataFrame with the two expected columns is returned.
    """
    import ast  # local import so this function does not depend on file-level edits

    wanted_columns = ['Content', 'ada_embedding']

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated rows on every iteration.
    frames = [pd.read_csv("./" + file, usecols=wanted_columns) for file in file_list]
    if not frames:
        return pd.DataFrame(columns=wanted_columns)

    combined_df = pd.concat(frames, ignore_index=True)

    # NOTE: the original used eval() here. literal_eval only accepts Python
    # literals (e.g. "[0.1, 0.2, ...]"), so a crafted CSV cell cannot execute
    # arbitrary code.
    combined_df['ada_embedding'] = (
        combined_df['ada_embedding'].apply(ast.literal_eval).apply(np.array)
    )

    return combined_df

In [None]:
# CSV files (looked up relative to the working directory) whose Content and
# ada_embedding columns are merged into one in-memory vector store.
file_list = ['CSV_VectorStore.csv', 'PDF_VectorStore.csv']
vector_stores = concatenate_csv_columns(file_list)

# Bare expression: shows the merged DataFrame when run as a notebook cell.
vector_stores

# Search

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def search_in_docs(vector_stores, query, n=5, pprint=True):
    """
    Embed ``query`` and return the ``n`` most similar rows of ``vector_stores``.

    The query is embedded with ``get_embedding`` (model
    ``text-embedding-3-small``) and compared to every row's ``ada_embedding``
    using cosine similarity.

    Args:
        vector_stores: DataFrame with an ``ada_embedding`` column holding one
            embedding vector per row. A ``similarities`` column is added to
            (or overwritten on) this DataFrame in place.
        query: Text to search for.
        n: Number of top rows to return.
        pprint: Currently unused; kept so existing callers keep working.

    Returns:
        A DataFrame with the ``n`` rows of highest similarity, sorted in
        descending order of the ``similarities`` column.
    """
    embedding = get_embedding(query, model='text-embedding-3-small')
    query_embedding_2d = [embedding]

    if len(vector_stores) == 0:
        # Nothing to compare against; still create the column so the result
        # has the same shape as in the non-empty case.
        scores = []
    else:
        # One cosine_similarity call over the whole matrix gives an
        # (n_rows, 1) result; take the single column so each row stores a
        # plain float instead of a nested (1, 1) array.
        embedding_matrix = np.array(list(vector_stores['ada_embedding']))
        scores = cosine_similarity(embedding_matrix, query_embedding_2d)[:, 0]

    vector_stores['similarities'] = scores

    k_search = vector_stores.sort_values('similarities', ascending=False).head(n)

    return k_search

In [None]:
# Example run: retrieve the 5 rows most similar to the query and print them.
query = "วิธีการทำ RAG"
answer = search_in_docs(vector_stores, query, 5)
print(answer)

# QA with LLM

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents.base import Document

In [None]:
def create_document_from_query(df):
    """Wrap every row of ``df`` in a ``Document``.

    For each row, the ``Content`` value becomes the document's page content
    and the ``ada_embedding`` value is stored under the ``embedding``
    metadata key.

    Args:
        df: DataFrame with ``Content`` and ``ada_embedding`` columns.

    Returns:
        A list of ``Document`` objects, one per row, in row order.
    """
    # A 'source' metadata entry built from a FileName column is currently
    # disabled; only the embedding is attached.
    return [
        Document(
            page_content=record['Content'],
            metadata={'embedding': record['ada_embedding']},
        )
        for _, record in df.iterrows()
    ]

In [None]:
def documents_question(query, vector_store=vector_stores, n=8):
    """Retrieve the rows most similar to ``query`` and wrap them as documents.

    Args:
        query: Text to search for.
        vector_store: DataFrame to search. The default is the module-level
            ``vector_stores`` object as it existed when this function was
            defined.
        n: Number of rows to retrieve.

    Returns:
        A ``(documents, search)`` tuple: the list built by
        ``create_document_from_query`` and the DataFrame returned by
        ``search_in_docs`` that it was built from.
    """
    top_rows = search_in_docs(vector_store, query, n)
    return create_document_from_query(top_rows), top_rows

### LLM

In [None]:
# Placeholder values passed as `project` and `location` to ChatAnthropicVertex;
# replace them with real settings before running.
project = "<project>"
location = "<location>"

In [None]:
# Chat model handed to the QA chain: Claude 3.5 Sonnet through
# ChatAnthropicVertex, configured with the project/location defined above.
llm = ChatAnthropicVertex(
    model_name="claude-3-5-sonnet@20240620",
    project=project,
    location=location,
    temperature=0.1,
    max_tokens=6046,
    timeout=None,
)

In [None]:
# Prompt text for the QA chain. It has two placeholders: {context} and
# {query}. The assistant instructions on the first line are an abbreviated
# placeholder.
template = """
    Your AI is named Wachi , and ...... (Assitant Prompt)
    
    {context}

    Question: {query}
    Answer:
    """

In [None]:
# The template contains two placeholders, {context} and {query}, so both are
# declared here. Listing only "query" leaves the declaration out of sync with
# the template, which template validation can reject.
prompt = PromptTemplate(input_variables=["context", "query"], template=template)

# "stuff" chain: the retrieved documents are inserted into the prompt
# (presumably via the {context} placeholder, the chain's default document
# variable; confirm against the installed LangChain version).
chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

## QA

In [None]:
def QA(query, chain=chain):
    """Answer ``query`` using documents retrieved from the vector store.

    Args:
        query: The question text; also used as the retrieval query.
        chain: Chain to run. The default is the module-level ``chain`` object
            as it existed when this function was defined.

    Returns:
        Whatever ``chain.run`` returns for the retrieved documents and query.
    """
    # Only the documents are needed here; the accompanying DataFrame is ignored.
    documents, _ = documents_question(query)
    return chain.run(input_documents=documents, query=query)

In [None]:
# Example run: ask the question end-to-end and print the chain's answer.
query = "วิธีการทำ RAG"
print(QA(query))