In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda



llm = ChatOpenAI(temperature=0.1)
cache_dir = LocalFileStore("./.cache/")

# splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     separator="\n",
#     chunk_size=600,
#     chunk_overlap=100,
# )
# Split by newline, treating each line as a separate chunk
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust based on your requirements
    chunk_overlap=50,  # Overlap between chunks
    separators=["\n"]  # Split based on newlines (you can add other separators too)
)

loader = UnstructuredExcelLoader("./ETC(yschang-Total).xlsx")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = Chroma.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    documents = inputs["documents"]
    question = inputs["question"]
    results =[]
    for doc in documents:
        result =map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content
        print("======doc======> \n\n", doc.page_content)
        print("result ===> \n\n", result)
        results.append(result)
    results = "\n\n".join(results)
    return results

# def map_docs(inputs):
#     documents = inputs["documents"]
#     question = inputs["question"]
#     return "\n\n".join(
#         map_doc_chain.invoke(
#             {"context": doc.page_content, "question": question}
#         ).content
#         for doc in documents
#     )

map_chain = {"documents": retriever, "question":RunnablePassthrough()} | RunnableLambda(map_docs)

final_prompt = ChatPromptTemplate.from_messages([
    ("system", 
     """
     Given the following extracted parts of a long document and a question, create a final answer.
     I you don't know the answer, just say that you don't know. Don't try to make up an answer."
     --------
     {context}
     """
     ),
    ("human", "{question}")
])
chain = {"context": map_chain, "question":RunnablePassthrough()} | final_prompt | llm 

chain.invoke("Could you find expression for people character words?")


 (= tears me up inside)
ETC(2010-H1)


127
Do I need to read find print or Have you told me everything?

ETC(2010-H1)


128
fire away
go ahead
ETC(2010-H1)


129
Close but no cigar
(implied failure)
ETC(2010-H1)


130
just shy of 2 years
just under 2 years
ETC(2010-H1)


131
pull an all-nighter

ETC(2010-H1)


132
be a workhorse

ETC(2010-H1)


133
be stress-out

ETC(2010-H1)


134
pass the half way point

ETC(2010-H1)


135
Keep my finger crossed for something

ETC(2010-H1)


136
commencing
usally graduation, commencement ceremony
ETC(2010-H1)


137
you wanna bet? / you are on?

ETC(2010-H1)


138
something is frowned upon

ETC(2010-H1)


139
morals and values

ETC(2010-H1)


140
moraly wrong

ETC(2010-H1)


141
to shot down

ETC(2010-H1)


142
ran through my head

ETC(2010-H1)


143
moral dilema

ETC(2010-H1)


144
where do you draw the line

ETC(2010-H1)


145
Play the devil's advocate

ETC(2010-H1)


146
for the sake of conversation

ETC(2010-H1)


147
to do something for my sake


AIMessage(content="I don't know.")

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda



llm = ChatOpenAI(temperature=0.1)
cache_dir = LocalFileStore("./.cache/")

# splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     separator="\n",
#     chunk_size=600,
#     chunk_overlap=100,
# )
# Split by newline, treating each line as a separate chunk
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust based on your requirements
    chunk_overlap=50,  # Overlap between chunks
    separators=["\n"]  # Split based on newlines (you can add other separators too)
)

loader = UnstructuredExcelLoader("./ETC(yschang-Total).xlsx")
docs = loader.load_and_split(text_splitter=splitter)


In [5]:
print(docs)

[Document(page_content="Seq\nIdiom\nDescription\nUnnamed: 3\n\n\n\n\n1\nTell on me\n\nETC(2010-H1)\n\n\n2\nbutter someone up\n(~을 두고) 몹시 자책하다\nETC(2010-H1)\n\n\n3\nput bread on the table\n\nETC(2010-H1)\n\n\n4\nget it over with\n(idiomatic) To do or finish, especially said of something unpleasant. \\nHe didn’t want to go to the doctor for his shots, but he decided it would be better just to get it over with\nETC(2010-H1)\n\n\n5\nbeat yourself up\n\nETC(2010-H1)\n\n\n6\nblow my top\nangry at someone\\nsomeone make me blow my top\nETC(2010-H1)\n\n\n7\nicing on the cake\nextra benefit\nETC(2010-H1)\n\n\n8\nnerve wracking\n\nETC(2010-H1)\n\n\n9\nvedge out\n\nETC(2010-H1)\n\n\n10\nI almost had me\n\nETC(2010-H1)\n\n\n11\nlet it slide\nget your mind off\nETC(2010-H1)\n\n\n12\nclear the air\n\nETC(2010-H1)\n\n\n13\nto go above and beyond call of duty\n본인의 의무 이상으로 훌륭히 처리해내는\nETC(2010-H1)\n\n\n14\nstole my thunder\n\nETC(2010-H1)\n\n\n15\nbe harsh on\n(?)\nETC(2010-H1)\n\n\n16\nit's on tip out 