In [4]:
import os
import warnings

# Read the OpenAI key from the environment instead of hardcoding it in the
# notebook, so the secret never ends up in a saved or shared copy.
# Export OPENAI_API_KEY before starting the kernel.
# An already-exported key is left untouched (it is never overwritten with an
# empty placeholder).
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    # Surface the problem here rather than as an authentication error in a
    # later cell.
    warnings.warn(
        "OPENAI_API_KEY is not set; cells that call OpenAI are expected to fail."
    )

In [5]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Load the quotes file; `docs` holds the unsplit Document list.
loader = TextLoader("datasets/buffett_quotes.txt")
docs = loader.load()

# Splitter settings: break on newlines, size 500 with overlap 100, both
# measured with len().
splitter_options = {
    "separator": "\n",
    "chunk_size": 500,
    "chunk_overlap": 100,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# `documents` holds the chunked Documents used to build the vector indexes.
documents = text_splitter.split_documents(docs)
documents

[Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Only when the tide goes out do you discover who's been swimming naked.\nWarren Buffett: Price is what you pay. Value is what you get.\nWarren Buffett: When a management with a reputation for brilliance tackles a business with a reputation for bad economics, it is the reputation of the business that remains intact.\nWarren Buffett: Rule No.1: Never lose money. Rule No.2: Never forget rule No.1."),
 Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Rule No.1: Never lose money. Rule No.2: Never forget rule No.1.\nWarren Buffett: I don't look to jump over 7-foot bars: I look around for 1-foot bars that I can step over.\nWarren Buffett: I buy expensive suits. They just look cheap on me.\nWarren Buffett: Derivatives are financial weapons of mass destruction.\nWarren Buffett: Beware of geeks bearing formulas."),
 Document(metadata={'source': 'datasets/buffett_qu

In [6]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model shared by every vector store built in this notebook.
embed_model = OpenAIEmbeddings(
    model='text-embedding-3-small',
    api_key=api_key,
)

In [10]:
# Use Chroma as the vector store.
from langchain_community.vectorstores import Chroma

# Embed the chunks with `embed_model` and index them in Chroma.
vector_index = Chroma.from_documents(documents=documents, embedding=embed_model)

# Run a plain similarity search and show the text of the best match.
retrieved = vector_index.similarity_search("What is MistralAI?")
top_hit = retrieved[0]
top_hit.page_content

"Warren Buffett: I sent one e-mail in my life. I sent it to Jeff Raikes at Microsoft, and it ended up in court in Minneapolis, so I am one for one.\nWarren Buffett: The best thing I did was to choose the right heroes.\nWarren Buffett: I bought a company in the mid-'90s called Dexter Shoe and paid $400 million for it. And it went to zero. And I gave about $400 million worth of Berkshire stock, which is probably now worth $400 billion. But I've made lots of dumb decisions. That's part of the game."

In [14]:
from langchain_community.vectorstores import FAISS

# Build the same index with FAISS; this rebinds `vector_index`.
vector_index = FAISS.from_documents(documents=documents, embedding=embed_model)

# Same query as the Chroma cell; show the text of the best match.
retrieved = vector_index.similarity_search("What is MistralAI?")
top_hit = retrieved[0]
top_hit.page_content

"Warren Buffett: I sent one e-mail in my life. I sent it to Jeff Raikes at Microsoft, and it ended up in court in Minneapolis, so I am one for one.\nWarren Buffett: The best thing I did was to choose the right heroes.\nWarren Buffett: I bought a company in the mid-'90s called Dexter Shoe and paid $400 million for it. And it went to zero. And I gave about $400 million worth of Berkshire stock, which is probably now worth $400 billion. But I've made lots of dumb decisions. That's part of the game."

In [16]:
# Question reused by the retriever examples that follow.
query = "OpenAI의 sora모델에 대해 알려줘"

In [31]:
# Ways to configure a retriever on top of the vector store.
# Only the last expression of the cell is displayed; the earlier calls run
# but their results are discarded.

# similarity search: finds the items closest to the query in vector space.
# mmr (maximal marginal relevance): reduces redundancy and returns more
# diverse results (minimise query-to-document distance, maximise
# document-to-document distance).
retriever = vector_index.as_retriever(search_type="mmr")
retriever.invoke(query)

# similarity_score_threshold: minimum similarity score a result must reach;
# items scoring below the threshold are left out of the results.
retriever = vector_index.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.8},
)
retriever.invoke(query)

# top-k: maximum number of results to return, set through `k` in search_kwargs.
retriever = vector_index.as_retriever(search_kwargs={"k": 3})
retriever.invoke(query)

# Combined: score threshold plus a cap on the number of results.
# `search_type` takes a single value, so MMR cannot be combined with a score
# threshold in one retriever. The result cap must be passed as `k` inside
# `search_kwargs`; `mmr=...` / `top_k=...` are not as_retriever options.
retriever = vector_index.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.8, "k": 3},
)
retriever.invoke(query)



[]

# Multi Query retriever
* 입력 질문을 여러 개의 쿼리로 변환한 후 검색

In [32]:
import logging  # used to trace and debug events raised by parts of the application

# Apply the default logging configuration.
logging.basicConfig()

# Set the multi-query retriever's logger to INFO so its INFO records
# (such as the generated queries) are emitted.
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [33]:
from langchain_openai import ChatOpenAI
# Chat model handed to the multi-query retriever in the next cell.
# temperature=0 is requested, presumably to keep generations as repeatable as possible.
llm = ChatOpenAI(temperature=0)

In [34]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Wrap the default retriever of the current vector index; `llm` is the model
# the multi-query retriever is built from.
base_retriever = vector_index.as_retriever()
retriever_multi = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=llm)

In [35]:
# Use the Runnable `invoke` API (as the later retriever cells do) instead of
# the deprecated `get_relevant_documents`, and reuse the shared `query`.
retriever_multi.invoke(query)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. sora 모델에 대한 OpenAI의 정보를 알려줄래?', '2. OpenAI에서 개발한 sora 모델에 대해 자세히 설명해줄 수 있나요?', '3. sora 모델과 OpenAI의 관련성에 대해 알려주세요.']


[Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Beware of geeks bearing formulas.\nWarren Buffett: The business schools reward difficult complex behavior more than simple behavior, but simple behavior is more effective.\nWarren Buffett: When you combine ignorance and leverage, you get some pretty interesting results.\nWarren Buffett: Only buy something that you'd be perfectly happy to hold if the market shut down for 10 years."),
 Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Rule No.1: Never lose money. Rule No.2: Never forget rule No.1.\nWarren Buffett: I don't look to jump over 7-foot bars: I look around for 1-foot bars that I can step over.\nWarren Buffett: I buy expensive suits. They just look cheap on me.\nWarren Buffett: Derivatives are financial weapons of mass destruction.\nWarren Buffett: Beware of geeks bearing formulas."),
 Document(metadata={'source': 'datasets/buffett_quotes.txt'}, pa

# Parent Document Retriever
* 주어진 쿼리에 대해 관련 있는 문서 또는 텍스트 블록을 효율적으로 검색
* 대규모 문서 집합에서 특정 쿼리에 대한 적절한 문서를 찾는 데 효과적
* 문서 집합을 계층적 구조로 인덱싱
* 쿼리에 가장 관련성이 높은 상위 레벨 문서를 검색
* 쿼리에 가장 적합한 상위 문서(Parent Document)를 반환하여 사용자가 필요한 정보를 포함하는 문서 부분 전체를 볼 수 있게 함

In [47]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

# In-memory docstore handed to the retriever for the parent documents.
store = InMemoryStore()

# Two granularities: larger parent chunks and smaller child chunks.
# NOTE(review): chunk_overlap is not passed, so each splitter uses the library
# default — confirm that default is sensible next to chunk_size=200.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

# NOTE: this rebinds `vector_index` (previously the FAISS index) to a new,
# initially empty Chroma collection.
vector_index = Chroma(collection_name='split_parents', embedding_function=embed_model)

retriever = ParentDocumentRetriever(
    vectorstore=vector_index,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Index the unsplit source documents; the retriever does its own splitting.
# NOTE(review): re-running this cell calls add_documents again on the same
# collection name — confirm whether that duplicates entries.
retriever.add_documents(docs)

# Use the Runnable `invoke` API (as the later cells do) instead of the
# deprecated `get_relevant_documents`, and reuse the shared `query`.
retriever.invoke(query)

[Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Americans are in a cycle of fear which leads to people not wanting to spend and not wanting to make investments, and that leads to more fear. We'll break out of it. It takes time.\nWarren Buffett: If anything, taxes for the lower and middle class and maybe even the upper middle class should even probably be cut further. But I think that people at the high end - people like myself - should be paying a lot more in taxes. We have it better than we've ever had it.\nWarren Buffett: I think the most important factor in getting out of the recession actually is just the regenerative capacity of - of American capitalism.\nWarren Buffett: I would say the most satisfying thing actually is watching my three children each pick up on their own interests and work many more hours per week than most people that have jobs at trying to intelligently give away that money in fields that they particularly care about.

# GENERATOR

In [49]:
from langchain_openai import ChatOpenAI

# Rebind `llm`, this time naming the chat model explicitly.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

# Ask the model directly, with no retrieved context.
llm.invoke("OpenAI의 sora모델에 대해 알려줘")

AIMessage(content='OpenAI의 sora 모델은 자연어 처리 모델 중 하나로, 특히 한국어에 특화된 모델입니다. 이 모델은 GPT-3와 같은 대규모 언어 모델을 기반으로 하며, 한국어 텍스트를 생성하고 이해하는 데 사용됩니다.\n\nsora 모델은 한국어 자연어 처리 분야에서 다양한 작업에 활용될 수 있으며, 특히 대화 시스템, 번역, 요약, 질문 응답 등의 작업에 적합합니다. 이 모델은 한국어 데이터를 학습하여 다양한 문제를 해결할 수 있는 능력을 갖추고 있습니다.\n\n또한 sora 모델은 OpenAI가 공개한 다른 모델과 마찬가지로 공개적으로 사용 가능하며, 개발자들이 이 모델을 활용하여 자연어 처리 관련 프로젝트를 진행할 수 있습니다. 이를 통해 한국어 자연어 처리 기술의 발전에 기여할 수 있습니다.', response_metadata={'token_usage': {'completion_tokens': 321, 'prompt_tokens': 25, 'total_tokens': 346}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-645a09d4-c3f4-44ea-bd15-101eba141520-0', usage_metadata={'input_tokens': 25, 'output_tokens': 321, 'total_tokens': 346})

# CHAIN

In [50]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Smallest possible chain: a fixed prompt piped into the chat model.
prompt = ChatPromptTemplate.from_template("Tell me something")
chain = (
    prompt
    | llm
)

# The template has no placeholders, so an empty input dict is enough.
chain.invoke({})

AIMessage(content="Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible. Honey's low water content and acidic pH make it resistant to bacteria and spoilage.", response_metadata={'token_usage': {'completion_tokens': 52, 'prompt_tokens': 10, 'total_tokens': 62}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-2cc815fe-501f-42cb-abb7-dde021c42e60-0', usage_metadata={'input_tokens': 10, 'output_tokens': 52, 'total_tokens': 62})

In [51]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Same chain as before with a string output parser appended as the last stage.
prompt = ChatPromptTemplate.from_template("Tell me something")
chain = (
    prompt
    | llm
    | StrOutputParser()
)

# The template has no placeholders, so an empty input dict is enough.
chain.invoke({})

"Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible. Honey's high sugar content and low moisture levels create an environment that is inhospitable to bacteria and microorganisms, allowing it to remain preserved indefinitely."

In [52]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt with a `{topic}` placeholder that is filled from the input dict.
prompt = ChatPromptTemplate.from_template("Tell me something about {topic}")
chain = (
    prompt
    | llm
    | StrOutputParser()
)

chain.invoke({'topic': '디아블로4'})

'디아블로 4는 블리자드 엔터테인먼트가 개발 중인 액션 RPG 게임으로, 디아블로 시리즈의 네 번째 작품입니다. 이 게임은 공식적으로 발표되었으며, 플레이어들은 공포와 어둠이 넘치는 고어한 판타지 세계를 탐험하며 몬스터를 물리치고 보물을 찾아야 합니다. 디아블로 4는 멀티플레이어 기능을 강화하고, 더욱 현실적인 그래픽과 다양한 캐릭터 클래스를 제공하여 플레이어들에게 더욱 흥미로운 경험을 제공할 것으로 기대됩니다. 아직 출시일은 공개되지 않았지만, 많은 팬들이 기대하고 있습니다.'

In [None]:
# Equivalent form 1: run each stage of the chain above by hand,
# feeding every stage's output into the next one.
parser = StrOutputParser()
prompt_formatted = prompt.invoke({'topic': '디아블로4'})
model_output = llm.invoke(prompt_formatted)
parser.invoke(model_output)

# Equivalent form 2: compose the stages inline and invoke the result.
(prompt | llm | parser).invoke({'topic': '디아블로4'})

In [55]:
# Chain idea: retrieve documents for the question, clean them up, and ask the LLM.
# However, only the retriever result is passed on, so the answer is poor
# => use RunnableParallel instead (shown in the following cells).
retrieved_docs = retriever.invoke("디아블로4")
def merge_docs(retrieved_docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = (doc.page_content for doc in retrieved_docs)
    return "\n\n".join(contents)
# The question is consumed by the retriever; only the merged document text
# is forwarded to the LLM.
context_only_chain = retriever | merge_docs | llm | StrOutputParser()
context_only_chain.invoke("Tell me something about 디아블로4")

"Warren Buffett: Risk comes from not knowing what you're doing.\nWarren Buffett: Price is what you pay. Value is what you get.\nWarren Buffett: It takes 20 years to build a reputation and five minutes to ruin it. If you think about that, you'll do things differently.\nWarren Buffett: The stock market is designed to transfer money from the active to the patient.\nWarren Buffett: The most important quality for an investor is temperament, not intellect. You need a temperament that neither derives great pleasure from being with the crowd or against the crowd.\nWarren Buffett: In the business world, the rearview mirror is always clearer than the windshield.\nWarren Buffett: It's far better to buy a wonderful company at a fair price than a fair company at a wonderful price."

In [57]:
# RunnableParallel: runs several runnables on the same input in parallel.
from langchain_core.runnables import RunnableParallel

# The same input string goes to both the retriever and the LLM; the result
# is a dict with one entry per branch.
chain_parallel = RunnableParallel(context=retriever, llm=llm)
chain_parallel.invoke("Tell me something about 디아블로4")

{'context': [Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: The best thing I did was to choose the right heroes.\nWarren Buffett: I bought a company in the mid-'90s called Dexter Shoe and paid $400 million for it. And it went to zero. And I gave about $400 million worth of Berkshire stock, which is probably now worth $400 billion. But I've made lots of dumb decisions. That's part of the game.\nWarren Buffett: The first rule is not to lose. The second rule is not to forget the first rule.\nWarren Buffett: We believe that according the name 'investors' to institutions that trade actively is like calling someone who repeatedly engages in one-night stands a 'romantic.'\nWarren Buffett: Why not invest your assets in the companies you really like? As Mae West said, 'Too much of a good thing can be wonderful'.\nWarren Buffett: Someone is sitting in the shade today because someone planted a tree a long time ago.\nWarren Buffett: Chains of habit are t

In [58]:
# RunnablePassthrough: forwards its input unchanged as its output.
from langchain_core.runnables import RunnablePassthrough

# "context" gets the retrieved documents, "query" gets the original input.
chain_parallel = RunnableParallel(context=retriever, query=RunnablePassthrough())
chain_parallel.invoke("Tell me something about 디아블로4")

{'context': [Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: The best thing I did was to choose the right heroes.\nWarren Buffett: I bought a company in the mid-'90s called Dexter Shoe and paid $400 million for it. And it went to zero. And I gave about $400 million worth of Berkshire stock, which is probably now worth $400 billion. But I've made lots of dumb decisions. That's part of the game.\nWarren Buffett: The first rule is not to lose. The second rule is not to forget the first rule.\nWarren Buffett: We believe that according the name 'investors' to institutions that trade actively is like calling someone who repeatedly engages in one-night stands a 'romantic.'\nWarren Buffett: Why not invest your assets in the companies you really like? As Mae West said, 'Too much of a good thing can be wonderful'.\nWarren Buffett: Someone is sitting in the shade today because someone planted a tree a long time ago.\nWarren Buffett: Chains of habit are t

In [66]:
template = """
Utilizing the context given below, answer the question.
[context]
{context}

question: {query}
"""

prompt = ChatPromptTemplate.from_template(template)

# Full RAG chain: fetch context and pass the question through in parallel,
# fill the prompt, call the LLM, and return the answer as a plain string.
chain = (
    RunnableParallel(
        {
            # Render the retrieved Documents as plain text with merge_docs.
            # Passing the raw list would put its Python repr (metadata,
            # escaped newlines) into the prompt.
            "context": retriever | merge_docs,
            "query": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [67]:
# Invoke `chain` with the question; the result is shown as the cell output.
chain.invoke("Tell me something about 디아블로4")

"I'm sorry, but there is no information provided in the context about Diablo 4. Would you like to ask about something else?"

In [64]:
# Stop at the prompt stage to inspect exactly what would be sent to the LLM.
chain = (
    RunnableParallel(context=retriever, query=RunnablePassthrough())
    | prompt
)
chain.invoke("Tell me something about 디아블로4")

ChatPromptValue(messages=[HumanMessage(content='\nUtilizing the context given below, answer the question.\n[context]\n[Document(metadata={\'source\': \'datasets/buffett_quotes.txt\'}, page_content="Warren Buffett: The best thing I did was to choose the right heroes.\\nWarren Buffett: I bought a company in the mid-\'90s called Dexter Shoe and paid $400 million for it. And it went to zero. And I gave about $400 million worth of Berkshire stock, which is probably now worth $400 billion. But I\'ve made lots of dumb decisions. That\'s part of the game.\\nWarren Buffett: The first rule is not to lose. The second rule is not to forget the first rule.\\nWarren Buffett: We believe that according the name \'investors\' to institutions that trade actively is like calling someone who repeatedly engages in one-night stands a \'romantic.\'\\nWarren Buffett: Why not invest your assets in the companies you really like? As Mae West said, \'Too much of a good thing can be wonderful\'.\\nWarren Buffett: 

In [65]:
# Same chain written with a plain dict instead of an explicit RunnableParallel.
# When a dict is composed with a runnable via `|`, LangChain coerces it to a
# RunnableParallel, so this behaves like the explicit form.
inputs_as_dict = {"context": retriever, "query": RunnablePassthrough()}
chain = inputs_as_dict | prompt
chain.invoke("Tell me something about 디아블로4")

ChatPromptValue(messages=[HumanMessage(content='\nUtilizing the context given below, answer the question.\n[context]\n[Document(metadata={\'source\': \'datasets/buffett_quotes.txt\'}, page_content="Warren Buffett: The best thing I did was to choose the right heroes.\\nWarren Buffett: I bought a company in the mid-\'90s called Dexter Shoe and paid $400 million for it. And it went to zero. And I gave about $400 million worth of Berkshire stock, which is probably now worth $400 billion. But I\'ve made lots of dumb decisions. That\'s part of the game.\\nWarren Buffett: The first rule is not to lose. The second rule is not to forget the first rule.\\nWarren Buffett: We believe that according the name \'investors\' to institutions that trade actively is like calling someone who repeatedly engages in one-night stands a \'romantic.\'\\nWarren Buffett: Why not invest your assets in the companies you really like? As Mae West said, \'Too much of a good thing can be wonderful\'.\\nWarren Buffett: 