<a href="https://colab.research.google.com/github/variouscafe/variouscafe.github.io/blob/master/building_a_simple_vectorstore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 키 설정

In [None]:
import os

from google.colab import userdata
# Copy the value stored under 'OPENAI_API_KEY' in Colab's userdata into the
# process environment so later cells can find it there.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')


# 패키지 설치

In [None]:
!pip install openai # Install the openai library.
!pip install langchain # Install the LangChain library.
!pip install tqdm # progress bars
!pip install chromadb # vector store
!pip install tiktoken # token counting
!pip install sentence-transformers # presumably required by the HuggingFace embeddings used later - confirm

Collecting openai
  Downloading openai-1.14.3-py3-none-any.whl (262 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.4 ht

# RAG를 위한 파일 준비

In [None]:

import urllib.request

# Download the sample text and save it in the current working directory as
# state_of_the_union.txt. The file is fetched again every time this cell runs.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt",
    filename="state_of_the_union.txt"
)

# 랭체인 기반 벡터스토어 구축

In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the text file, split it into chunks, and index the chunks in Chroma.
raw_documents = TextLoader('state_of_the_union.txt').load()
# chunk_size=1000 with no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
# Builds the store from the chunks using OpenAIEmbeddings
# (presumably calls the OpenAI API for each chunk - needs the key set above).
db = Chroma.from_documents(documents, OpenAIEmbeddings())

In [None]:
# Display every chunk produced by the splitter.
# NOTE(review): this renders the whole list; the following cells show its length and a slice.
documents

In [None]:
# Number of chunks produced by the splitter.
len(documents)

In [None]:
# Peek at the first four chunks.
documents[0:4]

In [None]:
# Query the Chroma store with plain text and print the text of the first hit.
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
print(docs[0].page_content)

In [None]:
# Same lookup in two explicit steps: embed the query ourselves, then search
# the store by vector instead of by text.
embedding_vector = OpenAIEmbeddings().embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)

In [None]:
# Length of the query embedding, i.e. how many numbers represent the query.
len(embedding_vector)

In [None]:
# Display the raw embedding values.
# NOTE(review): this renders the entire vector; a short slice may be enough.
embedding_vector

In [None]:
from tqdm import tqdm

# SimpleTextLoader 구현해보기

In [None]:
class SimpleTextLoader:
    """Minimal text loader: reads one UTF-8 file into a single string."""

    def __init__(self, file_path):
        # The path is only stored here; the file is not opened until load().
        self.file_path = file_path

    def load(self):
        """Return the full contents of ``self.file_path`` decoded as UTF-8."""
        with open(self.file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()

# SimpleCharacterTextSplitter 구현해보기

In [None]:
class SimpleCharacterTextSplitter:
    """Greedy separator-based text splitter.

    The text is cut on ``separator_pattern`` and the resulting pieces are
    merged back together, in order, until adding the next piece would push
    the current chunk past ``chunk_size`` characters.

    Args:
        chunk_size: Soft upper bound on chunk length, in characters. A single
            piece longer than this is kept whole as its own chunk.
        chunk_overlap: Stored on the instance but not applied by this simple
            implementation; consecutive chunks never share text.
        separator_pattern: Literal string (not a regex) the text is split on.
    """

    def __init__(self, chunk_size, chunk_overlap, separator_pattern='\n\n'):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separator_pattern = separator_pattern

    def split_documents(self, documents):
        """Split one text string into a list of stripped, non-empty chunks.

        Args:
            documents: The full text to split (a single ``str``, despite the
                plural name).

        Returns:
            list[str]: Chunks in document order, each stripped of surrounding
            whitespace. Chunks that are empty after stripping are dropped.
        """
        separator = self.separator_pattern
        pieces = documents.split(separator)

        chunks = []

        def flush(chunk):
            # Blank chunks show up when the text starts with the separator or
            # contains consecutive separators next to a large piece. They carry
            # no content and make useless embedding inputs, so skip them.
            chunk = chunk.strip()
            if chunk:
                chunks.append(chunk)

        current_chunk = pieces[0]

        for piece in tqdm(pieces[1:], desc="splitting..."):
            # Length the chunk would have if this piece were glued on.
            if len(current_chunk) + len(piece) + len(separator) > self.chunk_size:
                flush(current_chunk)
                current_chunk = piece
            else:
                current_chunk += separator + piece

        flush(current_chunk)

        return chunks

# SimpleOpenAIEmbeddings 구현해보기

In [None]:
from openai import OpenAI

class SimpleOpenAIEmbeddings:
    """Thin wrapper around the OpenAI embeddings endpoint.

    Args:
        model: Name of the embedding model to request.
        client: Optional pre-built client exposing ``embeddings.create``.
            When omitted, an ``OpenAI()`` client is created on first use and
            reused for every later call.
    """

    def __init__(self, model="text-embedding-ada-002", client=None):
        self.model = model
        self._client = client

    def _get_client(self):
        """Return the cached client, creating it on first use."""
        if self._client is None:
            # Built lazily and only once: embedding a whole document calls
            # embed_query per chunk, and a fresh client per call is wasted work.
            self._client = OpenAI()
        return self._client

    def embed_query(self, text):
        """Embed one piece of text.

        Args:
            text: The input passed to the embeddings endpoint.

        Returns:
            The ``embedding`` field of the first item in the response data.
        """
        response = self._get_client().embeddings.create(
            input=text,
            model=self.model
        )
        return response.data[0].embedding

# SimpleVectorStore 구현해보기

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SimpleVectorStore:
    """In-memory vector store that keeps each document next to its embedding.

    Args:
        docs: Iterable of documents; each one is passed to
            ``embedding.embed_query`` as-is.
        embedding: Object exposing ``embed_query(text)`` that returns one
            vector per call.
    """

    def __init__(self, docs, embedding):
        self.embedding = embedding
        self.documents = []
        self.vectors = []

        # One embed_query call per document; documents[i] pairs with vectors[i].
        for doc in tqdm(docs, desc="embedding..."):
            vector = self.embedding.embed_query(doc)
            self.documents.append(doc)
            self.vectors.append(vector)

    def similarity_search(self, query, k=4):
        """Return the ``k`` stored documents most similar to ``query``.

        Args:
            query: Text to embed and compare against the stored vectors.
            k: Maximum number of results. ``None`` returns every document;
                zero or a negative value returns an empty list.

        Returns:
            list[tuple]: ``(document, similarity)`` pairs sorted by descending
            cosine similarity. Empty when the store holds no vectors.
        """
        # Decide this before embedding the query: with nothing to compare
        # against (or nothing requested) the embedding call would be wasted.
        if not self.vectors or (k is not None and k <= 0):
            return []

        query_vector = self.embedding.embed_query(query)

        # cosine_similarity returns a 1 x N matrix; row 0 holds one score per document.
        similarities = cosine_similarity([query_vector], self.vectors)[0]
        ranked = sorted(zip(self.documents, similarities), key=lambda pair: pair[1], reverse=True)

        return ranked[:k]

    def as_retriever(self, k=4):
        """Wrap this store in a ``SimpleRetriever`` that returns ``k`` results per query."""
        return SimpleRetriever(self, k)

# SimpleRetriever 구현해보기

In [None]:
class SimpleRetriever:
    """Adapter that exposes a store's search through a retriever-style method."""

    def __init__(self, vector_store, k=4):
        # k is fixed per retriever and forwarded on every lookup.
        self.k = k
        self.vector_store = vector_store

    def get_relevant_documents(self, query):
        """Return whatever ``vector_store.similarity_search(query, k)`` yields."""
        return self.vector_store.similarity_search(query, self.k)

In [None]:
# Rebuild the same pipeline as above, now with the hand-written Simple* classes.
raw_documents = SimpleTextLoader('state_of_the_union.txt').load()  # a single str
text_splitter = SimpleCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)  # list of str chunks
# The store embeds every chunk on construction (one embed_query call per chunk).
db = SimpleVectorStore(documents, SimpleOpenAIEmbeddings())

In [None]:
# Number of chunks produced by SimpleCharacterTextSplitter.
len(documents)

In [None]:
# Peek at the first four chunks (plain strings here).
documents[0:4]

In [None]:
# Run the same English query against the hand-written store (default k=4).
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)

In [None]:
# Show the (chunk, similarity) pairs returned by the search, best match first.
docs

In [None]:
# Each hit is a (chunk, similarity) tuple; print the text of the best match.
print(docs[0][0])

# 한글 벡터스토어

# 헌법 예시

In [None]:
import urllib.request

# Download the Korean text file (the URL path is a percent-encoded Korean
# file name) and save it in the current working directory as
# korea_constitution.txt. The file is fetched again every time this cell runs.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/puzzlet/constitution-kr/master/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%20%ED%97%8C%EB%B2%95.txt",
    filename="korea_constitution.txt"
)

In [None]:
# Load the Korean text and split it with the same settings used for the
# English example (chunk_size=1000, no overlap).
raw_documents = SimpleTextLoader('korea_constitution.txt').load()
text_splitter = SimpleCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Display every chunk of the Korean text.
# NOTE(review): this renders the whole list; a slice may be enough.
documents

In [None]:
# Embed every chunk and keep the vectors in memory (one embed_query call per chunk).
db = SimpleVectorStore(documents, SimpleOpenAIEmbeddings())

In [None]:
# Korean query: "What is the president's term of office?"
query = "대통령 임기는?"
docs = db.similarity_search(query)

In [None]:
# Show the (chunk, similarity) pairs returned by the search, best match first.
docs

# 벡터스토어 튜닝하기

In [None]:
# Tuning step: shrink chunk_size from 1000 to 100 so each chunk covers less text.
raw_documents = SimpleTextLoader('korea_constitution.txt').load()
text_splitter = SimpleCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Peek at the first ten of the smaller chunks.
documents[0:10]

In [None]:
# Rebuild the store from the smaller chunks; every chunk is embedded again.
db = SimpleVectorStore(documents, SimpleOpenAIEmbeddings())

In [None]:
# Korean query: "How many years is the president's term?"
query = "대통령 임기는 몇 년인가?"
docs = db.similarity_search(query)

In [None]:
# Show the (chunk, similarity) pairs for the longer phrasing of the question.
docs

In [None]:
# Same question in its shorter phrasing ("What is the president's term of office?").
query = "대통령 임기는?"
docs = db.similarity_search(query)

In [None]:
# Show the (chunk, similarity) pairs for the shorter phrasing of the question.
docs

# 검색기 만들기

In [None]:
# Wrap the current store in a SimpleRetriever (default k=4).
retriever = db.as_retriever()

In [None]:
# Korean query: "How many years is the president's term?"
unique_docs = retriever.get_relevant_documents(query="대통령의 임기는 몇 년인가?")

In [None]:
# Show the (chunk, similarity) pairs returned through the retriever.
unique_docs

# 챗봇 만들기

In [None]:
import openai

# System prompt for the QA chain; "{content}" is replaced with the retrieved text.
system_prompt_template = ("You are a helpful assistant. "
                          "Based on the following content, "
                          "kindly and comprehensively respond to user questions. write in Korean.\n\n"
                          "[Content]\n"
                          "{content}")


class SimpleRetrievalQA():
    """Retrieval-augmented question answering on top of a retriever.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)`` that
            returns a sequence of ``(text, score)`` pairs.
        model: Name of the chat model used to generate the answer.
    """

    def __init__(self, retriever, model="gpt-3.5-turbo"):
        self.retriever = retriever
        self.model = model

    def invoke(self, query):
        """Answer ``query`` using the retrieved documents as context.

        Prints each retrieved document with its score, then sends the
        documents' text and the question to the chat model.

        Args:
            query: The user's question.

        Returns:
            The content of the first choice's message in the completion.
        """
        docs = self.retriever.get_relevant_documents(query)

        # Show what was retrieved so the reader can judge the answer against it.
        for i, doc in enumerate(docs):
            print("[#" + str(i) + "]", doc[1])
            print(doc[0])

        # Only the chunk text belongs in the prompt. Formatting the raw list
        # would inject Python repr noise: tuple syntax, escaped newlines and
        # the similarity scores.
        content = "\n\n".join(str(doc[0]) for doc in docs)

        completion = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt_template.format(content=content)},
                {"role": "user", "content": query}
            ]
        )

        return completion.choices[0].message.content

In [None]:
# Build the QA chain on top of the current retriever and ask one question.
chain = SimpleRetrievalQA(retriever)

# Korean query: "What is the president's term of office?"
answer = chain.invoke("대통령의 임기는?")

print(">> ", answer)

In [None]:
# Rebuild the chain on the same retriever and ask a second question.
chain = SimpleRetrievalQA(retriever)

# Korean query: "Can the president serve more than one term?"
answer = chain.invoke("대통령은 중임할 수 있나요?")

print(">> ", answer)

In [None]:
def chat_with_user(user_message):
    """Send one user message through the global ``chain`` and return its answer."""
    # `chain` is looked up at call time, so this uses whichever chain the most
    # recently executed cell assigned.
    return chain.invoke(user_message)

# Minimal chat loop: type "quit" (any case) to stop.
while True:
    try:
        user_message = input("USER > ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted at the prompt:
        # end the chat instead of surfacing a traceback.
        break
    if not user_message:
        # Nothing typed; don't spend an API call on an empty question.
        continue
    if user_message.lower() == "quit":
        break
    ai_message = chat_with_user(user_message)
    print(f" A I > {ai_message}")

In [None]:
# Same question as before, but retrieve only the top 3 chunks as context.
retriever = db.as_retriever(k=3)
chain = SimpleRetrievalQA(retriever)
answer = chain.invoke("대통령의 임기는?")

print(">> ", answer)

# 로컬 임베딩 모델

In [None]:
import langchain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
raw_documents = SimpleTextLoader('korea_constitution.txt').load()
# chunk_size=10 is so small that two pieces are merged only when their
# combined length is at most 8 characters, so in practice almost every
# separator-delimited piece becomes its own chunk.
text_splitter = SimpleCharacterTextSplitter(chunk_size=10, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

# Embedding model loaded through LangChain's HuggingFace wrapper
# (presumably downloaded and run locally - confirm).
embed_model = HuggingFaceEmbeddings(model_name="jhgan/ko-sbert-sts")

# SimpleVectorStore only calls embed_query, so any object with that method can be plugged in.
db = SimpleVectorStore(documents, embed_model)

In [None]:
# Korean query: "What is the president's term of office?"
query = "대통령의 임기는?"
docs = db.similarity_search(query)

In [None]:
# Show the (chunk, similarity) pairs found with the HuggingFace embeddings.
docs

In [None]:
# Retrieve the top 5 chunks from the HuggingFace-embedded store; the answer
# itself is still generated by SimpleRetrievalQA's chat model.
retriever = db.as_retriever(k=5)
chain = SimpleRetrievalQA(retriever)
answer = chain.invoke("대통령의 임기는?")

print(">> ", answer)

In [None]:
def chat_with_user(user_message):
    """Send one user message through the global ``chain`` and return its answer."""
    # Same helper as in the earlier chat cell, kept here so this cell also works
    # when that one was skipped. `chain` is looked up at call time.
    return chain.invoke(user_message)

# Minimal chat loop: type "quit" (any case) to stop.
while True:
    try:
        user_message = input("USER > ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted at the prompt:
        # end the chat instead of surfacing a traceback.
        break
    if not user_message:
        # Nothing typed; don't spend an API call on an empty question.
        continue
    if user_message.lower() == "quit":
        break
    ai_message = chat_with_user(user_message)
    print(f" A I > {ai_message}")

# 다른 검색기 사용하기

In [None]:
!pip install duckduckgo-search # provides the duckduckgo_search.DDGS client imported in the next cell

In [None]:
from duckduckgo_search import DDGS

# Try the search client on its own: request at most 5 text results for one
# query and collect them into a list.
with DDGS() as ddgs:
    results = [r for r in ddgs.text("2024 small llm?", max_results=5)]

print(results)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SimpleWebSearch:
    """Search backend that answers queries with DuckDuckGo text results.

    Offers the same ``similarity_search`` / ``as_retriever`` methods as
    ``SimpleVectorStore``. The ``docs`` and ``embedding`` arguments are
    accepted but not used.
    """

    def __init__(self, docs=None, embedding=None):
        self.documents = []

    def similarity_search(self, query, k=4):
        """Search the web for ``query``, asking DDGS for at most ``k`` results.

        Returns:
            list[tuple]: ``(text, 0.0)`` pairs where ``text`` is
            ``"<title>:<body> - <href>"`` and the score is a fixed 0.0.
        """
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=k))

        return [
            (hit['title'] + ":" + hit['body'] + " - " + hit['href'], 0.0)
            for hit in hits
        ]

    def as_retriever(self, k=4):
        """Wrap this backend in a ``SimpleRetriever`` returning ``k`` results per query."""
        return SimpleRetriever(self, k)

In [None]:
# Swap the vector store for web search; SimpleRetriever and SimpleRetrievalQA
# are reused unchanged because SimpleWebSearch exposes the same methods.
sws = SimpleWebSearch()
web_retriever = sws.as_retriever()
chain = SimpleRetrievalQA(web_retriever)
answer = chain.invoke("What is the latest model created by OpenAI?")

print(">> ", answer)