In [1]:
# Install third-party packages with pip through IPython shell escapes.
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases.
# NOTE(review): chromadb, tiktoken and sentence_transformers are not referenced in the
# cells visible here — presumably needed elsewhere; confirm before keeping them.
!pip install openai
!pip install langchain
!pip install tqdm
!pip install chromadb
!pip install tiktoken
!pip install sentence_transformers

Collecting openai
  Downloading openai-1.37.0-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [3]:
# Install langchain_community in its own cell.
# NOTE(review): presumably required by the `langchain.chat_models` import below — confirm.
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extens

In [4]:
from langchain.chat_models import ChatOpenAI  # NOTE(review): not used in the cells visible here
import os

# Read the API key from Colab's secret store (google.colab.userdata) rather than hardcoding it,
# and export it as OPENAI_API_KEY in this process's environment.
# NOTE(review): OpenAI() is later built without an explicit key, so it presumably reads this variable — confirm.
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('open_ai_key_team2')

In [5]:
# Download the sample text file that the RAG demo will index.
# The file is written under a relative name, i.e. into the current working directory.
import urllib.request

urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt",
    filename="state_of_the_union.txt",
)

('state_of_the_union.txt', <http.client.HTTPMessage at 0x7c82de9684c0>)

In [6]:
from tqdm import tqdm



SimpleTextLoader 구현해보기

In [7]:
class SimpleTextLoader:
    """Minimal loader that returns the contents of a text file as one string."""

    def __init__(self, file_path):
        # Only remember the location; the file is not opened until load() is called.
        self.file_path = file_path

    def load(self):
        """Read the file at ``self.file_path`` as UTF-8 and return its entire text."""
        with open(self.file_path, mode='r', encoding='utf-8') as source:
            return source.read()


In [8]:
# SimpleCharacterTextSplitter 구현

class SimpleCharacterTextSplitter:
    """Split text on a separator and greedily pack the pieces into chunks.

    Args:
        chunk_size: Maximum chunk length in characters, measured on the joined
            pieces before surrounding whitespace is stripped. A single piece
            longer than this is still emitted whole as its own chunk.
        chunk_overlap: Maximum number of characters from the end of one chunk
            that are repeated at the start of the next one. Overlap is carried
            in whole pieces (a piece is never cut); ``0``, ``None`` or a
            negative value disables it.
        separator_pattern: Literal string the text is split on and re-joined
            with (passed to ``str.split``; it is not interpreted as a regex).
    """

    def __init__(self, chunk_size, chunk_overlap, separator_pattern = '\n\n'):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separator_pattern = separator_pattern

    def split_document(self, documents):
        """Split the string ``documents`` into a list of stripped, non-empty chunks.

        Pieces are appended to the current chunk while the joined length stays
        within ``chunk_size``; otherwise the chunk is emitted and a new one is
        started, seeded with up to ``chunk_overlap`` trailing characters of the
        chunk just emitted.
        """
        separator = self.separator_pattern
        separator_len = len(separator)
        pieces = documents.split(separator)

        chunks = []
        window = [pieces[0]]          # pieces that make up the chunk being built
        window_len = len(pieces[0])   # always equals len(separator.join(window))

        for piece in tqdm(pieces[1:], desc='spliting...'):
            if window_len + separator_len + len(piece) > self.chunk_size:
                # Adding this piece would overflow: emit the chunk, keep only the overlap tail.
                self._append_chunk(chunks, separator.join(window))
                window, window_len = self._overlap_tail(window, window_len, len(piece))

            if window:
                window.append(piece)
                window_len += separator_len + len(piece)
            else:
                window = [piece]
                window_len = len(piece)

        # Flush whatever is left after the last piece.
        self._append_chunk(chunks, separator.join(window))
        return chunks

    @staticmethod
    def _append_chunk(chunks, text):
        """Append ``text`` stripped of surrounding whitespace, skipping empty results."""
        text = text.strip()
        if text:
            chunks.append(text)

    def _overlap_tail(self, window, window_len, incoming_len):
        """Return the trailing pieces of ``window`` to repeat in the next chunk, with their joined length.

        Leading pieces are dropped until the remainder is no longer than
        ``chunk_overlap`` and still leaves room for the incoming piece within
        ``chunk_size``. The result may be empty.
        """
        if not self.chunk_overlap or self.chunk_overlap <= 0:
            return [], 0

        separator_len = len(self.separator_pattern)
        start = 0
        while start < len(window) and (
            window_len > self.chunk_overlap
            or window_len + separator_len + incoming_len > self.chunk_size
        ):
            window_len -= len(window[start])
            start += 1
            if start < len(window):
                # The separator that followed the dropped piece goes away with it.
                window_len -= separator_len
        return window[start:], window_len



SimpleOpenAIEmbeddings 구현해보기

In [9]:
from openai import OpenAI

class SimpleOpenAIEmbeddings:
    """Thin wrapper that turns text into an embedding vector through the OpenAI SDK.

    Args:
        model: Name of the embedding model sent with every request.
        client: Optional pre-built client exposing ``embeddings.create``. When
            omitted, an ``OpenAI()`` client is created on first use and then
            reused for all later calls.
    """

    def __init__(self, model='text-embedding-ada-002', client=None):
        self.model = model
        # Left as None until first use so that constructing this object does not instantiate OpenAI().
        self._client = client

    def embed_query(self, text):
        """Request an embedding for ``text`` and return ``response.data[0].embedding``."""
        if self._client is None:
            # Build the client once and keep it for subsequent calls.
            self._client = OpenAI()
        response = self._client.embeddings.create(
            input = text,
            model = self.model
        )
        return response.data[0].embedding


SimpleRetriever 구현

In [11]:
class SimpleRetriever:
    """Retriever facade that forwards queries to a vector store with a fixed ``k``."""

    def __init__(self, vector_store, k=4):
        # Keep the backing store and the number of results to request from it.
        self.k = k
        self.vector_store = vector_store

    def get_relevant_documents(self, query):
        """Return the result of ``vector_store.similarity_search(query, k)`` unchanged."""
        store, top_k = self.vector_store, self.k
        matches = store.similarity_search(query, top_k)
        return matches


SimpleVectorStore 구현

In [10]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SimpleVectorStore:
    """In-memory vector store: embeds every document up front and ranks by cosine similarity.

    Args:
        docs: Iterable of documents; each one is passed as-is to ``embedding.embed_query``.
        embedding: Object exposing ``embed_query(text)`` that returns a vector.
    """

    def __init__(self, docs, embedding):
        self.embedding = embedding
        self.documents = []  # documents, in insertion order
        self.vectors = []    # vectors[i] is the embedding of documents[i]

        for doc in tqdm(docs, desc = 'embedding...'):
            self.documents.append(doc)
            vector = self.embedding.embed_query(doc)  # one embed_query call per document
            self.vectors.append(vector)

    def similarity_search(self, query, k=4):
        """Return up to ``k`` ``(document, similarity)`` tuples, most similar first.

        When the store holds no vectors, an empty list is returned and the
        query is not embedded at all.
        """
        if not self.vectors:
            # Nothing to compare against, so skip the query embedding call entirely.
            return []

        query_vector = self.embedding.embed_query(query)

        # Row 0 holds the similarity of the single query vector to every stored vector.
        similarities = cosine_similarity([query_vector], self.vectors)[0]

        # Pair each document with its score and sort by score, highest first.
        ranked = sorted(zip(self.documents, similarities), key = lambda x : x[1], reverse=True)
        return ranked[:k]

    def as_retriever(self, k = 4):
        """Wrap this store in a ``SimpleRetriever`` that requests ``k`` results per query."""
        return SimpleRetriever(self, k)




In [12]:
# Use the same relative filename the download cell writes to, so the file is looked up
# in the current working directory instead of a hard-coded absolute Colab location.
path = 'state_of_the_union.txt'

# Read the whole file, then pack its blank-line-separated paragraphs into chunks
# of at most 1000 characters with no overlap.
raw_documents = SimpleTextLoader(path).load()
text_splitter = SimpleCharacterTextSplitter(chunk_size = 1000, chunk_overlap= 0 )
documents = text_splitter.split_document(raw_documents)

spliting...: 100%|██████████| 358/358 [00:00<00:00, 254157.22it/s]


In [13]:
# Inspect the first chunk produced by the splitter.
documents[0]

'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'

In [17]:
# Number of chunks the text was split into.
len(documents)

42

In [15]:
# Build the in-memory store; the constructor calls embed_query once per chunk.
db = SimpleVectorStore(documents, SimpleOpenAIEmbeddings())

embedding...: 100%|██████████| 42/42 [00:10<00:00,  4.01it/s]


In [16]:
# Question used to exercise the similarity search below.
query = "What did the president say about Ketanji Brown Jackson"

In [18]:
# Retrieve the best-matching chunks (default k=4) as (chunk text, cosine similarity)
# tuples, ordered from most to least similar.
docs = db.similarity_search(query)
docs

[('Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.',
  0.8152000921997997),
 ('A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’

In [19]:
# Show the best match as a raw (chunk text, similarity) tuple.
print(docs[0])

('Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', 0.8152000921997997)


In [22]:
docs[0][0] # chunk text of the best match

'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'

In [23]:
docs[0][1] # similarity score of the best match


0.8152000921997997