In [None]:
import os


def load_api_key(env=None):
    """Return the OpenAI API key stored under ``OPENAI_API_KEY``.

    Args:
        env: Mapping to read the key from. ``None`` means ``os.environ``.

    Returns:
        The key with surrounding whitespace stripped, or ``""`` when the
        variable is missing or empty.
    """
    source = os.environ if env is None else env
    return (source.get("OPENAI_API_KEY") or "").strip()


# Read the key from the environment instead of hardcoding a literal here:
# assigning a literal overwrites a correctly exported key and puts the secret
# into the saved notebook.
api_key = load_api_key()
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
else:
    print("OPENAI_API_KEY is not set; export it before starting Jupyter.")

# Show only whether a key is present -- never echo the secret into cell output.
bool(api_key)

# DATA LOAD

In [19]:
# Load a plain-text file into LangChain Document objects
from langchain_community.document_loaders import TextLoader

quotes_path = "datasets/buffett_quotes.txt"
loader = TextLoader(quotes_path)
docs = loader.load()  # the splitter cells below read `docs`
docs

[Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Only when the tide goes out do you discover who's been swimming naked.\nWarren Buffett: Price is what you pay. Value is what you get.\nWarren Buffett: When a management with a reputation for brilliance tackles a business with a reputation for bad economics, it is the reputation of the business that remains intact.\nWarren Buffett: Rule No.1: Never lose money. Rule No.2: Never forget rule No.1.\nWarren Buffett: I don't look to jump over 7-foot bars: I look around for 1-foot bars that I can step over.\nWarren Buffett: I buy expensive suits. They just look cheap on me.\nWarren Buffett: Derivatives are financial weapons of mass destruction.\nWarren Buffett: Beware of geeks bearing formulas.\nWarren Buffett: The business schools reward difficult complex behavior more than simple behavior, but simple behavior is more effective.\nWarren Buffett: When you combine ignorance and leverage, you get some pre

In [None]:
# Load a PDF file into LangChain Document objects
from langchain_community.document_loaders import PyPDFLoader

pdf_path = "datasets/Part 4. ChatBot with Persona.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()  # replaces whatever `docs` held before this cell ran
docs

In [None]:
# Load every file matched by the glob from a folder
from langchain_community.document_loaders import DirectoryLoader

directory_options = {"glob": "*", "show_progress": True}
loader = DirectoryLoader("datasets", **directory_options)
docs = loader.load()  # replaces whatever `docs` held before this cell ran

# Chunking / 텍스트 Split 하기

In [27]:
# Plain split on a single separator
from langchain_text_splitters import CharacterTextSplitter

char_split_options = dict(
    separator="\n",
    is_separator_regex=False,
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,  # sizes are measured in characters
)
text_splitter = CharacterTextSplitter(**char_split_options)
documents = text_splitter.split_documents(docs)

In [50]:
# Recursive split: the text is split recursively over several separators.
# It is split on the first separator, and chunks that are still too long are
# split again on the second separator.
from langchain_text_splitters import RecursiveCharacterTextSplitter

separator_order = ["\n\n", "\n", " "]
text_splitter = RecursiveCharacterTextSplitter(
    separators=separator_order,
    is_separator_regex=False,
    length_function=len,  # sizes are measured in characters
    chunk_size=500,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(docs)

In [28]:
# Number of chunks currently held in `documents`.
# NOTE(review): the execution counts suggest the value shown below came from the
# CharacterTextSplitter cell (In [27]), not the recursive cell directly above
# (In [50]) -- confirm by re-running in order.
len(documents)

27

In [49]:
# Token-based split: a language model's context limit is usually counted in tokens.
# tiktoken: the byte-pair tokenizer developed by OpenAI.
# chunk_size is the number of tokens.
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    # Count tokens with the encoding of the OpenAI embedding model used in this
    # notebook (text-embedding-3-small) instead of relying on the library default,
    # so chunk_size reflects what that model actually sees.
    encoding_name="cl100k_base",
    chunk_size=100,
    chunk_overlap=0,
)
documents = text_splitter.split_documents(docs)

ImportError: cannot import name 'CharacterTokenSplitter' from 'langchain_text_splitters' (/Users/act/.pyenv/versions/3.11.4/envs/py311/lib/python3.11/site-packages/langchain_text_splitters/__init__.py)

In [42]:
# SemanticChunker
# Creates an embedding for each sentence through OpenAI embedding calls.
# When the embedding distance to the next sentence is at or below the threshold
# the chunks are merged; when it is above, the chunk is split.
# Semantic search: returns relevant results by understanding the meaning of the
# user's query, going beyond simple keyword matching.
# https://wikidocs.net/234003
from langchain_experimental.text_splitter import SemanticChunker
# Imported in this cell because the only other import of OpenAIEmbeddings sits in
# a later cell; without it this cell raises NameError on a fresh kernel.
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings(api_key=api_key))

documents = text_splitter.split_documents(docs)

Document(metadata={'source': 'datasets/buffett_quotes.txt'}, page_content="Warren Buffett: Only when the tide goes out do you discover who's been swimming naked.\nWarren Buffett: Price is what you pay. Value is what you get.\nWarren Buffett: When a management with a reputation for brilliance tackles a business with a reputation for bad economics, it is the reputation of the business that remains intact.\nWarren Buffett: Rule No.1: Never lose money. Rule No.2: Never forget rule No.1.\nWarren Buffett: I don't look to jump over 7-foot bars: I look around for 1-foot bars that I can step over.\nWarren Buffett: I buy expensive suits. They just look cheap on me.\nWarren Buffett: Derivatives are financial weapons of mass destruction.\nWarren Buffett: Beware of geeks bearing formulas.\nWarren Buffett: The business schools reward difficult complex behavior more than simple behavior, but simple behavior is more effective.\nWarren Buffett: When you combine ignorance and leverage, you get some pret

# Embedding

In [45]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embedding_options = {"api_key": api_key, "model": 'text-embedding-3-small'}
embed_model = OpenAIEmbeddings(**embedding_options)

# Embed a single query string
query_text = "What is Mistral AI?"
emb = embed_model.embed_query(query_text)

# Embed several texts in one call
sample_texts = ['Hi', 'Mistal', 'ML']
emb_doc = embed_model.embed_documents(sample_texts)


100%|███████████████████████████████████████████| 2/2 [36:21<00:00, 1090.53s/it]


In [47]:
# Length of the single-query embedding vector `emb`.
len(emb)

1536

In [48]:
# Display the embeddings returned for the three sample texts.
# NOTE(review): this prints every float of every vector; consider showing a short
# slice instead to keep the saved notebook small.
emb_doc

[[-0.006932560354471207,
  -0.03534681349992752,
  0.001600959338247776,
  0.06536941975355148,
  0.03289176523685455,
  -0.02421034872531891,
  -0.026118190959095955,
  0.049396805465221405,
  0.01622403971850872,
  -0.05176311731338501,
  -0.013406647369265556,
  -0.014508462511003017,
  -0.026073822751641273,
  -0.0032629251945763826,
  0.024550506845116615,
  0.0011692919069901109,
  -0.05347869545221329,
  0.015085251070559025,
  0.011469228193163872,
  0.03392702713608742,
  0.049308065325021744,
  0.02037987858057022,
  -0.013916883617639542,
  0.018900932744145393,
  0.01712619699537754,
  0.024225138127803802,
  0.01826498657464981,
  -0.0011960976989939809,
  0.01959603652358055,
  -0.03676660358905792,
  0.02767108380794525,
  -0.028233082965016365,
  0.02762671560049057,
  -0.016238829120993614,
  -0.01181678008288145,
  -0.0160317774862051,
  -0.014094357378780842,
  0.037565235048532486,
  0.0188861433416605,
  -0.03765397146344185,
  0.04345143958926201,
  -0.01240835897