In [None]:
!pip install "unstructured[epub]" langchain-community
!pip install pypdf

In [3]:
import shutil
import numpy as np
import re

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv
from pathlib import Path
from langchain.document_loaders import DirectoryLoader, PyPDFLoader, UnstructuredEPubLoader
from langchain.text_splitter import CharacterTextSplitter

In [4]:
load_dotenv()

True

In [7]:
loader = PyPDFLoader(file_path="experiment-data/Dale Carnegie.pdf")
pages = loader.load()
for page in pages:
    page.metadata["type"] = "Dale Carnegie"
print(len(pages))

folder = Path("experiment-data")
dir_loader = DirectoryLoader(folder, glob="*.epub", loader_cls=UnstructuredEPubLoader)
epub_docs = dir_loader.load()

pattern = re.compile("(.+)/(.+)(\..+).epub$")

for e_doc in epub_docs:
    source = e_doc.metadata["source"]
    match = pattern.match(source)
    doc_type = match.group(2)
    e_doc.metadata["type"] = doc_type

print(len(epub_docs))

documents = []
documents += pages
documents += epub_docs

print(len(documents))

213
2
215


In [8]:
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(documents)
print(len(chunks))

Created a chunk of size 1692, which is longer than the specified 1000
Created a chunk of size 1530, which is longer than the specified 1000
Created a chunk of size 1087, which is longer than the specified 1000
Created a chunk of size 1062, which is longer than the specified 1000
Created a chunk of size 1139, which is longer than the specified 1000
Created a chunk of size 1093, which is longer than the specified 1000
Created a chunk of size 1068, which is longer than the specified 1000
Created a chunk of size 1229, which is longer than the specified 1000
Created a chunk of size 1113, which is longer than the specified 1000
Created a chunk of size 1047, which is longer than the specified 1000
Created a chunk of size 1174, which is longer than the specified 1000
Created a chunk of size 1832, which is longer than the specified 1000
Created a chunk of size 1991, which is longer than the specified 1000
Created a chunk of size 1020, which is longer than the specified 1000
Created a chunk of s

3996


In [9]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", )
db_name = "experiment_db"
db_path = Path(db_name)

if db_path.exists():
    shutil.rmtree(db_path, ignore_errors=True)
    Chroma(embedding_function=embeddings, persist_directory=db_name).delete_collection()

store = Chroma(persist_directory=db_name, embedding_function=embeddings)
print(f"Vector storage {db_name} is created.")

In [None]:
def get_batch(docs, size):
    for i in range(0, len(docs), size):
        yield docs[i:i+size]

batch_size = 512

for batch in get_batch(chunks, batch_size):
    store.add_documents(batch)

print(f"{store._collection.count()} docs have been added to vector store.")

In [13]:
data = store.get(include=["embeddings", "documents", "metadatas"])

In [14]:
vectors = np.array(data["embeddings"])
print(vectors.shape)
print(vectors[0].shape)

(3996, 3072)
(3072,)


In [15]:
documents = data["documents"]
print(documents[1][:30])

First Published in 1937. 
 
Th


In [17]:
import plotly.graph_objects as go

from sklearn.manifold import TSNE

In [30]:
tsne = TSNE(n_components=2, random_state=37)
reduced_vectors = tsne.fit_transform(vectors)
print(reduced_vectors.shape)

(3996, 2)


In [21]:
types = [md["type"] for md in data["metadatas"]]
type_set = set(types)
print(type_set)
type_list = list(type_set)

texts = [f"Type: {t}<br>Text: {doc[:100]}" for doc, t in zip(documents, types)]

color_list = ["blue", "red", "green"]
colors = [color_list[type_list.index(t)] for t in types]

{'Уопник Кеннет', 'Dale Carnegie', 'Тит Хань'}


In [None]:
scattered_data = go.Scatter(
    x = reduced_vectors[:,0],
    y = reduced_vectors[:,1],
    mode="markers",
    marker=dict(size=5, color=colors, opacity=1),
    text=texts,
    hoverinfo="text"
)

fig = go.Figure(data=[scattered_data])
fig.update_layout(
    title = "2D Experiment Vector Store Visualization",
    width=800,
    height=600,
    margin=dict(r=20, l=10, b=10, t=40)
)
fig.show()

In [None]:
tsne = TSNE(n_components=3, random_state=37)
reduced_vectors = tsne.fit_transform(vectors)

In [None]:
scattered_data = go.Scatter3d(
    x = reduced_vectors[:,0],
    y = reduced_vectors[:,1],
    z = reduced_vectors[:,2],
    mode="markers",
    marker=dict(size=5, color=colors, opacity=1),
    text=texts,
    hoverinfo="text"
)

fig = go.Figure(data=[scattered_data])
fig.update_layout(
    title = "3D Experiment Vector Store Visualization",
    width=1600,
    height=1000,
    margin=dict(r=20, l=10, b=10, t=40)
)
fig.show()