In [None]:
!pip -q install youtube-transcript-api pytube langchain_community pypdf

In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from uuid import uuid4

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
docs = []

In [3]:
# load document from youtube video on Atomic Habits
loader1 = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=hh_rNTdx9t0", add_video_info=False
)

doc_atomic_habits = loader1.load()
doc_atomic_habits[0].metadata['source'] = 'Atomic Habits by James Clear'
docs += doc_atomic_habits

In [4]:
# load document from youtube video on The Game of Life and How to Play It
loader2 = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=eYoZECdoLUQ", add_video_info=False
)
doc_untroubled_mind = loader2.load()
doc_untroubled_mind[0].metadata['source'] = "The Untroubled Mind by Herbert J. Hall"
docs += doc_untroubled_mind

In [5]:
# load document from pdf file
file_path = "./The Subtle Art of Not Giving a Fck.pdf"
loader3 = PyPDFLoader(file_path)
doc_subtle_art = loader3.load()
for doc_page in doc_subtle_art:
    doc_page.metadata['source'] = "The Subtle Art of Not Giving a Fck by Mark Manson"
docs += doc_subtle_art[4:]
doc_subtle_art

[Document(metadata={'source': 'The Subtle Art of Not Giving a Fck by Mark Manson', 'page': 0}, page_content=''),
 Document(metadata={'source': 'The Subtle Art of Not Giving a Fck by Mark Manson', 'page': 1}, page_content=''),
 Document(metadata={'source': 'The Subtle Art of Not Giving a Fck by Mark Manson', 'page': 2}, page_content='CONTENTS\nC\nHAPTER\n 1: Don’t Try\nThe Feedback Loop from Hell\nThe Subtle Art of Not Giving a Fuck\nSo Mark, What the Fuck Is the Point of This Book Anyway?\nC\nHAPTER\n 2: Happiness Is a Problem\nThe Misadventures of Disappointment Panda\nHappiness Comes from Solving Problems\nEmotions Are Overrated\nChoose Your Struggle\nC\nHAPTER\n 3: You Are Not Special\nThings Fall Apart\nThe Tyranny of Exceptionalism\nB-b-b-but, If I’m Not Going to Be Special or Extraordinary, What’s the\nPoint?\nC\nHAPTER\n 4: The Value of Suffering\nThe Self-Awareness Onion\nRock Star Problems\nShitty Values\nDefining Good and Bad Values\nC\nHAPTER\n 5: You Are Always Choosing\nTh

In [6]:
# split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200)

splits = text_splitter.split_documents(docs)

splits

[Document(metadata={'source': 'Atomic Habits by James Clear'}, page_content="welcome to Atomic habits a groundbreaking guide to mastering the small changes that lead to remarkable results imagine unlocking the potential Within You by making tiny seemingly insignificant adjustments to your daily routines these are the very changes that compounded over time can revolutionize your life James Clear an expert on habit formation brings you a masterpiece that isn't just about forming good habits and Breaking Bad Ones it's about understanding the very essence of human behavior and harnessing it to achieve great in a world where we're often bombarded with quick fixes and instant gratification Atomic habits offers a refreshing perspective clear delves into the science of habits explaining how they are formed and why they can be so difficult to change he reveals the hidden forces that shape our Behavior often without us even realizing it through engaging storytelling and relatable examples you'll

In [7]:
# get uuids for each document
uuids = [str(uuid4()) for _ in range(len(splits))]

# get embedding model
from langchain_huggingface import HuggingFaceEmbeddings
embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# get vector store
from langchain_chroma import Chroma
chroma_vector_store = Chroma(persist_directory="./chroma_db",embedding_function=embed_model)

  from tqdm.autonotebook import tqdm, trange


In [None]:
# add documents to vector store
chroma_vector_store.add_documents(documents=splits, ids=uuids)

['7f515ff1-8660-454e-afe6-d43f8567d513',
 '4025872c-f7b9-47f1-8ad1-9c8a6ed92fb3',
 '7229f726-e928-4a27-8f28-cf1d4e727276',
 '2820d753-618e-4bac-a090-753c69d9c653',
 'f53d1ac4-d120-4196-9f59-eb6aaef97129',
 'dd75739f-cb77-4ea9-951c-338874c76762',
 'd42e581f-5bb0-4087-96d1-852f3133cc43',
 'fd98ca3c-3212-42c4-8210-e7971f04e05b',
 '39220eb5-c932-49dc-930a-a7dd19f5503d',
 '73549253-0e7b-4458-9fde-2936e19cf30e',
 'c9db797c-0569-4110-9a05-717bbae42bc1',
 'd627da03-c959-4254-a865-e0cf38b70a4b',
 '463a935e-7a2d-4e59-8c76-3d4815bae9f1',
 '30d77826-2ef5-4ed4-8d29-907409df0605',
 'bab63917-9e37-4808-bcae-5824b6ac34ed',
 '1a950ccf-0cbe-4d5a-9474-0f4102557285',
 '46be06a2-002d-4a85-b854-f63ca4016875',
 '03f1d03c-f691-4036-ad3f-811ee42e383a',
 '49c167e1-bc00-4e7d-9ee6-44b8c5d4b3f7',
 '1c0bc948-63c3-47d3-b4d9-a0313a1d3e74',
 'b929cf69-2035-4675-8618-13631ca68d8d',
 'b238dde5-820a-4e05-8c8b-a0d57bf8fad3',
 '4aafa382-eca5-4ac4-8486-3a221a8a196d',
 '1fc9eabe-17ac-4ebc-8883-f7cb54c7aed8',
 'adcbe82c-7393-

In [10]:
# check that documents are added
query = "how do i relieve anxiety?"
chroma_vector_store.similarity_search(query, k=5)

[Document(metadata={'page': 7, 'source': 'The Subtle Art of Not Giving a Fck by Mark Manson'}, page_content='And while there’s nothing wrong with good business, the problem is that\ngiving too many fucks is bad for your mental health. It causes you to become\noverly attached to the superficial and fake, to dedicate your life to chasing a\nmirage of happiness and satisfaction. The key to a good life is not giving a\nfuck about more; it’s giving a fuck about less, giving a fuck about only what\nis true and immediate and important.\nThe Feedback Loop from Hell\nThere’s an insidious quirk to your brain that, if you let it, can drive you\nabsolutely batty. Tell me if this sounds familiar to you:\nYou get anxious about confronting somebody in your life. That anxiety\ncripples you and you start wondering why you’re so anxious. Now you’re\nbecoming \nanxious about being anxious.\n Oh no! Doubly anxious! Now you’re\nanxious about your anxiety, which is causing \nmore\n anxiety. Quick, where’s\n