In [5]:
%%capture
#After executing the cell,please RESTART the kernel and run all the cells.
!pip install --user "langchain-community==0.2.1"
!pip install --user "pypdf==4.2.0"
!pip install --user "PyMuPDF==1.24.5"
!pip install --user "unstructured==0.14.8"
!pip install --user "markdown==3.6"
!pip install --user  "jq==1.7.0"
!pip install --user "pandas==2.2.2"
!pip install --user "docx2txt==0.8"
!pip install --user "requests==2.32.3"
!pip install --user "beautifulsoup4==4.12.3"
!pip install --user "nltk==3.8.0"


In [None]:
!pip install --user "ibm-watsonx-ai==1.1.2"

In [11]:
%%capture
!pip install --user "langchain-ibm==0.1.11"

In [14]:
%%capture
!pip install --user "chromadb==0.4.24"

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

from pprint import pprint
import json
from pathlib import Path
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredFileLoader

## Load Documents

In [4]:
paper_link = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/WgM1DaUn2SYPcCg_It57tA/A-Comprehensive-Review-of-Low-Rank-Adaptation-in-Large-Language-Models-for-Efficient-Parameter-Tuning-1.pdf"
loader = PyPDFLoader(paper_link)
data = loader.load_and_split()
pprint(data[0].page_content[:1000])

('A Comprehensive Review of Low-Rank\n'
 'Adaptation in Large Language Models for\n'
 'Efficient Parameter Tuning\n'
 'September 10, 2024\n'
 'Abstract\n'
 'Natural Language Processing (NLP) often involves pre-training large\n'
 'models on extensive datasets and then adapting them for specific tasks\n'
 'through fine-tuning. However, as these models grow larger, like GPT-3\n'
 'with 175 billion parameters, fully fine-tuning them becomes computa-\n'
 'tionally expensive. We propose a novel method called LoRA (Low-Rank\n'
 'Adaptation) that significantly reduces the overhead by freezing the orig-\n'
 'inal model weights and only training small rank decomposition matrices.\n'
 'This leads to up to 10,000 times fewer trainable parameters and reduces\n'
 'GPU memory usage by three times. LoRA not only maintains but some-\n'
 'times surpasses fine-tuning performance on models like RoBERTa, De-\n'
 'BERTa, GPT-2, and GPT-3. Unlike other methods, LoRA introduces\n'
 'no extra latency during in

## Apply Text Splitting Techniques

In [7]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

latex_text = """

    \documentclass{article}

    \begin{document}

    \maketitle

    \section{Introduction}

    Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in various natural language processing tasks, including language translation, text generation, and sentiment analysis.

    \subsection{History of LLMs}

The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.

\subsection{Applications of LLMs}

LLMs have many applications in the industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.

\end{document}

"""

latex_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.LATEX, chunk_size=60, chunk_overlap=0
)
latex_docs = latex_splitter.create_documents([latex_text])
latex_docs

[Document(page_content='\\documentclass{article}\n\n    \x08egin{document}'),
 Document(page_content='\\maketitle\n\n    \\section{Introduction}\n\n    Large language'),
 Document(page_content='models (LLMs) are a type of machine learning model that can'),
 Document(page_content='be trained on vast amounts of text data to generate'),
 Document(page_content='human-like language. In recent years, LLMs have made'),
 Document(page_content='significant advances in various natural language processing'),
 Document(page_content='tasks, including language translation, text generation, and'),
 Document(page_content='sentiment analysis.\n\n    \\subsection{History of LLMs}\n\nThe'),
 Document(page_content='earliest LLMs were developed in the 1980s and 1990s, but'),
 Document(page_content='they were limited by the amount of data that could be'),
 Document(page_content='processed and the computational power available at the'),
 Document(page_content='time. In the past decade, however, advances in h

## Embed Documents

In [12]:
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxEmbeddings

embed_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embed_params,
)
query = "How are you?"

query_result = watsonx_embedding.embed_query(query)
query_result[:5]

[-0.06722454, -0.023729993, 0.017487843, -0.013195328, -0.039584607]

## Create and Configure Vector Databases to Store Embeddings

In [15]:
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma

loader = TextLoader("companypolicies.txt")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

chunks = text_splitter.split_documents(data)
ids = [str(i) for i in range(0, len(chunks))]
vectordb = Chroma.from_documents(chunks, watsonx_embedding, ids=ids)

query = "Smoking policy"
docs = vectordb.similarity_search(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='Smoking Restrictions: Smoking inside company buildings, offices, meeting rooms, and other enclosed'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Designated Smoking Areas: Smoking is only permitted in designated smoking areas, as marked by'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='No Smoking in Company Vehicles: Smoking is not permitted in company vehicles, whether they are'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations')]

## Develop a retriever to fetch document segments based on queries 

In [20]:
!wget "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt"
loader = TextLoader("new-Policies.txt")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

chunks = text_splitter.split_documents(data) 
vectordb = Chroma.from_documents(documents=chunks, embedding=watsonx_embedding)
retriever = vectordb.as_retriever(search_kwargs={"k":2})
query = "Email policy"
docs = retriever.invoke(query)
docs

--2025-05-24 22:23:35--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6363 (6.2K) [text/plain]
Saving to: ‘new-Policies.txt’


2025-05-24 22:23:35 (297 MB/s) - ‘new-Policies.txt’ saved [6363/6363]



[Document(metadata={'source': 'companypolicies.txt'}, page_content='to this policy. Non-compliance may lead to appropriate disciplinary action, which could include'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical')]