## Install libraries

In [None]:
%pip install --upgrade --quiet fsspec==2025.3.2 huggingface_hub sentence-transformers transformers datasets hf_xet
%pip install --upgrade --quiet  langchain langchain-community langchain-huggingface langchain-chroma
%pip install beautifulsoup4


## Secret keys

In [4]:
# `userdata` comes from the google.colab package, so this cell only runs inside Colab.
from google.colab import userdata
# Value stored under the name 'HF_TOKEN' (presumably a Hugging Face access token).
HF_TOKEN = userdata.get('HF_TOKEN')
# Value stored under the name 'LANGSMITH_API_KEY'.
# NOTE(review): neither variable is referenced again in the visible cells — confirm
# whether they still need to be passed on (e.g. via environment variables) or can go.
LANGSMITH_API_KEY = userdata.get('LANGSMITH_API_KEY')

## Process PDF documents

In [None]:
# upload pdf

%pip install PyPDF2
import PyPDF2

# Open the PDF file in binary mode
with open('manage-stress-workbook.pdf', 'rb') as file:
    # Create a PdfFileReader object
    pdf_reader = PyPDF2.PdfReader(file)
    pdf_texts = [p.extract_text().strip() for p in pdf_reader.pages]

    # Filter the empty strings
    pdf_texts = [text for text in pdf_texts if text]

    print(pdf_texts[0])

In [None]:
# chunk pdf

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter, CharacterTextSplitter

# Split every page on line breaks, targeting chunks of about 100 characters
# with 10 characters of overlap between neighbouring chunks.
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size=100,
    chunk_overlap=10
)
docs = character_splitter.create_documents(pdf_texts)

if docs:
    # Spot-check a chunk away from the start. The index is clamped to the last
    # chunk so that short PDFs (fewer than 13 chunks) do not raise IndexError.
    print(docs[min(12, len(docs) - 1)])

print(f"Number of document chunks: {len(docs)}")
if docs:
    print(f"First chunk: {docs[0].page_content}")

## Process information from the web

In [None]:
# Step 1: load website content

from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup

def load_website_content(urls):
    """Fetch web pages and return them as loaded documents.

    Args:
        urls: A single URL string or an iterable of URL strings.

    Returns:
        Whatever ``WebBaseLoader(...).load()`` returns for the given URLs.
    """
    # A bare string is one URL: wrap it so the count below is the number of
    # URLs rather than the number of characters. list() also gives iterables
    # without a length (e.g. generators) a usable len().
    url_list = [urls] if isinstance(urls, str) else list(urls)
    loader = WebBaseLoader(url_list)
    documents = loader.load()
    print(f"Loaded content from {len(url_list)} URLs")
    return documents


In [None]:
# Step 2: Clean and chunk the content


def process_web_documents(documents, chunk_size=50, chunk_overlap=10):
    """Split loaded web documents into small, overlapping chunks.

    Args:
        documents: Iterable of objects exposing ``page_content`` (and, when
            available, ``metadata``), e.g. the output of load_website_content.
        chunk_size: Chunk size handed to the splitter. Defaults to 50, the
            value that was previously hard-coded.
        chunk_overlap: Overlap handed to the splitter. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        The chunk documents produced by the splitter's ``create_documents``.
        Each chunk carries the metadata of the document it was cut from.
    """
    # HTML cleaning is handled by WebBaseLoader, but you can add additional processing
    character_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    # Extract text content from Document objects
    texts = [doc.page_content for doc in documents]
    # Forward each page's metadata so the chunks keep their provenance;
    # objects without a metadata attribute contribute an empty dict.
    metadatas = [getattr(doc, "metadata", None) or {} for doc in documents]

    chunks = character_splitter.create_documents(texts, metadatas=metadatas)
    print(f"Split into {len(chunks)} chunks")
    return chunks

# Example usage
# Pages to load; the list is passed unchanged to load_website_content.
urls = [
    "https://www.rcpsych.ac.uk/mental-health/mental-illnesses-and-mental-health-problems",
    "https://www.psychiatry.org/patients-families"
]

# Load the pages, then split them. `chunks` is combined with the PDF chunks
# when the vector store is built further down.
web_documents = load_website_content(urls)
chunks = process_web_documents(web_documents)

## Create embeddings using HuggingFaceEmbeddings

In [None]:
# create embeddings with hugging face sentence transformer

from langchain_huggingface import HuggingFaceEmbeddings


# Choose a suitable sentence transformer model from Hugging Face Hub
# Recommended models: 'all-mpnet-base-v2', 'sentence-transformers/all-MiniLM-L6-v2'
# NOTE: `model_name` is reassigned by the later generation cell, so after that
# cell runs it no longer holds the embedding model's id.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Embedding object handed to the vector store when it is created.
embeddings = HuggingFaceEmbeddings(model_name=model_name)


## Create vector store

### ChromaDB is used as the vector store; it indexes the embeddings automatically

In [11]:
# create chromadb vector store and index the embeddings
from langchain_chroma import Chroma

# Directory handed to Chroma for persistence. The variable now matches the
# location that is actually written; before, it was defined as a different
# path ("chroma_db") and never used.
persist_directory = "web_and_pdf_db"

# Index the PDF chunks and the web chunks together in a single store.
all_documents = docs + chunks

# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again — confirm, and clear the directory if so.
vector_store = Chroma.from_documents(documents=all_documents,
                                     embedding=embeddings,
                                     persist_directory=persist_directory)


## Prompt creation using a template

### Templates enable the creation of longer prompts with well-described context

In [12]:
# Prompt text for the question-answering chain. {context} and {question} are
# placeholders; they match the input_variables declared where the
# PromptTemplate is built from this string.
# The string is not raw, so every "\n" escape adds a newline on top of the
# literal line break that follows it.
template = """You are an experienced mental health counselor \n
providing support and information about mental health by web chat, to a service user. \n
Use ONLY the following information to answer the user's question at the end. \n
You need to make sure that you provide the most accurate information and best support! \n
Provide concise, accurate and as complete an answer as possible. \n
Do not make any assumptions or make up an answer. \n
If the information does not contain the answer, please respond with: \n
"Based on the information provided, I cannot answer your question." \n\n


Information: {context}


Question: {question} \n\n
Answer:"""

In [None]:
from langchain.prompts import PromptTemplate

# Wrap the raw template string; the two input variables correspond to the
# {context} and {question} placeholders inside `template`.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
# Show the constructed prompt object for inspection.
print(prompt)

## Tokenizing, HuggingFace pipeline and information retrieval mechanism

In [None]:
# Define the retrieval mechanism

from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Single import of the pipeline wrapper. The cell used to import the same name
# from langchain.llms first and then shadow it with this one.
from langchain_huggingface import HuggingFacePipeline
from langchain.chat_models import init_chat_model


# Choose a Hugging Face language model for generation
# NOTE: this reassigns `model_name`, which the embeddings cell set to the
# embedding model's id.
model_name = "gpt2-large"      # You can try other models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a Hugging Face pipeline for text generation.
# temperature and top_p are sampling parameters, so sampling is enabled
# explicitly rather than left to whatever the chosen model's config defaults to.
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.1,
                top_p=0.1,
                repetition_penalty=1.15)

# Expose the transformers pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Initialize the RetrievalQA chain with the ChromaDB retriever
retriever = vector_store.as_retriever()

rag_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever, prompt=prompt)

## Test your model

In [None]:
# Ask a question: the chain retrieves matching chunks from the Chroma store
# and passes them to the LLM together with the query.

query = "What are the symptoms of depression?"

# Use .invoke() rather than calling the chain object directly, which is the
# deprecated entry point. The answer text is under the "result" key.
result = rag_chain.invoke({"query": query})
print(result["result"])

In [None]:
query = "Using bullet points, list five examples of mindfulness based stress reduction techniques."

# Use .invoke() rather than calling the chain object directly, which is the
# deprecated entry point. The answer text is under the "result" key.
result = rag_chain.invoke({"query": query})
print(result["result"])