# Mount the drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the PDF cells further down read
# their inputs from paths under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

---

# Ollama + LangChain

Sometimes, when the runtime limit is exceeded, the Ollama server goes down and Ollama must be restarted.

In [None]:
# Install colab-xterm and load its extension; the next cell uses the %xterm magic.
!pip install colab-xterm
%load_ext colabxterm

# Install the LangChain packages imported further down (-qqq silences pip output).
!pip install langchain langchain-core langchain_community -qqq

In [None]:
# Open a terminal inside the notebook; the Ollama install/serve/pull commands are typed there.
%xterm

In the terminal, run the commands shown in bold below.

**curl -fsSL https://ollama.com/install.sh | sh**

**ollama serve & ollama pull llama3.1 & ollama pull llama3**

We will use llama3.1 as the chat model and llama3 as the embedding model.

- https://ollama.com/

In [None]:
from torch import cuda

# Report whether a CUDA GPU is visible to PyTorch; fall back to the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

In [None]:
from langchain_community.llms import Ollama

# LangChain handle for the llama3.1 model pulled through Ollama in the terminal step;
# later cells call llm.invoke(...) on it.
llm = Ollama(model="llama3.1")

In [None]:
# Install the packages behind `import ollama` and the Chroma vector store (-qqq silences pip output).
!pip install ollama chromadb -qqq

In [None]:
import ollama
from bs4 import BeautifulSoup as bs
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

## Extract the data in web pages

### New medicines

In [None]:
# EMA EPAR pages that feed the RAG index
urls = [
    'https://www.ema.europa.eu/en/medicines/human/EPAR/anzupgo',
    'https://www.ema.europa.eu/en/medicines/human/EPAR/balversa',
    'https://www.ema.europa.eu/en/medicines/human/EPAR/adzynma',  # adzynma is a new orphan medicine.
]

# The chunker and the embedding model do not depend on the download, so set them up first
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embeddings = OllamaEmbeddings(model="llama3")

# Download each page and pool everything into a single list of documents
all_docs = []
for url in urls:
    loader = WebBaseLoader(url)
    docs = loader.load()
    all_docs += docs

# Chunk the pooled documents, then embed the chunks into a Chroma vector store
splits = text_splitter.split_documents(all_docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [None]:
# Helper that sends one question/context pair to llama3.1 through the `ollama` client
def ollama_llm(question, context):
    """Ask llama3.1 a question together with supporting context.

    Args:
        question: The question text; it is placed first in the prompt.
        context: Supporting text appended after the question.

    Returns:
        The content of the message returned by `ollama.chat`.
    """
    user_message = {
        'role': 'user',
        'content': "Question: {}\n\nContext: {}".format(question, context),
    }
    chat_reply = ollama.chat(model='llama3.1', messages=[user_message])
    return chat_reply['message']['content']

# Define the RAG setup
# The retriever is created from whatever `vectorstore` holds when this cell runs;
# rag_chain() looks up this global each time it is called.
retriever = vectorstore.as_retriever()

def rag_chain(question):
    """Answer a question using chunks fetched by the module-level `retriever`.

    Args:
        question: The question used both for retrieval and in the prompt.

    Returns:
        Whatever `ollama_llm` returns for the question and the retrieved
        chunks' text joined with blank lines.
    """
    page_texts = [hit.page_content for hit in retriever.invoke(question)]
    return ollama_llm(question, "\n\n".join(page_texts))

In [None]:
from time import time

In [None]:
# rag_chain() already sends the question and the retrieved context to llama3.1
# and returns the answer text. Passing that answer to llm.invoke() would prompt
# the model a second time with its own output, so the answer is used directly.
time_start = time()
response = rag_chain("Extract the full therapeutic indication of each medicine and their patient group without summarization: ")
time_end = time()  # stop the clock before printing so only the RAG call is timed
print(response)
print(f"time taken: {round(time_end-time_start, 3)} sec.")

However, the model summarizes the indication.

### Existing medicines

In [None]:
from uuid import uuid4

# List of URLs to include in the RAG system
urls = [
    'https://www.ema.europa.eu/en/medicines/human/variation/braftovi',
    'https://www.ema.europa.eu/en/medicines/human/variation/arexvy',
    'https://www.ema.europa.eu/en/medicines/human/variation/beyfortus',
    'https://www.ema.europa.eu/en/medicines/human/variation/cresemba'
]

# Load documents from all URLs
all_docs = []
for url in urls:
    all_docs.extend(WebBaseLoader(url).load())

# Split the loaded documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(all_docs)

# Create Ollama embeddings and vector store.
# A fresh collection name keeps this index separate from stores built by other
# cells (or earlier runs of this cell); without one, every store is created
# under Chroma's default collection name and, as far as the in-memory client
# shares state within a kernel, the pages from different cells get mixed.
embeddings = OllamaEmbeddings(model="llama3")
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_name=f"ema_variation_{uuid4().hex}",
)

# rag_chain() reads the global `retriever`; rebuild it so queries hit this
# store rather than the one the retriever was originally created from.
retriever = vectorstore.as_retriever()

In [None]:
# rag_chain() already returns the model's answer for the question plus the
# retrieved context; it must not be passed through llm.invoke() again, which
# would make the model respond to its own answer instead of the question.
time_start = time()
response = rag_chain("What is the newly added indication for each medicine? The newly added or changed indication might be in bold or have strikethrough text.: ")
time_end = time()  # stop the clock before printing so only the RAG call is timed
print(response)
print(f"time taken: {round(time_end-time_start, 3)} sec.")

In [None]:
from uuid import uuid4

# Single URL to include in the RAG system (swap in one of the alternatives below)
url = 'https://www.ema.europa.eu/en/medicines/human/variation/braftovi'
#'https://www.ema.europa.eu/en/medicines/human/variation/arexvy'
#'https://www.ema.europa.eu/en/medicines/human/variation/beyfortus'
#'https://www.ema.europa.eu/en/medicines/human/variation/cresemba'

loader = WebBaseLoader(url)
doc = loader.load()

# Split the loaded documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(doc)

# Create Ollama embeddings and vector store.
# A fresh collection name keeps this single-page index from being combined with
# chunks stored by other cells under Chroma's default collection name.
embeddings = OllamaEmbeddings(model="llama3")
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_name=f"ema_single_variation_{uuid4().hex}",
)

# rag_chain() reads the global `retriever`; rebuild it so the next query is
# answered from this page only instead of the store it was first created from.
retriever = vectorstore.as_retriever()

In [None]:
# rag_chain() already queries llama3.1 and returns its answer; wrapping it in
# llm.invoke() would send that answer back to the model as a new prompt.
time_start = time()
response = rag_chain("What is the newly added indication for this medicine? The newly added or changed indication might be in bold or have strikethrough text.: ")
time_end = time()  # stop the clock before printing so only the RAG call is timed
print(response)
print(f"time taken: {round(time_end-time_start, 3)} sec.")

All text on web pages is extracted as plain text and stored as vectors, ignoring bold or strikethrough formatting. This makes it difficult to extract new indications from those pages.

## Extract the data in .pdf files

In [None]:
# Install PyPDF2, used below to read text out of PDF files (-qqq silences pip output).
!pip install pypdf2 -qqq

In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string made of each page's extracted text with no separator.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
        return "".join(page_texts)

In [None]:
from time import time

### New medicines

In [None]:
# Read the whole PDF from the mounted Drive into one string.
pdf_path = "/content/drive/MyDrive/adzynma-new medicine-info.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

In [None]:
time_start = time()
# Only the first 2000 characters of the extracted PDF text go into the prompt.
prompt = f"Extract the full therapeutic indication of the medicine and its patient group without summarization: \n\n{pdf_text[:2000]}"
response = llm.invoke(prompt)
print(response)
time_end = time()
elapsed = round(time_end - time_start, 3)
print(f"time taken: {elapsed} sec.")

The model summarizes the indication and cannot find this medicine's patient group.

### Existing medicines

#### From a single pdf file

In [None]:
# Read the whole PDF from the mounted Drive into one string.
a_pdf_path = "/content/drive/MyDrive/alecensa-newly added indication.pdf"
one_pdf_text = extract_text_from_pdf(a_pdf_path)

In [None]:
# Prompt typo fixed: the instruction previously read "each medicine medicine".
# Only the first 2000 characters of the extracted PDF text go into the prompt.
time_start = time()
response = llm.invoke(f"Extract the newly added therapeutic indication of each medicine and also its commission decision issued date : \n\n{one_pdf_text[:2000]}")
print(response)
time_end = time()
print(f"time taken: {round(time_end-time_start, 3)} sec.")

In [None]:
# Read the whole PDF from the mounted Drive into one string (overwrites the previous one_pdf_text).
a_pdf_path = "/content/drive/MyDrive/dupixent-newly added indication.pdf"
one_pdf_text = extract_text_from_pdf(a_pdf_path)

In [None]:
# Prompt typo fixed: the instruction previously read "each medicine medicine".
# Only the first 2000 characters of the extracted PDF text go into the prompt.
time_start = time()
response = llm.invoke(f"Extract the newly added therapeutic indication of each medicine and also its commission decision issued date : \n\n{one_pdf_text[:2000]}")
print(response)
time_end = time()
print(f"time taken: {round(time_end-time_start, 3)} sec.")

In [None]:
# Read the whole PDF from the mounted Drive into one string (overwrites the previous one_pdf_text).
a_pdf_path = "/content/drive/MyDrive/arexvy-no new indication even though it has one.pdf"
one_pdf_text = extract_text_from_pdf(a_pdf_path)

In [None]:
time_start = time()
# Only the first 2000 characters of the extracted PDF text go into the prompt.
prompt = f"If a new therapeutic indication has been recently added, extract the newly added therapeutic indication and the date of the commission's decision. The newly added indication may begin with 'extension of indication...': \n\n{one_pdf_text[:2000]}"
response = llm.invoke(prompt)
print(response)
time_end = time()
elapsed = round(time_end - time_start, 3)
print(f"time taken: {elapsed} sec.")

In [None]:
# Read the whole PDF from the mounted Drive into one string (overwrites the previous one_pdf_text).
a_pdf_path = "/content/drive/MyDrive/betmiga-no new indication even though it has one.pdf"
one_pdf_text = extract_text_from_pdf(a_pdf_path)

In [None]:
time_start = time()
# Only the first 2000 characters of the extracted PDF text go into the prompt.
prompt = f"If a new therapeutic indication has been recently added, extract the newly added therapeutic indication and the date of the commission's decision. The newly added indication may begin with 'extension of indication...': \n\n{one_pdf_text[:2000]}"
response = llm.invoke(prompt)
print(response)
time_end = time()
elapsed = round(time_end - time_start, 3)
print(f"time taken: {elapsed} sec.")

#### From a list of PDF files

In [None]:
# List of .pdf files
#pdfs_path = [
#    '/content/drive/MyDrive/alecensa-newly added indication.pdf',
#    '/content/drive/MyDrive/dupixent-newly added indication.pdf',
#    '/content/drive/MyDrive/arexvy-no new indication even though it has one.pdf',
#    '/content/drive/MyDrive/betmiga-no new indication even though it has one.pdf'
#]

# Function to extract the text from each pdf
#def extract_texts_from_pdfs(pdfs_path):
#    """Extract text from a single PDF file."""
#    with open(pdfs_path, "rb") as file:
#        reader = PyPDF2.PdfReader(file)
#        text = ""
#        for page in range(len(reader.pages)):
#            text += reader.pages[page].extract_text()
#    return text

# Extract text from all PDFs
#all_pdf_text = []
#for pdf in pdfs_path:
#    text = extract_texts_from_pdfs(pdf)
#    all_pdf_text.append(text)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a single PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages concatenated in page order. If any
        exception is raised while opening or reading the file, the error is
        printed and an empty string is returned instead.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_pages = PyPDF2.PdfReader(pdf_file).pages
            page_texts = [pdf_page.extract_text() for pdf_page in pdf_pages]
        # Joined inside the try block so a failure here is reported like any other error
        return "".join(page_texts)
    except Exception as err:
        print(f"Error processing {pdf_path}: {err}")
        return ""

# List of PDF files
pdfs_path = [
    '/content/drive/MyDrive/alecensa-newly added indication.pdf',
    '/content/drive/MyDrive/dupixent-newly added indication.pdf',
    '/content/drive/MyDrive/arexvy-no new indication even though it has one.pdf',
    '/content/drive/MyDrive/betmiga-no new indication even though it has one.pdf'
]

# Extract text from all PDFs (one string per file, in the same order as pdfs_path)
all_pdf_text = []
for pdf in pdfs_path:
    print(f"Processing: {pdf}")
    text = extract_text_from_pdf(pdf)
    print(f"Length of extracted text: {len(text)} characters")
    all_pdf_text.append(text)

# Verify the results by previewing the first 500 characters of each file.
# The loop variable is deliberately not called `pdf_text`: that global holds the
# text loaded in the "New medicines" PDF cell and would otherwise be overwritten.
for number, (source_path, extracted) in enumerate(zip(pdfs_path, all_pdf_text), start=1):
    print(f"\n--- Text from PDF {number} ({source_path}) ---")
    print(extracted[:500], "...")

In [None]:
# all_pdf_text is a list with one string per PDF. Slicing the list with [:2000]
# keeps whole list items (not characters), and formatting the list would insert
# its Python repr into the prompt. Build the prompt text explicitly instead:
# the first 2000 characters of each document, separated by a labelled header.
pdf_excerpts = "\n\n".join(
    f"--- Document {number} ---\n{text[:2000]}"
    for number, text in enumerate(all_pdf_text, start=1)
)

time_start = time()
response = llm.invoke(f"If a new therapeutic indication has been recently added, extract the newly added therapeutic indication and the date of the commission's decision of each medicine. The newly added indication may begin with 'extension of indication...': \n\n{pdf_excerpts}")
print(response)
time_end = time()
print(f"time taken: {round(time_end-time_start, 3)} sec.")

It seems the model cannot properly extract data from the merged text of all the PDF files.

---

# Hugging Face + LangChain

This approach takes much longer than using Ollama, even with a GPU.

It also exceeds the GPU usage limit while loading the embedding model.

In [None]:
# Upgrade transformers to at least 4.43.2.
!pip install "transformers>=4.43.2" --upgrade

In [None]:
# Install the LangChain packages imported below (-qqq silences pip output).
!pip install langchain langchain-core langchain_community -qqq

In [None]:
from torch import cuda

# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

print(device)

In [None]:
# Install the package behind the Chroma vector store (-qqq silences pip output).
!pip install chromadb -qqq

In [None]:
import transformers
import torch
from langchain.llms import HuggingFacePipeline
from bs4 import BeautifulSoup as bs
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

In [None]:
import os
from getpass import getpass

model_id = "meta-llama/Meta-Llama-3.1-8B"

# Never store the Hugging Face token in the notebook: anyone who can read the
# file can use it. The token that used to be hard-coded here should be revoked.
# Read it from the HF_TOKEN environment variable, or prompt for it (the input
# is not echoed) when the variable is unset or empty.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [None]:
# Build a text-generation pipeline for model_id, passing access_token for authentication.
# The model is requested in bfloat16 and device_map="auto" leaves device placement to the library.
pipeline = transformers.pipeline("text-generation", model=model_id, token=access_token, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")

In [None]:
# Wrap the transformers pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipeline)

In [None]:
# Chunker settings: 1000-character chunks with a 200-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Download the EPAR page for anzupgo
url = 'https://www.ema.europa.eu/en/medicines/human/EPAR/anzupgo'
loader = WebBaseLoader(url)
docs = loader.load()

# Cut the downloaded page into chunks for embedding
splits = text_splitter.split_documents(docs)

In [None]:
# Install sentence-transformers ahead of the HuggingFaceEmbeddings cell below (-qqq silences pip output).
!pip install sentence-transformers -qqq

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# NOTE(review): this points the embedding wrapper at the same 8B Llama checkpoint
# used for generation; the section notes report the GPU limit being exceeded at
# this step — confirm whether a smaller, dedicated embedding model is acceptable.
model_name = "meta-llama/Meta-Llama-3.1-8B"
# Embedding normalisation is explicitly switched off.
encode_kwargs = {'normalize_embeddings': False}
embedding_model = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

In [None]:
# Embed the page chunks with embedding_model and store them in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

In [None]:
# Retrieval side: expose the vector store as a retriever
retriever = vectorstore.as_retriever()

# Question-answering chain: the "stuff" chain type over the retriever, with the
# source documents returned alongside the answer
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)