# Medical Chat Bot - The Gale Encyclopedia of Medicine

## Extracting Text from the Book

In [1]:
from langchain_community.document_loaders import PyPDFLoader
# Build a loader for the encyclopedia PDF (path is relative to the working
# directory), then parse it into a list of LangChain documents.
pdfloader = PyPDFLoader(file_path='The_Gale_Encyclopedia_of_Medicine.pdf')
medicalEncyclopedia = pdfloader.load()

  from .autonotebook import tqdm as notebook_tqdm
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 6532 0 (offset 0)
Ignoring wrong pointing object 8115 0 (offset 0)
Ignoring wrong pointing object 8124 0 (offset 0)
Ignoring wrong pointing object 9468 0 (offset 0)
Ignoring wrong pointing object 9546 0 (offset 0)
Ignoring wrong pointing object 9554 0 (offset 0)
Ignoring wrong pointing object 22998 0 (offset 0)
Ignoring wrong pointing object 23548 0 (offset 0)
Ignoring wrong pointing object 23900 0 (offset 0)
Ignoring wrong pointing object 27752 0 (offset 0)
Ignoring wrong pointing object 29258 0 (offset 0)
Ignoring wrong pointing object 35179 0 (offset 0)
Ignoring wrong pointing object 35304 0 (offset 0)
Ignoring wrong pointing object 36262 0 (offset 0)
Ignoring wrong pointing object 36565 0 (offset 0)
Ignoring wrong pointing object 37762 0 (offset 0)
Ignoring wrong pointing object 37939 0 (offset 0)
Ignoring wrong pointing object 41227 0 (offset 0)
Ignoring 

## Cleaning the text

In [2]:
import re

def remove_headers_footers(text):
    """Strip running page headers/footers from a piece of extracted text.

    Two kinds of boilerplate are removed:

    * page markers of the form ``Page <number>`` (case-sensitive);
    * the running title ``GALE ENCYCLOPEDIA OF MEDICINE`` (any letter case)
      together with everything that follows it on the same line, including
      the line's terminating newline when there is one.

    Args:
        text: The text to clean.

    Returns:
        The text with the boilerplate removed.
    """
    text = re.sub(r'Page\s+\d+', '', text)
    # The running title may sit on the very last line with no trailing
    # newline, so end-of-text is accepted as a terminator as well as "\n".
    text = re.sub(
        r'GALE ENCYCLOPEDIA OF MEDICINE[^\n]*(?:\n|$)', '', text, flags=re.I
    )
    return text

def fix_hyphenation(text):
    """Rejoin words that were split with a hyphen at a line break.

    A word character, a hyphen, a newline and another word character in a
    row are treated as one broken word: the hyphen and the newline are
    dropped, so "anti-" + newline + "body" becomes "antibody".

    Args:
        text: The text to repair.

    Returns:
        The text with line-break hyphenation removed.
    """
    # Lookarounds keep the neighbouring word characters out of the match, so
    # back-to-back breaks (electro- / cardio- / gram) are all repaired in a
    # single pass instead of only every other one.
    return re.sub(r'(?<=\w)-\n(?=\w)', '', text)

def merge_lines(text):
    """Turn each isolated newline into a space.

    A newline counts as isolated when it is neither preceded nor followed by
    another newline; runs of two or more newlines are left untouched.
    """
    lone_newline = r'(?<!\n)\n(?!\n)'
    return re.sub(lone_newline, ' ', text)

def normalize_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    # str.split() with no argument splits on runs of whitespace and discards
    # leading/trailing empties, so re-joining with a single space performs
    # the collapse and the trim in one step.
    return ' '.join(text.split())

def remove_junk(text):
    """Delete publishing boilerplate from the text.

    Each pattern below is removed case-insensitively, one after another:
    an ``ISBN`` tag followed by a hyphen/en dash and digits, a span running
    from ``Copyright`` to the nearest following four-digit number on the
    same line, and the phrase ``All rights reserved``.
    """
    boilerplate = (
        r'ISBN[-–]\d+',
        r'Copyright.*?\d{4}',
        r'All rights reserved',
    )
    cleaned = text
    for regex in boilerplate:
        cleaned = re.sub(regex, '', cleaned, flags=re.I)
    return cleaned

def clean_medical_text(text):
    """Run the complete cleaning pipeline over a piece of text.

    The steps are applied in a fixed order: header/footer removal,
    de-hyphenation, line merging, junk removal and finally whitespace
    normalisation. Each step receives the output of the previous one.
    """
    pipeline = (
        remove_headers_footers,
        fix_hyphenation,
        merge_lines,
        remove_junk,
        normalize_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean every loaded document in place: the original page_content is
# overwritten with its cleaned version.
for doc in medicalEncyclopedia:
    cleaned_text = clean_medical_text(doc.page_content)
    doc.page_content = cleaned_text

In [3]:
# Number of loaded documents.
print("Length of Doc:",len(medicalEncyclopedia))

# Walk the documents in order and report each new running maximum of the
# per-document word count; `gt` holds the largest count seen so far.
gt = 0
for doc in medicalEncyclopedia:
    # split() with no argument returns [] for an empty string, whereas
    # split(" ") returns [''] and would count an empty document as one word.
    wordLen = len(doc.page_content.split())
    if wordLen > gt:
        print(f"Long Doc with word count: {wordLen}")
        gt = wordLen

Length of Doc: 4032
Long Doc with word count: 578
Long Doc with word count: 761
Long Doc with word count: 832
Long Doc with word count: 845
Long Doc with word count: 904
Long Doc with word count: 911
Long Doc with word count: 937
Long Doc with word count: 939
Long Doc with word count: 1020
Long Doc with word count: 1086


## Chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the cleaned documents into overlapping chunks for embedding.
recursiveSplitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,      # upper bound on chunk length (characters with the default length function)
    chunk_overlap = 200,    # amount shared between consecutive chunks
    # `separators` expects a list of strings that are tried in order. An
    # empty string is falsy and makes the splitter fall back to exactly this
    # default list, so it is spelled out here to make the behaviour explicit.
    separators=["\n\n", "\n", " ", ""],
)

chunkedEncyclopedia = recursiveSplitter.split_documents(medicalEncyclopedia)
print("Length of chunked Doc:",len(chunkedEncyclopedia))

Length of chunked Doc: 21188


## Embedding and Vector Store - Pinecone

In [5]:
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
from pinecone import Pinecone
import os

# Load environment variables (API keys) via python-dotenv before any client
# is constructed.
load_dotenv()

# Embedding model used to vectorise text for the store.
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Connect to Pinecone with the key taken from the environment and open the
# index named 'medical-chatbot'.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
index = pc.Index('medical-chatbot')

# Wrap the index and the embedding model in a LangChain vector store; the
# bare name on the last line displays the object as the cell output.
vector_store = PineconeVectorStore(embedding=embeddings, index=index)
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x132526120>