In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup
import pdfplumber
import pubchempy as pcp
import pandas as pd
from PyPDF2 import PdfReader
import time
import pandas as pd
import unicodedata

In [None]:
#Fetch the main page that lists the INN publications
main_url = "https://www.who.int/teams/health-product-and-policy-standards/inn/inn-lists"
#A timeout keeps the notebook from hanging forever on a stalled connection
response = requests.get(main_url, timeout=30)
#Stop here on an HTTP error instead of parsing an error page as if it were the listing
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
#Gather every anchor that carries an href, then keep only the hrefs
#containing the 'inn-pl-' marker
links = soup.find_all('a', href=True)

all_hrefs = [anchor['href'] for anchor in links]
filtered_links = [href for href in all_hrefs if 'inn-pl-' in href]

In [None]:
#URL prefix that identifies the document links we want to collect
cdn_prefix = "https://cdn.who.int/media/docs/default-source/international-nonproprietary-names-(inn)"

cdn_links = []
#Tracks links already collected so the same document is not processed twice
seen_cdn_links = set()

#Iterate over each link in filtered_links
for link in filtered_links:
    #Construct the full URL (site-relative hrefs are resolved against www.who.int)
    full_url = link if link.startswith("http") else f"https://www.who.int{link}"

    #Fetch the page; a single failing page is reported and skipped so the
    #remaining pages are still crawled
    try:
        page_response = requests.get(full_url, timeout=30)
        page_response.raise_for_status()
    except requests.RequestException as exc:
        print(f" Could not fetch {full_url}: {exc}. Skipping...")
        continue

    #Use dedicated names so the main page's `soup`/`response` are not overwritten
    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    #Keep hrefs that start with the desired prefix, preserving first-seen order
    for anchor in page_soup.find_all('a', href=True):
        href = anchor['href']
        if href.startswith(cdn_prefix) and href not in seen_cdn_links:
            seen_cdn_links.add(href)
            cdn_links.append(href)

#Print the collected links
print(f"\n Links starting with '{cdn_prefix}':")
print(cdn_links)

In [None]:
#Scratch folder that holds each PDF while it is being processed;
#created up front, and an already existing folder is not an error
download_dir = "temp_pdf"
os.makedirs(name=download_dir, exist_ok=True)

#Check whether a downloaded file can be opened as a PDF
def is_valid_pdf(file_path):
    """Return True if PdfReader can open `file_path`, False otherwise.

    Any exception raised while constructing the reader (missing file,
    unparsable content, ...) is treated as "not a valid PDF".
    """
    try:
        PdfReader(file_path)
    except Exception:
        return False
    else:
        return True

#Extract chemical names with filtering logic
def extract_filtered_chemical_names(pdf_path):
    """Collect candidate chemical names from the text of a PDF.

    For every text line of every page, the leading run of letters (plain
    ASCII or the accented letters listed in the pattern) and hyphens is
    taken as a candidate. Accents are stripped by NFKD-normalizing and
    dropping non-ASCII code points. A candidate is kept only when it:

    * is longer than 4 characters,
    * does not start or end with a hyphen, and
    * contains no uppercase letter.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A sorted list of the unique candidates that passed the filters.
    """
    #Compiled once instead of being looked up again for every line
    name_pattern = re.compile(r'^([A-Za-záéíóúÁÉÍÓÚñÑüÜ-]+)')
    unique_names = set()

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            #Pages without extractable text yield nothing to scan
            if not text:
                continue
            for line in text.split('\n'):
                match = name_pattern.match(line.strip())
                if not match:
                    continue
                #Strip accents: decompose, then drop the non-ASCII marks
                normalized = (
                    unicodedata.normalize('NFKD', match.group(1))
                    .encode('ASCII', 'ignore')
                    .decode('utf-8')
                )
                #Apply filtering rules (the pattern cannot match digits,
                #so no digit check is needed)
                if len(normalized) <= 4:
                    continue
                if normalized.startswith('-') or normalized.endswith('-'):
                    continue
                if any(char.isupper() for char in normalized):
                    continue
                unique_names.add(normalized)

    return sorted(unique_names)

#Look up a single name on PubChem, retrying while the server reports it is busy
def _lookup_compound(name, delay, max_retries):
    """Return a result row for `name`, or None if nothing usable was found.

    A 'PUGREST.ServerBusy' error triggers a wait of `delay` seconds followed
    by another attempt, up to `max_retries` extra attempts. Any other
    PubChem HTTP error, or running out of retries, is reported and gives None.
    """
    for attempt in range(max_retries + 1):
        try:
            results = pcp.get_compounds(name, 'name')
        except pcp.PubChemHTTPError as e:
            if 'PUGREST.ServerBusy' in str(e) and attempt < max_retries:
                print(f" Server busy for {name}, retrying...")
                time.sleep(delay)
                continue
            print(f" Failed to process {name}: {e}")
            return None

        if not results:
            return None

        #Only the first match returned by PubChem is recorded
        compound = results[0]
        return {
            "Name": name,
            "PubChem ID": compound.cid,
            "Molecular Formula": compound.molecular_formula,
            "Molecular Weight": compound.molecular_weight,
            "IUPAC Name": compound.iupac_name
        }
    return None


#PubChem search and batch process
def batch_process(names, batch_size=10, delay=5, max_retries=3):
    """Query PubChem for each name, in batches separated by a pause.

    Args:
        names: Sequence of compound names to look up.
        batch_size: Number of names handled between two pauses.
        delay: Seconds to wait between batches, and before each retry of
            a name that hit a 'PUGREST.ServerBusy' error.
        max_retries: How many times a name is retried after a server-busy
            error before it is given up on.

    Returns:
        A list of dicts with the keys "Name", "PubChem ID",
        "Molecular Formula", "Molecular Weight" and "IUPAC Name", one per
        name for which PubChem returned at least one compound.
    """
    found_data = []
    total_batches = (len(names) + batch_size - 1) // batch_size
    for batch_num in range(total_batches):
        start = batch_num * batch_size
        batch = names[start:start + batch_size]
        print(f"\n Processing batch {batch_num + 1}/{total_batches} ({len(batch)} items)...")

        for name in batch:
            record = _lookup_compound(name, delay, max_retries)
            if record is not None:
                found_data.append(record)

        #No point in pausing once the final batch is done
        if batch_num < total_batches - 1:
            print(f" Waiting {delay} seconds before next batch...")
            time.sleep(delay)
    return found_data

#Iterate over each link in `cdn_links`: download, validate, extract names,
#query PubChem and write one CSV per PDF
for link in cdn_links:
    #Extract the file name from the link (last path segment, query string removed)
    pdf_name = link.split("/")[-1].split("?")[0]
    if not pdf_name:
        #A link ending in "/" has no file name to save under
        print(f" Could not derive a file name from {link}. Skipping...")
        continue
    pdf_path = os.path.join(download_dir, pdf_name)

    #Download the PDF; a failed download is reported and skipped so the
    #remaining documents are still processed
    print(f"\n Downloading: {link}")
    try:
        response = requests.get(link, timeout=60)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f" Failed to download {link}: {exc}. Skipping...")
        continue

    with open(pdf_path, 'wb') as file:
        file.write(response.content)

    #try/finally guarantees the temporary file is removed even when
    #validation or processing fails part-way through
    try:
        #Validate the PDF
        if not is_valid_pdf(pdf_path):
            print(f" File {pdf_name} is not a valid PDF. Skipping...")
            continue

        print(f"{pdf_name} is a valid PDF. Proceeding with processing...")

        #Extract chemical names
        chemical_names = extract_filtered_chemical_names(pdf_path)
        print(f"Extracted chemicals count: {len(chemical_names)}")

        #Process chemical names in batches and save results
        processed_data = batch_process(chemical_names, batch_size=50, delay=2)

        #Save results to a CSV file named after the PDF; splitext removes only
        #the final extension, so dots inside the name are kept
        output_csv = f"{os.path.splitext(pdf_name)[0]}_chemical_data.csv"
        pd.DataFrame(processed_data).to_csv(output_csv, index=False)
        print(f"Data saved to {output_csv}")
    finally:
        #Delete the temporary PDF file
        os.remove(pdf_path)
        print(f"Deleted temporary file: {pdf_name}")