In [3]:
import requests
from bs4 import BeautifulSoup
import csv

In [None]:
base_url = "https://photochemcad.com/databases"

urls_to_scrape = [
    "/common-compounds/aromatic-hydrocarbons",
    "/common-compounds/heterocycles",  
    "/common-compounds/biomolecules",
    "/common-compounds/quinones",
    "/common-compounds/coumarins",
    "/common-compounds/acridines",
    "/common-compounds/azo-dyes",
    "/common-compounds/cyanine-dyes",
    "/common-compounds/arylmethane-dyes",
    "/common-compounds/perylenes",
    "/common-compounds/xanthenes",
    "/common-compounds/miscellaneous-dyes",
    "/common-compounds/dipyrrins",
    "/common-compounds/porphyrins",
    "/common-compounds/oligopyrroles",
    "/common-compounds/phthalocyanines",
    "/common-compounds/chlorins-bacteriochlorins",
    "/naturally-derived-porphyrins",
    "/natural-chlorophylls",
    "/tolyporphins",
    "/flavonoids",
    "/phyllobilins",
    "/open-chain-tetrapyrrole"
]

In [None]:
# Fonction pour obtenir les URLs des composés à partir d'une page donnée
def get_compound_urls_from_page(url):
    print(f"Requesting URL: {url}")
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve {url} (Status code: {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Trouver tous les liens pertinents (liens vers les pages des molécules)
    compound_links = soup.find_all('a', href=True)
    print(f"Found {len(compound_links)} links on {url}")

    compound_urls = []
    for link in compound_links:
        href = link['href']
        
        # Si le lien est relatif, ajouter le base_url
        if not href.startswith('http'):
            full_url = base_url + href if href.startswith('/') else href
        else:
            full_url = href

        # Filtrer les liens pertinents
        if "common-compounds" in full_url:
            compound_urls.append(full_url)

    return compound_urls



In [None]:
# Fonction principale pour scraper toutes les pages
def main():
    all_urls = []

    # Pour chaque page spécifiée dans urls_to_scrape
    for url in urls_to_scrape:
        # Si l'URL est relative, la compléter avec le base_url
        full_url = url if url.startswith('http') else base_url + url
        print(f"Scraping {full_url}...")

        # Extraire les URLs des composés sur cette page
        compound_urls = get_compound_urls_from_page(full_url)
        all_urls.extend(compound_urls)

    # Vérification si des URLs ont été récupérées et écriture dans un fichier CSV
    if not all_urls:
        print("Aucun lien de molécule trouvé.")
        return

    with open('compound_urls.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Compound URL"])  # En-tête
        for url in all_urls:
            writer.writerow([url])  # Ajouter chaque URL au CSV

if __name__ == "__main__":
    main()

In [4]:
# Fonction pour extraire les données d'une molécule
def extract_molecule_data(compound_url):
    print(f"Extracting data from {compound_url}")
    response = requests.get(compound_url)

    if response.status_code != 200:
        print(f"Failed to retrieve {compound_url} (Status code: {response.status_code})")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extraire tous les tableaux sur la page
    data_tables = soup.find_all('table')
    if not data_tables:
        print(f"Aucune table trouvée pour {compound_url}")
        return None

    # Dictionnaire pour stocker les données
    data = {
        "PhotochemCAD ID": "",
        "Compound Category": "",
        "Name": "",
        "Synonym": "",
        "CAS": "",
        "Source": "",
        "Absorption Wavelength": "",
        "Absorption epsilon": "",
        "Absorption coefficient": "",
        "Quantum Yield": "",
        "Solvent": "",
    }

    # Remplissage des données à partir des tableaux
    for table in data_tables:
        rows = table.find_all('tr')
        for row in rows:
            header_cell = row.find('th')
            data_cell = row.find('td')

            if header_cell and data_cell:
                key = header_cell.text.strip()
                value = data_cell.text.strip()

                # Vérifier si c'est une valeur qu'on veut
                if key in data:
                    # Gérer les liens (ex. source ou fichiers)
                    if data_cell.find('a'):
                        link = data_cell.find('a').get('href')
                        # Compléter l'URL si nécessaire
                        if link.startswith("/"):
                            value = f"https://photochemcad.com{link}"
                    data[key] = value

    return data




In [5]:
# Fonction principale pour lire les URLs et extraire les informations
def main():
    # Lire les URLs à partir du fichier CSV
    compound_urls = []
    with open('compound_urls.csv', 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Ignorer l'en-tête
        compound_urls = [row[0].replace('/databases/databases/', '/databases/') for row in reader]

    # Liste pour stocker les données de toutes les molécules
    all_molecule_data = []

    # Extraire les informations pour chaque URL
    for url in compound_urls:
        data = extract_molecule_data(url)
        if data:
            all_molecule_data.append(data)

    # Vérifier si des données ont été extraites et écrire dans un fichier CSV
    if not all_molecule_data:
        print("Aucune donnée n'a été extraite.")
        return

    with open('molecule_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            "PhotochemCAD ID", "Compound Category", "Name", "Synonym", "CAS", "Source",
            "Absorption Wavelength", "Absorption epsilon", "Absorption coefficient", 
            "Quantum Yield", "Solvent"
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()  # Écriture de l'en-tête
        writer.writerows(all_molecule_data)  # Écriture des données

    print("Extraction terminée. Les données sont enregistrées dans 'molecule_data.csv'.")

if __name__ == "__main__":
    main()

Extracting data from https://photochemcad.com/databases/common-compounds
Aucune table trouvée pour https://photochemcad.com/databases/common-compounds
Extracting data from https://photochemcad.com/databases/spectra-viewers/common-compounds
Extracting data from https://photochemcad.com/databases/common-compounds
Aucune table trouvée pour https://photochemcad.com/databases/common-compounds
Extracting data from https://photochemcad.com/databases/common-compounds/aromatic-hydrocarbons/benzene
Extracting data from https://photochemcad.com/databases/common-compounds/aromatic-hydrocarbons/toluene
Extracting data from https://photochemcad.com/databases/common-compounds/aromatic-hydrocarbons/o-xylene
Extracting data from https://photochemcad.com/databases/common-compounds/aromatic-hydrocarbons/m-xylene
Extracting data from https://photochemcad.com/databases/common-compounds/aromatic-hydrocarbons/p-xylene
Extracting data from https://photochemcad.com/databases/common-compounds/aromatic-hydrocarb