In [1]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://photochemcad.com/databases/common-compounds/aromatic-hydrocarbons/"
OUTPUT_DIR = "PhotochemCAD_Data"
CSV_FILE = "photochemcad_data.csv"

os.makedirs(OUTPUT_DIR, exist_ok=True)



In [2]:
def get_molecule_links(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = [urljoin(base_url, a['href']) for a in soup.select(".entry-content a[href]")]
    return links

In [3]:

def extract_data(molecule_url):
    """Extrait les données de chaque molécule et télécharge les fichiers associés."""
    response = requests.get(molecule_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    data = {}
    
    # Extraction des données du tableau
    table = soup.find("table")
    rows = table.find_all("tr")
    for row in rows:
        cells = row.find_all("td")
        if len(cells) == 2:
            key = cells[0].text.strip().replace(":", "")
            value = cells[1].text.strip()
            data[key] = value
    
    # Récupération du nom pour les fichiers
    molecule_name = data.get("Name", "Unknown").replace(" ", "_")

    # Téléchargement des fichiers disponibles
    file_links = soup.select(".entry-content a[href]")
    for link in file_links:
        file_url = urljoin(molecule_url, link['href'])
        file_name = link.text.strip().replace(" ", "_")
        extension = os.path.splitext(file_url)[-1]
        full_file_name = f"{OUTPUT_DIR}/{file_name}-{molecule_name}{extension}"
        
        # Téléchargement du fichier
        try:
            file_response = requests.get(file_url)
            with open(full_file_name, 'wb') as f:
                f.write(file_response.content)
            print(f"✅ Fichier téléchargé : {full_file_name}")
        except Exception as e:
            print(f"❌ Erreur lors du téléchargement de {file_url} : {e}")
    
    return data

In [4]:
def main():
    # Récupération des liens de molécules
    molecule_links = get_molecule_links(BASE_URL)
    all_data = []

    # Extraction des données pour chaque molécule
    for link in molecule_links:
        try:
            print(f"🔄 Extraction des données pour : {link}")
            molecule_data = extract_data(link)
            all_data.append(molecule_data)
        except Exception as e:
            print(f"❌ Erreur lors de l'extraction pour {link} : {e}")

    # Sauvegarde des données dans un CSV
    df = pd.DataFrame(all_data)
    df.to_csv(CSV_FILE, index=False)
    print(f"\n📁 Données sauvegardées dans {CSV_FILE}")

In [5]:
if __name__ == "__main__":
    main()


📁 Données sauvegardées dans photochemcad_data.csv


In [6]:
import subprocess
from datetime import datetime

def run_command(command):
    subprocess.run(command, shell=True, check=True)

run_command("git add .")

# Commit automatique avec la date et l'heure
commit_message = f"Auto commit {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
run_command(f"git commit -m \"{commit_message}\"")

# Pousser vers GitHub
run_command("git push origin main")

[main b167d51] Auto commit 2025-05-14 17:13:00
 Committer: Victor Carré <victorcarre@Mac-mini-de-Victor.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly:

    git config --global user.name "Your Name"
    git config --global user.email you@example.com

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 .DS_Store
 rename Untitled.ipynb => Scrap.ipynb (100%)


To https://github.com/victorcarre6/photocatalysts-database.git
   2fa1278..b167d51  main -> main


In [2]:
import sys
print(sys.executable)


/Library/Developer/CommandLineTools/usr/bin/python3


In [3]:
source /Users/victorcarre/myenv/bin/activate


NameError: name 'source' is not defined