In [7]:
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path

# Common names of the small molecules to look up on PubChem (one lookup each).
molecule_names = [
    "methane",
    "water",
    "ammonia",
    "carbon dioxide",
    "ethane",
    "benzene",
    "acetone",
    "ethanol",
    "acetic acid",
    "glucose",
]

def download_and_convert_to_xyz(molecule_name, mol_id):
    """Download a molecule's 3D structure from PubChem and save it as an XYZ file.

    The first PubChem hit for ``molecule_name`` is used. Its 3D SDF record is
    downloaded into a temporary directory, parsed with RDKit (explicit
    hydrogens are kept) and written to
    ``pubchem_xyz_structures/molecule_<mol_id>_<name>.xyz`` where spaces in
    the name are replaced by underscores.

    Args:
        molecule_name: Name used for the PubChem name search.
        mol_id: Local identifier embedded in the output file name.

    Returns:
        ``(xyz_filepath, pubchem_cid)`` on success, or ``None`` when the
        molecule is not found, cannot be parsed, or any other error occurs.
        The reason is printed; no exception is propagated.
    """
    import tempfile  # local import: only this function needs a scratch directory

    try:
        # Search for the molecule and keep the first hit.
        compounds = pcp.get_compounds(molecule_name, 'name')
        if not compounds:
            print(f"Molecule {molecule_name} not found")
            return None

        compound = compounds[0]

        # Fetch the 3D SDF record into a scratch directory that is removed
        # automatically, instead of a hard-coded (non-portable) /tmp path.
        with tempfile.TemporaryDirectory() as scratch_dir:
            sdf_path = Path(scratch_dir) / f"{compound.cid}.sdf"
            pcp.download('SDF', str(sdf_path), compound.cid, record_type='3d', overwrite=True)
            sdf_content = sdf_path.read_text()

        if not sdf_content:
            print(f"No 3D SDF content for {molecule_name}")
            return None

        # removeHs=False is essential: by default RDKit strips explicit
        # hydrogens while parsing, which would drop every H atom from the
        # XYZ output (methane would be written as a lone carbon).
        mol = Chem.MolFromMolBlock(sdf_content, removeHs=False)
        if mol is None:
            print(f"Could not parse molecule {molecule_name}")
            return None

        if mol.GetNumAtoms() == 0:
            print(f"No atoms found in 3D record for {molecule_name}")
            return None

        # The 3D record should already carry coordinates; only when no
        # conformer is present do we add hydrogens and generate one.
        if mol.GetNumConformers() == 0:
            mol = Chem.AddHs(mol)
            if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
                print(f"Could not generate 3D coordinates for {molecule_name}")
                return None
            AllChem.MMFFOptimizeMolecule(mol)

        # Build the XYZ text: atom count, comment line, then one line per atom.
        conf = mol.GetConformer()
        xyz_lines = [
            str(mol.GetNumAtoms()),
            f"{molecule_name} - PubChem CID: {compound.cid}",
        ]
        for atom in mol.GetAtoms():
            pos = conf.GetAtomPosition(atom.GetIdx())
            xyz_lines.append(
                f"{atom.GetSymbol():2s} {pos.x:12.6f} {pos.y:12.6f} {pos.z:12.6f}"
            )

        # Save XYZ file (terminated with a newline).
        xyz_dir = Path("pubchem_xyz_structures")
        xyz_dir.mkdir(exist_ok=True)

        filename = f"molecule_{mol_id}_{molecule_name.replace(' ', '_')}.xyz"
        filepath = xyz_dir / filename
        with open(filepath, 'w') as f:
            f.write('\n'.join(xyz_lines) + '\n')

        print(f"Downloaded and saved: {filename}")
        return str(filepath), compound.cid

    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip
        # this molecule rather than abort the whole run.
        print(f"Error processing {molecule_name}: {str(e)}")
        return None

# Synthetic free-energy values: ten seeded draws from a normal distribution
# (mean 50, std 10), made negative, paired with molecule IDs 1..10.
num_molecules = 10
molecule_ids = np.arange(num_molecules) + 1
np.random.seed(42)
free_energies = np.negative(np.abs(np.random.normal(50, 10, num_molecules)))

# Create enhanced database
def create_enhanced_database():
    """Create ``molecules_pubchem.db`` and its ``molecules`` table if missing.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this on an
    existing database leaves the table and its rows untouched. The connection
    is always closed, even if table creation fails.
    """
    conn = sqlite3.connect('molecules_pubchem.db')
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS molecules (
                molecule_id INTEGER PRIMARY KEY,
                molecule_name TEXT,
                pubchem_cid INTEGER,
                free_energy REAL,
                xyz_filepath TEXT,
                molecular_formula TEXT,
                molecular_weight REAL,
                created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Release the file handle even when execute/commit raises.
        conn.close()

# Download molecules and populate database
def populate_database_with_pubchem():
    """Download each molecule and insert or replace its row in the database.

    Iterates over ``molecule_ids``, ``molecule_names`` and ``free_energies``
    in parallel. For every molecule whose structure download succeeds, the
    molecular formula and weight are fetched from PubChem (falling back to
    ``"Unknown"`` / ``0.0`` on failure) and the row is written with
    ``INSERT OR REPLACE`` and committed immediately. Molecules that fail to
    download are skipped with a message; per-row database errors are printed
    and rolled back without stopping the loop.
    """
    create_enhanced_database()

    # sqlite3's ``with connection:`` only scopes a transaction and never
    # closes the connection, so open/close explicitly instead.
    conn = sqlite3.connect('molecules_pubchem.db', timeout=30.0)
    try:
        cursor = conn.cursor()

        for mol_id, molecule_name, free_energy in zip(molecule_ids, molecule_names, free_energies):
            print(f"Processing molecule {mol_id}: {molecule_name}")
            result = download_and_convert_to_xyz(molecule_name, mol_id)

            if not result:
                print(f"Skipped {molecule_name} due to download error")
                continue

            filepath, cid = result

            # Get additional molecular information
            try:
                compound = pcp.Compound.from_cid(cid)
                formula = compound.molecular_formula
                # The weight is converted with float(); a missing value is
                # stored as 0.0.
                weight = float(compound.molecular_weight) if compound.molecular_weight else 0.0
            except Exception as e:
                print(f"Could not get compound info for {molecule_name}: {e}")
                formula = "Unknown"
                weight = 0.0

            try:
                # Convert numpy scalars to plain Python int/float before
                # binding them as SQL parameters.
                insert_data = (
                    int(mol_id),
                    molecule_name,
                    int(cid),
                    float(free_energy),
                    filepath,
                    formula,
                    weight,
                )

                cursor.execute('''
                    INSERT OR REPLACE INTO molecules 
                    (molecule_id, molecule_name, pubchem_cid, free_energy, xyz_filepath, 
                     molecular_formula, molecular_weight)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', insert_data)

                # Commit after each insertion to avoid long transactions
                conn.commit()
                print(f"Added {molecule_name} to database")

            except Exception as e:
                print(f"Database error for {molecule_name}: {e}")
                conn.rollback()

        print("Enhanced database creation completed!")
    finally:
        conn.close()

# Query function
def query_enhanced_database():
    """Return the ``molecules`` table as a DataFrame ordered by ``molecule_id``.

    Returns:
        A DataFrame with columns ``molecule_id``, ``molecule_name``,
        ``pubchem_cid``, ``molecular_formula``, ``molecular_weight``,
        ``free_energy`` and ``xyz_filepath``. If the query fails (for example
        because the table does not exist), the error is printed and an empty
        DataFrame is returned instead.
    """
    try:
        conn = sqlite3.connect('molecules_pubchem.db', timeout=30.0)
        try:
            return pd.read_sql_query('''
                SELECT molecule_id, molecule_name, pubchem_cid, molecular_formula, 
                       molecular_weight, free_energy, xyz_filepath 
                FROM molecules 
                ORDER BY molecule_id
            ''', conn)
        finally:
            # ``with connection:`` would not close it; do so explicitly.
            conn.close()
    except Exception as e:
        print(f"Database query error: {e}")
        return pd.DataFrame()

# Additional utility function to check database status
def check_database_status():
    """Report whether the database file, the ``molecules`` table and rows exist.

    Returns:
        ``True`` only when ``molecules_pubchem.db`` exists, contains a
        ``molecules`` table, and that table has at least one row. Any other
        state, or any error while accessing the database, prints a message
        and returns ``False``.

    Note:
        A table with any non-zero row count is treated as populated; the
        check does not verify that every expected molecule is present.
    """
    db_path = Path('molecules_pubchem.db')
    if not db_path.exists():
        print("Database file not found.")
        return False

    try:
        conn = sqlite3.connect(db_path, timeout=5.0)
        try:
            cursor = conn.cursor()

            # Check if the table exists
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='molecules'")
            if cursor.fetchone() is None:
                print("Database exists, but 'molecules' table is missing.")
                return False

            # Check if the table has any rows
            cursor.execute("SELECT COUNT(*) FROM molecules")
            count = cursor.fetchone()[0]
            if count > 0:
                print(f"Database is accessible and contains {count} molecules.")
                return True

            print("Database and table exist, but are empty.")
            return False
        finally:
            # ``with connection:`` would not close it; do so explicitly.
            conn.close()
    except Exception as e:
        print(f"Database access error: {e}")
        return False

# Entry point: fill the database only when it has no rows, then print its contents.
print("Starting PubChem downloader script...")

# The status check decides whether PubChem needs to be contacted at all.
if not check_database_status():
    print("Database is empty or missing. Starting fresh download...")
    populate_database_with_pubchem()
else:
    print("Database is already populated. Skipping download.")

# Read everything back so the cell shows what is actually stored.
print("\nFinal database contents:")
enhanced_df = query_enhanced_database()
print(enhanced_df)

print("\nPubChem downloader completed!")


Starting PubChem downloader script...
Database file not found.
Database is empty or missing. Starting fresh download...
Processing molecule 1: methane
Downloaded and saved: molecule_1_methane.xyz
Added methane to database
Processing molecule 2: water
Downloaded and saved: molecule_2_water.xyz
Added water to database
Processing molecule 3: ammonia
Downloaded and saved: molecule_3_ammonia.xyz
Added ammonia to database
Processing molecule 4: carbon dioxide




Downloaded and saved: molecule_4_carbon_dioxide.xyz
Added carbon dioxide to database
Processing molecule 5: ethane
Downloaded and saved: molecule_5_ethane.xyz
Added ethane to database
Processing molecule 6: benzene
Downloaded and saved: molecule_6_benzene.xyz
Added benzene to database
Processing molecule 7: acetone
Downloaded and saved: molecule_7_acetone.xyz
Added acetone to database
Processing molecule 8: ethanol
Downloaded and saved: molecule_8_ethanol.xyz
Added ethanol to database
Processing molecule 9: acetic acid
Downloaded and saved: molecule_9_acetic_acid.xyz
Added acetic acid to database
Processing molecule 10: glucose
Downloaded and saved: molecule_10_glucose.xyz
Added glucose to database
Enhanced database creation completed!

Final database contents:
   molecule_id   molecule_name  pubchem_cid molecular_formula  \
0            1         methane          297               CH4   
1            2           water          962               H2O   
2            3         ammonia   

In [8]:
enhanced_df

Unnamed: 0,molecule_id,molecule_name,pubchem_cid,molecular_formula,molecular_weight,free_energy,xyz_filepath
0,1,methane,297,CH4,16.043,-54.967142,pubchem_xyz_structures/molecule_1_methane.xyz
1,2,water,962,H2O,18.015,-48.617357,pubchem_xyz_structures/molecule_2_water.xyz
2,3,ammonia,222,H3N,17.031,-56.476885,pubchem_xyz_structures/molecule_3_ammonia.xyz
3,4,carbon dioxide,280,CO2,44.009,-65.230299,pubchem_xyz_structures/molecule_4_carbon_dioxi...
4,5,ethane,6324,C2H6,30.07,-47.658466,pubchem_xyz_structures/molecule_5_ethane.xyz
5,6,benzene,241,C6H6,78.11,-47.65863,pubchem_xyz_structures/molecule_6_benzene.xyz
6,7,acetone,180,C3H6O,58.08,-65.792128,pubchem_xyz_structures/molecule_7_acetone.xyz
7,8,ethanol,702,C2H6O,46.07,-57.674347,pubchem_xyz_structures/molecule_8_ethanol.xyz
8,9,acetic acid,176,C2H4O2,60.05,-45.305256,pubchem_xyz_structures/molecule_9_acetic_acid.xyz
9,10,glucose,5793,C6H12O6,180.16,-55.4256,pubchem_xyz_structures/molecule_10_glucose.xyz
