In [98]:
import csv
import os
import time

import pandas as pd
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Function to Get the Structural Formula and Save the .sdf File of a Compound

In [99]:
def get_single_molecule_data(link):
    """Fetch the SMILES string of one compound and download its 3D .sdf file.

    Opens ``link`` in a fresh Chrome session whose download directory is
    ``<current working directory>/3d_sdf_files`` (created if missing), reads
    the SMILES text from the rendered page and, when the page exposes a 3D
    SDF link, opens that link in a new tab so that Chrome saves the file.

    Parameters
    ----------
    link : str
        URL of the compound page. An empty (or ``None``) link is skipped.

    Returns
    -------
    str
        The SMILES text found on the page, or ``''`` when the link is empty
        or the page has no SMILES entry.
    """
    if not link:
        return ''

    # Directory path to save the .sdf files
    download_dir = os.path.join(os.getcwd(), "3d_sdf_files")
    os.makedirs(download_dir, exist_ok=True)

    # Options are required to automate the download process
    chrome_options = Options()
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })

    # Open the web browser; the finally clause guarantees it is closed even
    # when loading or parsing the page fails.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(link)
        time.sleep(2)  # give the page time to render

        # Get the SMILES text (structural formula) of the compound.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # `string=` replaces the deprecated `text=` keyword of find().
        smiles_label = soup.find("strong", string="SMILES:")
        # The value sits two siblings after the label; any step may be missing.
        first_sibling = smiles_label.find_next_sibling() if smiles_label is not None else None
        value_tag = first_sibling.find_next_sibling() if first_sibling is not None else None
        if value_tag is None:
            print(f'SMILES Text Not available for {link}')
            smiles_text = ''
        else:
            smiles_text = value_tag.text

        # Download the .sdf file of the compound (best effort: a failure here
        # must not discard the SMILES text that was already extracted).
        try:
            sdf_link = driver.find_element(
                By.XPATH, '//a[contains(@href, "3D/SDF") and contains(@href, ".sdf")]'
            )
            sdf_url = sdf_link.get_attribute("href")

            # Opening the URL in a new tab triggers the automatic download.
            driver.execute_script("window.open(arguments[0]);", sdf_url)
            driver.switch_to.window(driver.window_handles[-1])
            time.sleep(5)  # wait for the download to finish
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        except WebDriverException as exc:
            print(f'Could not download the .sdf file for {link}: {type(exc).__name__}')
    finally:
        driver.quit()

    return smiles_text


### Function to Get the Data of all Compounds for a Particular Species

In [100]:
def get_all_data_of_a_species(url):
    """Scrape every row of the paginated compound table for one species.

    Opens ``url`` in Chrome, walks the table page by page via the element
    with id ``table_id_next`` and, for each row, calls
    ``get_single_molecule_data`` on the link found in the 4th column.

    Parameters
    ----------
    url : str
        URL of the species page that contains the compound table.

    Returns
    -------
    list[list[str]]
        One list per table row: the stripped text of every cell, followed by
        the value returned by ``get_single_molecule_data`` and the compound
        URL (``''`` when the row has no link).
    """
    driver = webdriver.Chrome()
    all_data = []
    # The finally clause closes the browser even if scraping fails part-way.
    try:
        driver.get(url)
        time.sleep(1)

        wait = WebDriverWait(driver, 10)

        # Keep going until the "next" button is disabled (or absent).
        while True:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody")))
            rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            # Get the data of all columns within all rows.
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                row_data = [cell.text.strip() for cell in cells]

                # The 4th column holds the compound link. A separate name is
                # used so the `url` parameter is not overwritten.
                compound_url = ""
                if len(cells) >= 4:
                    try:
                        link_tag = cells[3].find_element(By.TAG_NAME, "a")
                    except NoSuchElementException:
                        pass  # no link in this row
                    else:
                        compound_url = link_tag.get_attribute("href") or ""

                # Get the structural formula and save the .sdf file of the compound.
                smiles_text = get_single_molecule_data(compound_url)

                # Store the data.
                row_data.append(smiles_text)
                row_data.append(compound_url)
                all_data.append(row_data)

            # If the table is not finished, go to the next page.
            try:
                next_btn = driver.find_element(By.ID, "table_id_next")
            except NoSuchElementException:
                break  # no pagination control: nothing further to visit
            if "disabled" in (next_btn.get_attribute("class") or ""):
                break
            next_btn.click()
            time.sleep(2)
    finally:
        driver.quit()

    return all_data

### Save the data of all Compounds to CSV File

In [106]:
def save_data_of_all_compounds(data, file_path):
    """Write the scraped compound rows to a UTF-8 CSV file with a header row.

    Parameters
    ----------
    data : iterable of row sequences
        Rows to write below the header.
    file_path : str
        Destination path; an existing file is overwritten.

    Any error raised while writing is reported with ``print`` instead of
    being propagated.
    """
    header = [
        'Indian Medicinal Plant',
        'Plant Part',
        'IMPPAT Phytochemical Identifier',
        'Phytochemical Name',
        'References',
        'Structural Formula',
        'Compound URL',
    ]
    try:
        # newline='' is required by the csv module; without it every row is
        # followed by an extra blank line on platforms that translate '\n'.
        with open(file_path, 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(data)
    except Exception as e:
        print(f'Following Error Occured During Writing the Data to File: {e}')

### Functions to Rename the .sdf Files with Their Compound Names

In [102]:
def get_rename_dict(data):
    """Map each compound identifier (column 2) to its name (column 3).

    Parameters
    ----------
    data : iterable of sequences of str
        Rows to read. Rows with fewer than four columns are skipped because
        they carry no identifier/name pair.

    Returns
    -------
    dict[str, str]
        ``{identifier: name}`` with surrounding whitespace stripped from
        both; a later row with the same identifier overrides an earlier one.
    """
    rename_dict = {}
    for molecule in data:
        if len(molecule) < 4:
            # Short rows would raise IndexError below, so skip them.
            continue
        identifier = molecule[2].strip()
        name = molecule[3].strip()
        rename_dict[identifier] = name
    return rename_dict

def rename_sdf_files(folder_path, rename_dict):
    """Rename ``<identifier>_3D.sdf`` files in a folder to ``<name>.sdf``.

    Parameters
    ----------
    folder_path : str
        Directory that contains the downloaded .sdf files.
    rename_dict : dict[str, str]
        Mapping of identifier (current file stem, without ``_3D.sdf``) to the
        new file stem.

    Path separators inside a new name are replaced with ``_`` so the file
    stays inside ``folder_path``. A file is left untouched, with a message,
    when the source is missing or when the target name is already taken, so
    no existing file is overwritten.
    """
    # Separators valid on this platform (os.altsep is None on POSIX).
    separators = [sep for sep in (os.sep, os.altsep) if sep]

    for identifier, compound_name in rename_dict.items():
        old_name = identifier + '_3D.sdf'

        # A separator in the name would be read as a sub-directory.
        safe_name = compound_name
        for sep in separators:
            safe_name = safe_name.replace(sep, '_')
        new_name = safe_name + '.sdf'

        old_path = os.path.join(folder_path, old_name)
        new_path = os.path.join(folder_path, new_name)

        if not os.path.exists(old_path):
            print(f'{old_name} Does not exists.')
        elif os.path.exists(new_path):
            # os.rename would silently replace the existing file on POSIX.
            print(f'{new_name} already exists; {old_name} left unchanged.')
        else:
            os.rename(old_path, new_path)
    return

In [104]:
# Species page to scrape; the URL path names the species Cheilocostus speciosus.
link = 'https://cb.imsc.res.in/imppat/phytochemical/Cheilocostus%20speciosus'
# Scrape every table row; this launches Chrome for the table page and once
# more for each compound link, so it is a long-running cell.
data = get_all_data_of_a_species(link)

  smiles_tag = soup.find("strong", text="SMILES:")


In [107]:
# Persist the scraped rows (with a header row) to a CSV file in the working directory.
save_data_of_all_compounds(data, 'phytochemicals_dataset.csv')

In [105]:
# Build the identifier -> compound-name map from the scraped rows.
rdict = get_rename_dict(data)
# Use the same directory get_single_molecule_data() downloads into
# (<working directory>/3d_sdf_files) instead of a machine-specific absolute path.
folder_path = os.path.join(os.getcwd(), '3d_sdf_files')
rename_sdf_files(folder_path, rdict)