## THIS CODE IS FOR EXTRACTING PRICES FROM COMMON SUPPLIERS
## ONLY AMBEED IS SUPPORTED AS OF 4 NOVEMBER 2024
This script is a work in progress.

## IMPORTS

In [1]:
# Built ins
import re
import time
import math
import urllib
from pathlib import Path
from pprint import pprint

# Data manipulation
import pandas as pd
import numpy as np

# Requests
from requests_html import HTMLSession, AsyncHTMLSession

# Old version
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Custom
from utils import canonicalize_smiles, smiles_to_inchi_key, smiles_to_inchi
from utils import remove_duplicate_inchi_keys
from utils import get_cid_from_inchi_key, get_vendor_list_from_cid, get_vendor_json
from utils import filter_vendor_objects
from utils import convert_str_list
from utils import remove_specific_vendors_from_dataframe
from utils import draw_molecules_to_grid_image

## Read in a file that has a CID column

In [2]:
# Load the curated compound library. Later cells read its CID column and the
# per-vendor link columns (e.g. 'Ambeed_link', 'Sigma-Aldrich_link').
df = pd.read_csv(Path('FINAL_LIBRARY_CURATED.csv'))

# Show the loaded table in the notebook output.
display(df)

Unnamed: 0,SMILES,INCHI,INCHI_KEY,CID,VENDORS,"VWR, Part of Avantor_link",Sigma-Aldrich_link,Thermo Fisher Scientific_link,TCI (Tokyo Chemical Industry)_link,Combi-Blocks_link,Ambeed_link
0,c1ccc(-c2cc3ccccc3[nH]2)cc1,InChI=1S/C14H11N/c1-2-6-11(7-3-1)14-10-12-8-4-...,KLLLJCACIRKBDT-UHFFFAOYSA-N,13698,"['Sigma-Aldrich', 'VWR, Part of Avantor', 'Amb...",https://us.vwr.com/store/product/7514896/2-phe...,https://www.sigmaaldrich.com/catalog/product/a...,https://www.thermofisher.com/order/catalog/pro...,http://www.tcichemicals.com/eshop/en/us/commod...,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/948-65-2.html
1,c1ccc2c(c1)Cn1c-2cc2ccccc21,InChI=1S/C15H11N/c1-3-7-13-12(6-1)10-16-14-8-4...,JJUQRPHMJPHFBH-UHFFFAOYSA-N,13207630,"['Combi-Blocks', 'Ambeed']",NONE,NONE,NONE,NONE,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/248-71-5.html
2,Cn1c(-c2ccccc2)cc2ccccc21,InChI=1S/C15H13N/c1-16-14-10-6-5-9-13(14)11-15...,SFWZZSXCWQTORH-UHFFFAOYSA-N,77095,"['TCI (Tokyo Chemical Industry)', 'Ambeed', 'S...",NONE,https://www.sigmaaldrich.com/catalog/product/a...,https://www.thermofisher.com/order/catalog/pro...,http://www.tcichemicals.com/eshop/en/us/commod...,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/3558-24-5.html
3,Cc1ccc2cc(-c3ccccc3)[nH]c2c1,InChI=1S/C15H13N/c1-11-7-8-13-10-15(16-14(13)9...,WHOVJSPCXWJPBL-UHFFFAOYSA-N,261513,"['Combi-Blocks', 'Ambeed']",NONE,NONE,NONE,NONE,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/66354-87-8.html
4,O=C(Cl)C(Cl)Cl,InChI=1S/C2HCl3O/c3-1(4)2(5)6/h1H,FBCCMZVIWNDFMO-UHFFFAOYSA-N,6593,"['Thermo Fisher Scientific', 'Thermo Fisher Sc...",https://us.vwr.com/store/product/7514087/dichl...,https://www.sigmaaldrich.com/catalog/product/a...,https://www.thermofisher.com/order/catalog/pro...,http://www.tcichemicals.com/eshop/en/us/commod...,NONE,NONE
5,Cc1ccc2[nH]c(-c3ccccc3)cc2c1,InChI=1S/C15H13N/c1-11-7-8-14-13(9-11)10-15(16...,JPFTUUXPCFNLIX-UHFFFAOYSA-N,83247,"['Ambeed', 'Sigma-Aldrich']",NONE,https://www.sigmaaldrich.com/catalog/product/a...,NONE,NONE,NONE,https://www.ambeed.com/products/13228-36-9.html


## Functions

In [3]:
def convert_mass_units(value: str, to: str = 'g') -> str:
    '''
    Converts a mass string such as '250mg' into a gram string such as '0.25g'.

    The unit is taken to be every alphabetic character in ``value`` and the
    magnitude every digit or '.' character, each kept in its original order.

    Parameters
    ----------
    value : str
        Mass with a unit suffix (g, mg, ug, ng, kg, lb or lbs).
    to : str
        Destination unit. Must be one of the accepted unit names, but only
        'g' is implemented.

    Returns
    -------
    str
        The mass in grams formatted as ``f'{float_value}g'``.

    Raises
    ------
    ValueError
        If ``to`` is not an accepted unit, if the unit found in ``value`` is
        not recognised, or if the numeric part cannot be parsed as a float.
    NotImplementedError
        If ``to`` is an accepted unit other than 'g'.
    '''
    accepted_targets = ('ng', 'ug', 'mg', 'g', 'kg', 'lbs', 'lb')
    if to not in accepted_targets:
        raise ValueError(f'{to} is not an accepted destination unit.')
    if to != 'g':
        raise NotImplementedError(f'Conversion to {to} is not implemented.')

    # Grams per one of each supported source unit
    grams_per_unit = {
        'g': 1,
        'mg': 0.001,
        'ug': 0.000001,
        'ng': 0.000000001,
        'kg': 1000,
        'lb': 453.592,
        'lbs': 453.592,
    }

    # Split the input into its unit letters and its numeric characters
    letters = []
    numeric_chars = []
    for ch in value:
        if ch.isalpha():
            letters.append(ch)
        if ch.isdigit() or ch == '.':
            numeric_chars.append(ch)
    unit = ''.join(letters)

    # The unit is validated before the number is parsed
    if unit not in grams_per_unit:
        raise ValueError(f'Could not convert {unit} to grams.')

    amount = float(''.join(numeric_chars))
    return f'{amount * grams_per_unit[unit]}g'

def parse_ambeed_table(raw_table: str) -> list[list[str]]:
    '''
    Parses a raw ambeed table string into a list of entries, where each
    entry is a list of strings of the form [quantity, price],
    e.g. [['1g', '$10.00'], ['5g', '$40.00']].

    The text is split on whitespace and scanned token by token:

    * a token that is a number immediately followed by ng, ug, mg, g or kg
      (e.g. '250mg') is collected as a quantity;
    * a token containing '$' is collected as a price and closes the
      current entry;
    * every other token (headers, purity, stock text, ...) is ignored.

    A price token that is not preceded by a quantity therefore yields a
    one-element entry (e.g. ['$8.00']).

    Parameters
    ----------
    raw_table : str
        Text of the pricing table.

    Returns
    -------
    list[list[str]]
        One list per price token found, in order of appearance.
    '''
    # A quantity must be a number directly followed by a mass unit. A plain
    # substring test such as `'g' in token` would also accept ordinary words
    # that merely contain the letter g and corrupt the entry.
    quantity_pattern = re.compile(r'(?:\d+(?:\.\d*)?|\.\d+)(?:ng|ug|mg|kg|g)')

    results = []
    entry_list = []
    # str.split() with no arguments strips and splits on any whitespace run
    for token in raw_table.split():
        if quantity_pattern.fullmatch(token):
            entry_list.append(token)
        elif '$' in token:
            entry_list.append(token)
            results.append(entry_list)
            entry_list = []
    return results


def get_price_from_ambeed(url: str) -> str:
    '''
    Price from ambeed (old version, Selenium based).

    Opens ``url`` in a headless Chrome instance, collects the text of every
    <tbody> element and returns the first one that contains '%' or
    'Inquiry', with all whitespace runs collapsed to single spaces.

    Parameters
    ----------
    url : str
        Ambeed product page to load.

    Returns
    -------
    str
        The whitespace-normalised text of the first matching table body, or
        an empty string when no table body matches.
    '''
    # Define the Chrome webdriver options
    options = webdriver.ChromeOptions()
    options.add_argument("--headless") # Set the Chrome webdriver to run in headless mode for scalability

    # By default, Selenium waits for all resources to download before taking actions.
    # However, we don't need it as the page is populated with dynamically generated JavaScript code.
    options.page_load_strategy = "none"

    # Pass the defined options objects to initialize the web driver
    driver = Chrome(options=options)
    try:
        # Set an implicit wait of 5 seconds to allow time for elements to appear before throwing an exception
        driver.implicitly_wait(5)

        driver.get(url)

        # Fixed pause before the table bodies are read
        time.sleep(1.5)

        table = driver.find_elements(By.TAG_NAME, "tbody")
        # Extract the text while the browser is still alive
        elements = [x.text for x in table]
    finally:
        # Always shut the browser down so repeated calls do not leak Chrome processes
        driver.quit()

    print(elements)
    elements = [re.sub(r'\s+', ' ', x.strip()) for x in elements if '%' in x or 'Inquiry' in x]

    # Return '' instead of raising IndexError when nothing matched
    return elements[0] if elements else ''

async def get_price_table_from_ambeed_using_requestshtml(link: str,
                                                   session: AsyncHTMLSession,
                                                   sleep_time: int = 7) -> str:
    '''
    Fetches ``link`` with the given async session, renders the page and
    returns the full text of the first <tbody> element that contains '$'
    or 'Inquiry'.

    Parameters
    ----------
    link : str
        Page to request.
    session : AsyncHTMLSession
        Session used to perform the GET request.
    sleep_time : int
        Value passed as ``sleep`` to ``arender``.

    Returns
    -------
    str
        Text of the first matching table body, or '' if none matches.
    '''
    response = await session.get(link)
    await response.html.arender(sleep=sleep_time)

    # Stop at the first table body that looks like a price table
    for tbody in response.html.find('tbody'):
        content = tbody.full_text
        if '$' in content or 'Inquiry' in content:
            return content

    return ''

## Get Ambeed Prices

In [7]:
# Define a directory in which to store price data
price_storage_dir = Path('./results/ambeed_prices/')

# Make the directory
price_storage_dir.mkdir(exist_ok=True)

# Begin Async session
session = AsyncHTMLSession()

for i, row in df.iterrows():
    if i % 10 == 0:
        print(f'[INFO] Working on index {i} of {df.shape[0]}')
    # Define a record entry file
    record_entry = price_storage_dir / f'{row["CID"]}.txt'

    # If it exists, read it in
    if record_entry.exists():
        print(f'[INFO] Found {record_entry.name} for Ambeed pricing.')
        with open(record_entry, 'r') as infile:
            data = infile.read()
    else:
        link = row['Ambeed_link']
        if link == 'NONE':
            print(f'Skipping compound {i} because no ambeed link.')
            continue
        else:
            data = await get_price_table_from_ambeed_using_requestshtml(link=link, session=session, sleep_time=0.1)

        with open(record_entry, 'w') as outfile:
            outfile.write(str(data))

display(df)


[INFO] Working on index 0 of 6
Found 13698.txt for Ambeed pricing.
Found 13207630.txt for Ambeed pricing.
Found 77095.txt for Ambeed pricing.
Found 261513.txt for Ambeed pricing.
a
Skipping compound 4 because no ambeed link.
Found 83247.txt for Ambeed pricing.


Unnamed: 0,SMILES,INCHI,INCHI_KEY,CID,VENDORS,"VWR, Part of Avantor_link",Sigma-Aldrich_link,Thermo Fisher Scientific_link,TCI (Tokyo Chemical Industry)_link,Combi-Blocks_link,Ambeed_link
0,c1ccc(-c2cc3ccccc3[nH]2)cc1,InChI=1S/C14H11N/c1-2-6-11(7-3-1)14-10-12-8-4-...,KLLLJCACIRKBDT-UHFFFAOYSA-N,13698,"['Sigma-Aldrich', 'VWR, Part of Avantor', 'Amb...",https://us.vwr.com/store/product/7514896/2-phe...,https://www.sigmaaldrich.com/catalog/product/a...,https://www.thermofisher.com/order/catalog/pro...,http://www.tcichemicals.com/eshop/en/us/commod...,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/948-65-2.html
1,c1ccc2c(c1)Cn1c-2cc2ccccc21,InChI=1S/C15H11N/c1-3-7-13-12(6-1)10-16-14-8-4...,JJUQRPHMJPHFBH-UHFFFAOYSA-N,13207630,"['Combi-Blocks', 'Ambeed']",NONE,NONE,NONE,NONE,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/248-71-5.html
2,Cn1c(-c2ccccc2)cc2ccccc21,InChI=1S/C15H13N/c1-16-14-10-6-5-9-13(14)11-15...,SFWZZSXCWQTORH-UHFFFAOYSA-N,77095,"['TCI (Tokyo Chemical Industry)', 'Ambeed', 'S...",NONE,https://www.sigmaaldrich.com/catalog/product/a...,https://www.thermofisher.com/order/catalog/pro...,http://www.tcichemicals.com/eshop/en/us/commod...,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/3558-24-5.html
3,Cc1ccc2cc(-c3ccccc3)[nH]c2c1,InChI=1S/C15H13N/c1-11-7-8-13-10-15(16-14(13)9...,WHOVJSPCXWJPBL-UHFFFAOYSA-N,261513,"['Combi-Blocks', 'Ambeed']",NONE,NONE,NONE,NONE,https://www.combi-blocks.com/cgi-bin/find.cgi?...,https://www.ambeed.com/products/66354-87-8.html
4,O=C(Cl)C(Cl)Cl,InChI=1S/C2HCl3O/c3-1(4)2(5)6/h1H,FBCCMZVIWNDFMO-UHFFFAOYSA-N,6593,"['Thermo Fisher Scientific', 'Thermo Fisher Sc...",https://us.vwr.com/store/product/7514087/dichl...,https://www.sigmaaldrich.com/catalog/product/a...,https://www.thermofisher.com/order/catalog/pro...,http://www.tcichemicals.com/eshop/en/us/commod...,NONE,NONE
5,Cc1ccc2[nH]c(-c3ccccc3)cc2c1,InChI=1S/C15H13N/c1-11-7-8-14-13(9-11)10-15(16...,JPFTUUXPCFNLIX-UHFFFAOYSA-N,83247,"['Ambeed', 'Sigma-Aldrich']",NONE,https://www.sigmaaldrich.com/catalog/product/a...,NONE,NONE,NONE,https://www.ambeed.com/products/13228-36-9.html


## Convert Ambeed Raw Table Data to Prices

In [None]:
# Make sure the price column exists even when no compound ends up with a
# price, so code that reads row['Ambeed_price_per_gram'] does not hit a KeyError.
if 'Ambeed_price_per_gram' not in df.columns:
    df['Ambeed_price_per_gram'] = np.nan

# For every compound with a stored Ambeed record, parse the raw table and
# keep the lowest price per gram over all pack sizes.
for i, row in df.iterrows():
    record_entry = price_storage_dir / f'{row["CID"]}.txt'

    if not record_entry.exists():
        continue

    with open(record_entry, 'r') as infile:
        data = infile.read()

    if data == '':
        print(f'[WARNING] Empty data for {record_entry.name}')
        continue

    # Get data as a list of lists
    data = parse_ambeed_table(data)

    # Convert everything to price per gram
    ppg_data = []
    min_ppg_value = np.inf
    min_ppg_entry = None
    for entry in data:

        try:
            entry[0] = convert_mass_units(entry[0], to='g')

            # The converted mass is '<float>g'. Parse the float itself rather
            # than keeping only digits and '.', which would turn a value in
            # scientific notation such as '1e-06g' into 106.0.
            grams = float(entry[0].removesuffix('g'))
            price = float(''.join([x for x in entry[1] if x.isdigit() or x == '.']))
            price_per_gram = price / grams
        except (ValueError, ZeroDivisionError, IndexError) as e:
            # Malformed quantity or price, zero mass, or an entry without a
            # price: skip this entry and keep going with the others.
            print(f'[WARNING] Could not parse data for {record_entry.name} because {e}')
            continue

        # Add the PPG to the new data for the particular molecule
        ppg_data.append([entry[0], price_per_gram])

        # If it is the lowest ppg, replace the current ppg
        if price_per_gram < min_ppg_value:
            min_ppg_value = price_per_gram
            min_ppg_entry = entry

    # Get the minimum price per gram
    if not ppg_data:
        print(f'[WARNING] No price data for {record_entry.name}')
        continue

    # min_ppg_value already holds the minimum tracked inside the loop
    df.loc[df['CID'] == row['CID'], 'Ambeed_price_per_gram'] = min_ppg_value

    #print(f'{row["CID"]}\t${min_ppg_value}/g\t(purchase unit of {min_ppg_entry})')

display(df)

df.to_csv('./test_prices.csv', index=False)



## Sigma-Aldrich Pricing

In [None]:
def get_price_table_from_sigma_aldrich(link: str,
                                             session: AsyncHTMLSession,
                                             sleep_time: int = 7) -> str:
    '''
    Loads a Sigma-Aldrich product page in headless Chrome and returns the
    string form of every <span class="MuiChip-label"> element found on it.

    As a debugging aid the prettified page source is also written to
    'tmp.txt' in the current working directory (overwritten on every call).

    Parameters
    ----------
    link : str
        Product page to load.
    session : AsyncHTMLSession
        Not used by this implementation; kept so existing callers that pass
        it keep working.
    sleep_time : int
        Seconds to wait after requesting the page before reading its source.

    Returns
    -------
    str
        ``str()`` of the list of matching elements, or '' when none is found.
    '''
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

    # Define the Chrome webdriver options
    options = webdriver.ChromeOptions()
    options.add_argument("--headless") # Set the Chrome webdriver to run in headless mode for scalability
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("user-agent=" + user_agent)

    # By default, Selenium waits for all resources to download before taking actions.
    # However, we don't need it as the page is populated with dynamically generated JavaScript code.
    options.page_load_strategy = "none"

    # Pass the defined options objects to initialize the web driver
    driver = Chrome(options=options)
    try:
        # Set an implicit wait of 5 seconds to allow time for elements to appear before throwing an exception
        driver.implicitly_wait(5)

        # Apply the same user agent through the DevTools protocol as well
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        driver.get(link)

        time.sleep(sleep_time)

        source = driver.page_source
    finally:
        # Always shut the browser down so repeated calls do not leak Chrome processes
        driver.quit()

    parsed = BeautifulSoup(source, 'html.parser')

    # Debug dump of the page as it was parsed
    with open('tmp.txt', 'w', encoding='utf-8') as outfile:
        outfile.write(parsed.prettify())

    units = parsed.find_all('span', {'class': 'MuiChip-label'})

    if not units:
        # Report the link itself; the function must not depend on a loop
        # variable defined in the calling cell.
        print(f'[WARNING] Product link {link} failed.')
        return ''

    return str(units)




# NOTE: this rebinds price_storage_dir, which the Ambeed cells also use;
# re-run the Ambeed cells from their own directory definition if needed.
price_storage_dir = Path('./results/sigma_prices/')

# Create the cache directory (and ./results) so the first write cannot fail
# with FileNotFoundError.
price_storage_dir.mkdir(parents=True, exist_ok=True)

for i, row in df.iterrows():

    # Skip the rows that have prices from Ambeed. Series.get + pd.isna treats
    # a missing column or a missing value as "no Ambeed price" instead of
    # raising KeyError/TypeError.
    if not pd.isna(row.get('Ambeed_price_per_gram', np.nan)):
        continue

    record_entry = price_storage_dir / f'{row["CID"]}.txt'

    # Add or read in record data for this entry
    if record_entry.exists():
        print(f'[INFO] Found {record_entry.name} for Sigma-Aldrich pricing.')
        with open(record_entry, 'r') as infile:
            data = infile.read()
        if data == '':
            print(f'[WARNING] Sigma-Aldrich record for {row["CID"]} was empty.')
    else:
        link = row['Sigma-Aldrich_link']
        if link == 'NONE':
            print(f'[INFO] Skipping {i} because no Sigma-Aldrich link.')
            continue
        else:
            print(f'[INFO] Getting data for {link}. CID: {row["CID"]}')
            data = get_price_table_from_sigma_aldrich(link=link,
                                                session=session,
                                                sleep_time=8)

        # NOTE: an empty result is cached too, so a failed fetch is not
        # retried until the record file is deleted.
        with open(record_entry, 'w') as outfile:
            outfile.write(str(data))

    # Do calculations on price data

