In [40]:
import requests
import json

In [114]:
prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
def get_cid_by_smiles(sm, prolog = "https://pubchem.ncbi.nlm.nih.gov/rest/pug", timeout = 30):
    """
    Look up the PubChem compound ID (CID) for a SMILES string via PUG REST.

    Parameters
    ----------
    sm : str
        SMILES string; sent as the ``smiles`` query parameter so that
        characters such as ``#`` or ``=`` are URL-encoded by requests.
    prolog : str
        Base URL of the PUG REST service.
    timeout : float
        Seconds to wait for the server before requests raises a timeout
        error (without it a stalled connection would hang the notebook).

    Returns
    -------
    str
        The response body with surrounding whitespace removed. The body is
        returned as-is otherwise, so an error message from the service is
        passed through to the caller rather than raised.
    """
    url = prolog + "/compound/smiles/cids/txt"
    res = requests.get(url, params = {"smiles": sm}, timeout = timeout)
    # The plain-text endpoint terminates its answer with a newline; strip it
    # so the CID can be printed or embedded in a URL directly.
    return res.text.strip()

In [115]:
# Smoke test: look up the PubChem CID for one sample SMILES string.
sm_1 = 'CCC(O)OC(CC)O'
cid = get_cid_by_smiles(sm_1, prolog)
# `cid` stays in the kernel namespace and is reused by the full-record test cell below.
print(cid)

6537503



In [32]:
def get_full_data_by_cid(cid, prolog = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/", form = "/JSON", timeout = 30):
    """
    Download the full PUG View record for a compound as raw response text.

    Parameters
    ----------
    cid : str or int
        PubChem compound ID. Surrounding whitespace (for example the newline
        left by a plain-text CID lookup) is removed before building the URL.
    prolog : str
        URL prefix that the CID is appended to.
    form : str
        URL suffix selecting the output format.
    timeout : float
        Seconds to wait for the server before requests raises a timeout
        error instead of blocking indefinitely.

    Returns
    -------
    str
        The unparsed response body.
    """
    url = "".join([prolog, str(cid).strip(), form])
    res = requests.get(url, timeout = timeout)
    return res.text
    

In [36]:
# Smoke test: fetch the full record for whatever `cid` is currently in the kernel.
# NOTE(review): the saved output below appears to belong to a different CID than the
# one printed by the lookup cell above (execution counts are out of order) - re-run
# the notebook top to bottom to confirm.
full_data = get_full_data_by_cid(cid)
# Prints the entire raw JSON text of the record.
print(full_data)

{
  "Record": {
    "RecordType": "CID",
    "RecordNumber": 5564,
    "RecordTitle": "Triclosan",
    "Section": [
      {
        "TOCHeading": "Structures",
        "Description": "Structure depictions of this compound, including computationally generated two-dimensional (2D) and three-dimensional (3D) structures, as well as experimentally determined 3D single-crystal structures.",
        "Section": [
          {
            "TOCHeading": "2D Structure",
            "Description": "A two-dimensional (2D) structure representation of the compound.  Because this structure is processed through chemical structure standardization (Hähnke et al., J. Cheminform. 2018, 10, 36), it is not necessarily the same as the structures provided by individual data contributors.  ",
            "URL": "https://doi.org/10.1186/s13321-018-0293-8",
            "DisplayControls": {
              "MoveToTop": true
            },
            "Information": [
              {
                "ReferenceNumber":

In [41]:
# Parse the raw JSON text into nested Python dicts/lists so sections can be navigated by key.
full_data_json = json.loads(full_data)

In [44]:
# Print the parsed record for a quick visual check of its structure.
print(full_data_json)



In [48]:
# Display the list of top-level sections; each one is a dict carrying a "TOCHeading".
full_data_json["Record"]["Section"]

[{'TOCHeading': 'Structures',
  'Description': 'Structure depictions of this compound, including computationally generated two-dimensional (2D) and three-dimensional (3D) structures, as well as experimentally determined 3D single-crystal structures.',
  'Section': [{'TOCHeading': '2D Structure',
    'Description': 'A two-dimensional (2D) structure representation of the compound.  Because this structure is processed through chemical structure standardization (Hähnke et al., J. Cheminform. 2018, 10, 36), it is not necessarily the same as the structures provided by individual data contributors.  ',
    'URL': 'https://doi.org/10.1186/s13321-018-0293-8',
    'DisplayControls': {'MoveToTop': True},
    'Information': [{'ReferenceNumber': 111, 'Value': {'Boolean': [True]}}]},
   {'TOCHeading': '3D Conformer',
    'Description': 'A three-dimensional (3D) structure representation of the compound.  This 3D structure is not experimentally determined, but computed by PubChem.  This structure may 

In [50]:
# Walk the top-level sections and print the one titled "Chemical Safety".
for item in full_data_json["Record"]["Section"]:
    if item["TOCHeading"] != "Chemical Safety":
        continue
    print(item)

{'TOCHeading': 'Chemical Safety', 'Description': 'Link to the Safety and Hazard section of this page and link to the Laboratory Chemical Safety Summary (LCSS) datasheet for this compound.', 'DisplayControls': {'HideThisSection': True, 'MoveToTop': True}, 'Information': [{'ReferenceNumber': 111, 'Name': 'Chemical Safety', 'Value': {'StringWithMarkup': [{'String': '          ', 'Markup': [{'Start': 0, 'Length': 1, 'URL': 'https://pubchem.ncbi.nlm.nih.gov/images/ghs/GHS07.svg', 'Type': 'Icon', 'Extra': 'Irritant'}, {'Start': 1, 'Length': 1, 'URL': 'https://pubchem.ncbi.nlm.nih.gov/images/ghs/GHS09.svg', 'Type': 'Icon', 'Extra': 'Environmental Hazard'}]}]}}]}


In [177]:
def get_content_by_sec_head(full_data_json, sec_head = "Chemical Safety"):
    """
    Return the top-level record section whose "TOCHeading" equals `sec_head`.

    Parameters
    ----------
    full_data_json : dict
        Parsed PUG View response; sections are read from
        ``full_data_json["Record"]["Section"]``.
    sec_head : str
        Heading of the section to return.

    Returns
    -------
    dict or int
        The first matching section dict, or -1 when the input has no
        "Record"/"Section" data or no section carries that heading.
    """
    if not isinstance(full_data_json, dict):
        return -1
    record = full_data_json.get("Record")
    if not isinstance(record, dict):
        return -1
    for item in record.get("Section") or []:
        # A section without a heading is skipped instead of ending the search,
        # so a matching section later in the list is still found.
        if isinstance(item, dict) and item.get("TOCHeading") == sec_head:
            return item
    return -1

In [178]:
def get_safety_kw_from_safety_section(safety_section):
    """
    Extract the hazard keywords from a "Chemical Safety" section.

    The keywords are the "Extra" values of the markup entries found at
    ``safety_section["Information"][0]["Value"]["StringWithMarkup"][0]["Markup"]``.

    Parameters
    ----------
    safety_section : dict or int
        Section dict, or the -1 sentinel meaning "section not found".

    Returns
    -------
    list of str or int
        The keywords in the order they appear, or -1 when the section is
        missing or any level of the expected structure is absent.
    """
    if not isinstance(safety_section, dict):
        return -1

    safety_info_list = safety_section.get("Information")
    if not safety_info_list:
        return -1

    all_safety_value_dict = safety_info_list[0].get("Value")
    if not isinstance(all_safety_value_dict, dict):
        return -1

    marked_up_strings = all_safety_value_dict.get("StringWithMarkup")
    if not marked_up_strings:
        return -1

    markup = marked_up_strings[0].get("Markup")
    if markup is None:
        # A string without any markup entries carries no hazard keywords.
        return -1

    # Entries without an "Extra" label are skipped rather than raising.
    return [kw_dict["Extra"] for kw_dict in markup if "Extra" in kw_dict]

        
    

In [179]:
# End-to-end check of the lookup chain on a single known compound.
sm = "CC1=CC=CC=C1" #toluene
# Step 1: SMILES -> CID.
cid = get_cid_by_smiles(sm)
print(f"cid:{cid}")
# Step 2: CID -> raw record text -> parsed JSON.
full_data = get_full_data_by_cid(cid)
full_data_json = json.loads(full_data)
# Step 3: pick out the "Chemical Safety" section (or -1 if it is absent).
safety_section = get_content_by_sec_head(full_data_json, sec_head = "Chemical Safety")
# print(safety_section)
# print("\n")
# Step 4: reduce the section to its list of hazard keywords.
safety_kw_list = get_safety_kw_from_safety_section(safety_section)
print(safety_kw_list)

cid:1140

['Flammable', 'Irritant', 'Health Hazard']


In [180]:
def get_safety_kw_from_sm(sm):
    """
    Return the PubChem hazard keywords for a compound given as SMILES.

    Chains the helpers defined above: SMILES -> CID -> full record ->
    "Chemical Safety" section -> keyword list.

    Parameters
    ----------
    sm : str
        SMILES string of the compound.

    Returns
    -------
    list of str or int
        The hazard keywords, or -1 when the record cannot be parsed or
        holds no usable "Chemical Safety" section.
    """
    cid = get_cid_by_smiles(sm)
    full_data = get_full_data_by_cid(cid)
    try:
        full_data_json = json.loads(full_data)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. A body that is not
        # JSON is reported with the same -1 sentinel as "no data", so one bad
        # response does not abort a batch run.
        return -1
    if not isinstance(full_data_json, dict):
        return -1
    safety_section = get_content_by_sec_head(full_data_json, sec_head = "Chemical Safety")
    return get_safety_kw_from_safety_section(safety_section)
    

In [181]:
import pandas as pd
import time

In [182]:
# Earlier data source, kept for reference:
# solv_db = pd.read_json("db_solv_pred_v2.json")
# db_dict = solv_db.to_dict("records")
# db_dict[0]

# Load the solvent table; cells holding "-1" are read as missing values (NaN).
solv_db = pd.read_excel("db_mis.xlsx", na_values = "-1")
# One dict per row, so each record can be annotated in place by the batch loop.
db_dict = solv_db.to_dict("records")
# Show the first record to check the column names.
db_dict[0]

{'No.': 1,
 'CAS': '75-07-0',
 'Name': 'Acetaldehyde',
 'D': 14.7,
 'P': 12.5,
 'H': 7.9,
 'Mole_vol': '56.6',
 'ims_idx': nan,
 'bp': 20.8,
 'mw': 44.05,
 'viscosity': 0.21,
 'vis_temp': 20.0,
 'heat_of_vap': 25.73,
 'hov_temp': 20.2,
 'SMILES': 'O=CC',
 'alias': nan,
 'synonyms': 'acetic aldehyde;ethyl aldehyde',
 'Note': nan}

In [121]:
#test for time gap
# time_count = 0
# for i in range(20):
#     print(i)
#     time_count += 1
#     if time_count % 4 == 0:
#         print("sleep")
#         time.sleep(1)

0
1
2
3
sleep
4
5
6
7
sleep
8
9
10
11
sleep
12
13
14
15
sleep
16
17
18
19
sleep


In [183]:
# Annotate every solvent record in place with its PubChem hazard keywords,
# pausing for one second after every fourth record to throttle the requests.
time_count = 0

for time_count, entry in enumerate(db_dict, start = 1):
    print(entry["No."])
    print(entry["Name"])
    entry_sm = entry["SMILES"]
    # Only real strings are looked up; anything else (presumably NaN from a
    # blank spreadsheet cell - TODO confirm) is recorded as -1.
    if type(entry_sm) is not str:
        entry["safety_kw"] = -1
    else:
        safety_kw_list = get_safety_kw_from_sm(entry_sm)
        entry["safety_kw"] = safety_kw_list
    if time_count % 4 == 0:
        time.sleep(1)
    print(entry["safety_kw"])


# Rebuild a table from the annotated records and preview it.
db_with_safe = pd.DataFrame.from_dict(data = db_dict)
db_with_safe.head()
        

1
Acetaldehyde
['Flammable', 'Irritant', 'Health Hazard']
2
Acetic acid
['Flammable', 'Corrosive']
3
Acetic anhydride
['Flammable', 'Corrosive', 'Irritant']
4
Acetone
['Flammable', 'Irritant']
5
Acetonitrile
['Flammable', 'Irritant']
6
Acetophenone
['Irritant']
7
Acrylonitrile
['Flammable', 'Corrosive', 'Acute Toxic', 'Health Hazard', 'Environmental Hazard']
8
Allyl alcohol
['Flammable', 'Acute Toxic', 'Environmental Hazard']
9
Amyl acetate
['Flammable']
10
Aniline
['Corrosive', 'Acute Toxic', 'Health Hazard', 'Environmental Hazard']
11
Anisole
['Flammable', 'Irritant']
12
Benzaldehyde
['Irritant']
13
Benzene
['Flammable', 'Irritant', 'Health Hazard']
14
1,3-Benzenediol
['Irritant', 'Environmental Hazard']
15
Benzoic acid
['Corrosive', 'Health Hazard']
16
Benzonitrile
['Irritant']
17
Benzyl alcohol
['Irritant']
18
Benzyl butyl phthalate
['Health Hazard', 'Environmental Hazard']
19
Benzyl chloride
['Corrosive', 'Acute Toxic', 'Health Hazard']
20
Biphenyl
['Irritant', 'Environmental Haza

['Health Hazard']
162
Methylene diiodide
['Corrosive', 'Irritant']
163
Methyl ethyl ketone
['Flammable', 'Irritant']
164
Methyl isoamyl ketone
['Flammable', 'Irritant']
165
Methyl isobutyl carbinol
['Flammable', 'Irritant']
166
Methyl isobutyl ketone
['Flammable', 'Irritant', 'Health Hazard']
167
Methyl methacrylate
['Flammable', 'Irritant']
168
1-Methylnaphthalene
['Irritant', 'Health Hazard', 'Environmental Hazard']
169
Methyl oleate
-1
170
2-Methyl-1-propanol(repeated)
['Flammable', 'Corrosive', 'Irritant']
171
Methyl-2-pyrrolidone
['Irritant', 'Health Hazard']
172
Methyl salicylate
['Irritant', 'Health Hazard']
173
Morpholine
['Flammable', 'Corrosive', 'Irritant']
174
Naphtha,high-flash
-1
175
Naphthalene
['Irritant', 'Health Hazard', 'Environmental Hazard']
176
Nitrobenzene
['Acute Toxic', 'Health Hazard']
177
Nitroethane
['Flammable', 'Irritant']
178
Nitromethane
['Flammable', 'Irritant']
179
1-Nitropropane
['Flammable', 'Irritant']
180
2-Nitropropane
['Flammable', 'Irritant', 'H

Unnamed: 0,No.,CAS,Name,D,P,H,Mole_vol,ims_idx,bp,mw,viscosity,vis_temp,heat_of_vap,hov_temp,SMILES,alias,synonyms,Note,safety_kw
0,1,75-07-0,Acetaldehyde,14.7,12.5,7.9,56.6,,20.8,44.05,0.21,20.0,25.73,20.2,O=CC,,acetic aldehyde;ethyl aldehyde,,"[Flammable, Irritant, Health Hazard]"
1,2,64-19-7,Acetic acid,14.5,8.0,13.5,57.1,139;190;39,117.0,60.05,1.056,25.0,23.7,117.9,CC(=O)O,,ethanoic acid;Ethylic acid;Vinegar acid,,"[Flammable, Corrosive]"
2,3,108-24-7,Acetic anhydride,16.0,11.7,10.2,94.5,40,139.0,102.09,0.843,25.0,38.2,139.5,CC(=O)OC(=O)C,,Acetyl acetate;Acetanhydride;Ethanoic anhydride,,"[Flammable, Corrosive, Irritant]"
3,4,67-64-1,Acetone,15.5,10.4,7.0,74.0,,56.2,58.08,0.32,20.0,29.1,56.05,CC(=O)C,,2-propanone,,"[Flammable, Irritant]"
4,5,75-05-8,Acetonitrile,15.3,18.0,6.1,52.6,139;47;136;240;190,81.6,41.05,0.35,20.0,29.81,80.0,CC#N,ACN,Methyl cyanide;Cyanomethane,,"[Flammable, Irritant]"


In [184]:
# Persist the annotated table. The list-valued safety_kw column is stored as text in
# the CSV, which is why the reload section further down parses it back from a string.
db_with_safe.to_csv("db_with_safe_info.csv", index = None)

In [176]:
# #debug for missing information 

# test_sm = db_dict[96]["SMILES"]

# cid = get_cid_by_smiles(test_sm)
# print(f"cid:{cid}")

# # get_safety_kw_from_sm(test_sm)
# full_data = get_full_data_by_cid(cid)
# full_data_json = json.loads(full_data)

# # print(full_data_json["Record"]["Section"])

# sec_head = "Chemical Safety"

# if "Record" in full_data_json.keys():
#     found_info = 0
#     for item in full_data_json["Record"]["Section"]:
            
#         if "TOCHeading" in item.keys():
# #             print(item["TOCHeading"])
#             if item["TOCHeading"] == sec_head:
#                 print("found")
#                 print(item)
#                 found_info = 1
#         else:
#             print (-1)
#     if found_info == 0:
#         print (-1)
# else:
#     print (-1)


# safety_section = get_content_by_sec_head(full_data_json, sec_head = "Chemical Safety")

# # print(safety_section)
# # # print("\n")
# # safety_kw_list = get_safety_kw_from_safety_section(safety_section)
# # print(safety_kw_list)

cid:6537503

-1


# load db with safety_kw, prepare candidate subset with safety classification

In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
# Reload the annotated table; "-1" entries (no safety information found) become NaN.
db_safe = pd.read_csv("db_with_safe_info.csv", na_values = "-1")

In [3]:
# Also keep a JSON copy of the table, one object per row.
db_safe.to_json("db_with_safe_info.json", orient = "records")

In [4]:
# One dict per row; the cells below modify these records in place.
db_safe_dict = db_safe.to_dict("records")

In [5]:
# Convert each record's safety_kw from its CSV text form ("['A', 'B C']") into a
# list of space-free keywords (['A', 'BC']) and collect the distinct keywords seen.
safety_kw_list = []
for entry in db_safe_dict:
    this_safety_kw = entry["safety_kw"]
    if isinstance(this_safety_kw, str):
        # Drop the surrounding brackets and the quotes, then split on commas.
        raw_kw_items = this_safety_kw[1:-1].replace("'", "").split(",")
    elif isinstance(this_safety_kw, list):
        # Already converted by an earlier run of this cell; reuse the list so the
        # cell can be re-executed without failing.
        raw_kw_items = this_safety_kw
    else:
        # Missing value (NaN): leave the record untouched. Testing the type avoids
        # an identity comparison against a NaN object, which only succeeds when the
        # value happens to be that exact object.
        continue

    update_safety_kw_format = []
    for kw in raw_kw_items:
        kw = kw.replace(" ", "")
        if not kw:
            # An empty list ("[]") splits into a single empty string; skip it.
            continue
        if kw not in safety_kw_list:
            safety_kw_list.append(kw)
        update_safety_kw_format.append(kw)
    entry["safety_kw"] = update_safety_kw_format

print(safety_kw_list)

['Flammable', 'Irritant', 'HealthHazard', 'Corrosive', 'AcuteToxic', 'EnvironmentalHazard', 'CompressedGas']


In [6]:
# Files whose names end in _reform hold the database after the safety keywords were reformatted: spaces are removed so that each keyword matches an entry in the full keyword list.

In [7]:
# Rebuild a table from the reformatted records and save it as CSV and JSON.
db_safe_update = pd.DataFrame(data = db_safe_dict)
db_safe_update.to_csv("db_with_safe_info_reform.csv", index = None)
db_safe_update.to_json("db_with_safe_info_reform.json", orient = "records")

In [8]:
# Show the distinct keywords again; these are the values accepted in a concern list.
print(safety_kw_list)

['Flammable', 'Irritant', 'HealthHazard', 'Corrosive', 'AcuteToxic', 'EnvironmentalHazard', 'CompressedGas']


In [9]:
# Prepare the candidate subset by excluding every entry that carries a safety keyword we want to avoid.

In [25]:
def get_cand_without_safety_concern(full_data_dict = None, concern = ("AcuteToxic", "EnvironmentalHazard")):
    """
    Classify records by safety concern and collect those without one.

    Every record is updated in place with two keys:

    * ``"is_concern"``: the string "True" or "False", or -1 when the record
      has no safety keywords.
    * ``"concern_type"``: list of the record's keywords found in `concern`,
      or -1 when there are none or the keywords are missing.

    Parameters
    ----------
    full_data_dict : list of dict, optional
        Records with a "safety_kw" entry that is either a list of keyword
        strings or a missing value (None / NaN). Defaults to the
        notebook-level ``db_safe_dict``, looked up when the function is called.
    concern : collection of str
        Keywords that mark a record as a safety concern.

    Returns
    -------
    tuple
        ``(safe_subset, full_data_dict)``. ``safe_subset`` lists the records
        with no matching keyword; records with missing safety keywords are
        included in it as well. ``full_data_dict`` is the same, now
        annotated, input object.
    """
    if full_data_dict is None:
        # Resolve the default at call time, so defining the function does not
        # require db_safe_dict to exist yet.
        full_data_dict = db_safe_dict

    safe_subset = []
    for entry in full_data_dict:
        this_safety_kw = entry["safety_kw"]

        # NaN is the only float that differs from itself; this also covers NaN
        # objects that are not the numpy singleton.
        is_missing = this_safety_kw is None or (
            isinstance(this_safety_kw, float) and this_safety_kw != this_safety_kw
        )
        if is_missing:
            entry["is_concern"] = -1
            entry["concern_type"] = -1
            safe_subset.append(entry)
            continue

        matched = [kw for kw in this_safety_kw if kw in concern]
        if matched:
            entry["is_concern"] = "True"
            entry["concern_type"] = matched
        else:
            entry["is_concern"] = "False"
            entry["concern_type"] = -1
            safe_subset.append(entry)

    return safe_subset, full_data_dict
            
                    
                    

In [26]:
concern_list = ["AcuteToxic", "EnvironmentalHazard", "HealthHazard"] # modify this list to indicate the safety concerns to be avoided
# safe_subset: records without any listed concern; db_filt: all records, now annotated.
safe_subset, db_filt = get_cand_without_safety_concern(full_data_dict = db_safe_dict, concern = concern_list)
print(len(safe_subset))

138


In [27]:
# Dumps every record of the safe subset.
print(safe_subset)

[{'No.': 2, 'CAS': '64-19-7', 'Name': 'Acetic acid', 'D': 14.5, 'P': 8.0, 'H': 13.5, 'Mole_vol': '57.1', 'ims_idx': '139;190;39', 'bp': 117.0, 'mw': 60.05, 'viscosity': 1.056, 'vis_temp': 25.0, 'heat_of_vap': 23.7, 'hov_temp': 117.9, 'SMILES': 'CC(=O)O', 'alias': nan, 'synonyms': 'ethanoic acid;Ethylic acid;Vinegar acid', 'Note': nan, 'safety_kw': ['Flammable', 'Corrosive'], 'is_concern': 'False', 'concern_type': -1}, {'No.': 3, 'CAS': '108-24-7', 'Name': 'Acetic anhydride', 'D': 16.0, 'P': 11.7, 'H': 10.2, 'Mole_vol': '94.5', 'ims_idx': '40', 'bp': 139.0, 'mw': 102.09, 'viscosity': 0.843, 'vis_temp': 25.0, 'heat_of_vap': 38.2, 'hov_temp': 139.5, 'SMILES': 'CC(=O)OC(=O)C  ', 'alias': nan, 'synonyms': 'Acetyl acetate;Acetanhydride;Ethanoic anhydride', 'Note': nan, 'safety_kw': ['Flammable', 'Corrosive', 'Irritant'], 'is_concern': 'False', 'concern_type': -1}, {'No.': 4, 'CAS': '67-64-1', 'Name': 'Acetone', 'D': 15.5, 'P': 10.4, 'H': 7.0, 'Mole_vol': '74.0', 'ims_idx': nan, 'bp': 56.2, '

In [28]:
# Dumps every annotated record, including those flagged as a concern.
print(db_filt)

[{'No.': 1, 'CAS': '75-07-0', 'Name': 'Acetaldehyde', 'D': 14.7, 'P': 12.5, 'H': 7.9, 'Mole_vol': '56.6', 'ims_idx': nan, 'bp': 20.8, 'mw': 44.05, 'viscosity': 0.21, 'vis_temp': 20.0, 'heat_of_vap': 25.73, 'hov_temp': 20.2, 'SMILES': 'O=CC', 'alias': nan, 'synonyms': 'acetic aldehyde;ethyl aldehyde', 'Note': nan, 'safety_kw': ['Flammable', 'Irritant', 'HealthHazard'], 'is_concern': 'True', 'concern_type': ['HealthHazard']}, {'No.': 2, 'CAS': '64-19-7', 'Name': 'Acetic acid', 'D': 14.5, 'P': 8.0, 'H': 13.5, 'Mole_vol': '57.1', 'ims_idx': '139;190;39', 'bp': 117.0, 'mw': 60.05, 'viscosity': 1.056, 'vis_temp': 25.0, 'heat_of_vap': 23.7, 'hov_temp': 117.9, 'SMILES': 'CC(=O)O', 'alias': nan, 'synonyms': 'ethanoic acid;Ethylic acid;Vinegar acid', 'Note': nan, 'safety_kw': ['Flammable', 'Corrosive'], 'is_concern': 'False', 'concern_type': -1}, {'No.': 3, 'CAS': '108-24-7', 'Name': 'Acetic anhydride', 'D': 16.0, 'P': 11.7, 'H': 10.2, 'Mole_vol': '94.5', 'ims_idx': '40', 'bp': 139.0, 'mw': 102.

In [29]:
# Save the full database together with the is_concern / concern_type annotations.
db_with_safe_info_classified = pd.DataFrame(db_filt)

db_with_safe_info_classified.to_csv("db_with_safe_info_classified.csv", index = None)
db_with_safe_info_classified.to_json("db_with_safe_info_classified.json", orient = "records")

In [30]:
# Reduce the safe subset to a two-column candidate table: the CAS number and
# the solvent name (taken from the "Name" column and relabelled "Solvent").
safe_subset_df = pd.DataFrame(safe_subset)
safe_and_green_candidates = pd.DataFrame(
    {
        "CAS": safe_subset_df["CAS"],
        "Solvent": safe_subset_df["Name"],
    }
)
safe_and_green_candidates.head()

Unnamed: 0,CAS,Solvent
0,64-19-7,Acetic acid
1,108-24-7,Acetic anhydride
2,67-64-1,Acetone
3,75-05-8,Acetonitrile
4,98-86-2,Acetophenone


In [31]:
# Write the candidate list to disk in both CSV and JSON form.
safe_and_green_candidates.to_csv("safe_green_candidates.csv", index = None)
safe_and_green_candidates.to_json("safe_green_candidates.json", orient ="records")

In [36]:
# Scratch check of the string handling used to parse a keyword list stored as text.
test_str = "['Flammable', 'Irritant', 'Health Hazard']"
# Splitting on "," alone leaves a leading space on every item after the first,
# which is why the reformatting loop also removes spaces.
test_str[1:-1].replace("'", "").split(",")


['Flammable', ' Irritant', ' Health Hazard']

In [32]:
# Number of candidates; should equal the safe-subset length printed earlier.
len(safe_and_green_candidates)

138