In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [32]:
import glob as glob
import json
import os
# Hashable lives in collections.abc; the alias in `collections` was removed in Python 3.10.
from collections.abc import Hashable
from hashlib import md5

import bs4 as bs
import numpy as np
import pandas as pd
import pubchempy as pc



In [13]:
import cachegrab as cg
from cachegrab.cachers.basic import BasicCachingGetter as BCG

In [5]:
# Directory holding the parsed compatibility-chart CSVs.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
basedir = '/home/mike/data/chemistry/compatibility/parsed/'
# NOTE(review): glob.glob() does not promise any particular ordering, yet later
# cells index the loaded frames by position (frames[0], frames[1], frames[2]).
files = glob.glob(basedir + '*.csv')
files

['/home/mike/data/chemistry/compatibility/parsed/tabula-Chemical-Resistance-Chart.csv',
 '/home/mike/data/chemistry/compatibility/parsed/ColeParmerRaw02.csv',
 '/home/mike/data/chemistry/compatibility/parsed/Graco_ChemCompGuidecsv.csv']

In [6]:
# Read each CSV in `files` into its own DataFrame; frames[i] corresponds to files[i].
frames = [pd.read_csv(fn) for fn in files]

In [7]:
frames[0].head()

Unnamed: 0,chemical,302 Stainless Steel,304 Stainless Steel,316 Stainless Steel,440 Stainless Steel,Aluminum,Titanium,Hastelloy C,Cast Bronze,Brass,...,Carbon,Ceramic,Ceramagnet A,Viton,Buna-N (Nitrile),Silicon,Neoprene,EPDM,Natural rubber,Epoxy
0,Acetaldehyde 5,A,A,A,-,B,A,A,D,-,...,A,A,-,D,B,B,D,B,C,A
1,Acetamide,-,B,A,-,-,-,-,-,-,...,-,A,-,A,A,-,A,A,D,A
2,Acetate Solv. 2,A,B,A,B,B,-,-,A,C,...,A,A,-,D,D,-,D,-,-,A
3,"Acetic Acid, Glacia 1",-,B,A,A,B,A,A,C,C,...,A,A,-,D,D,B,C,B,C,B
4,Acetic Acid 20%,-,B,A,-,-,A,A,-,C,...,-,A,-,A,C,-,C,-,-,B


In [8]:
frames[1].head().replace({np.nan: '-'})

Unnamed: 0,chemical,Hypalon,PVC,LDPE,Kalrez,Polyetherether Ketone (PEEK),Viton,ABS plastic,Kel-F,CPVC,...,Carbon graphite,Acetal (Delrin),EPDM,stainless steel - 316,Ceramic Al203,Bronze,Silicone,Polyurethane,Neoprene,Polypropylene
0,Acetaldehyde,C,D,C,A,A,D,D,A,D,...,A,A,A,A,-,A,A,D,C,A
1,Acetamide,B,D,A,A,-,B,-,A,-,...,A,A,A,A,-,D,B,D,B,A
2,Acetate Solvent,C,D,A,C,-,D,-,A,C,...,A,-,A,A,-,C,C,D,D,B
3,Acetic Acid,C,D,A,C,A,B,D,A,C,...,A,D,A,B,A,C,C,D,C,B
4,Acetic Acid 20%,A,D,A,A,A,B,C,A,A,...,A,C,A,A,A,C,B,D,A,A


In [9]:
frames[2].head()

Unnamed: 0,chemical,Aluminum,Carbon Steel,Cast/Ductile Iron,304 Stainless Steel,316 Stainless Steel,Acetal,Buna,CSM (Hypalon),"EPR, EPDM",...,Fluoroelastomer (FKM),Nitrile (TPE),Nylon,Polychloroprene,Polypropylene,PTFE,PVDF,Santoprene (EPDM & Polypropylene),UHMWPE,Urethane
0,Acetaldehyde,B,D,C,A,A,A,D,C,A,...,D,D,B,D,C,A,D,B,B,D
1,Acetamide,A,D,D,D,A,A,B,B,A,...,B,A,B,B,A,A,D,A,A,D
2,Acetate Solvents,B,D,D,D,A,A,D,C,B,...,D,D,A,D,D,A,D,B,B,D
3,Acetic Acid,B,D,D,D,B,D,C,C,A,...,C,C,D,C,B,A,C,C,B,C
4,Acetic Acid — 20%,B,D,D,B,A,C,C,A,A,...,B,-,D,B,B,A,B,B,A,-


In [10]:
pc.get_compounds('Aspirin', 'name')

[Compound(2244)]

In [35]:
resp = pc.get_compounds('Acetaldehyde', 'name')
print(resp, type(resp))

[Compound(177)] <class 'list'>


In [39]:
c = resp[0]
type(c)

pubchempy.Compound

In [54]:
def pc_get_by_name_as_dict(name):
    """Search pubchempy for compounds called ``name``.

    :param name: Compound name passed to ``pc.get_compounds`` with the
        ``'name'`` namespace.
    :return: List with the ``to_dict()`` form of every compound returned.
    """
    compound_dicts = []
    for compound in pc.get_compounds(name, 'name'):
        compound_dicts.append(compound.to_dict())
    return compound_dicts

def search_for_cid(name):
    """Search for a compound by name and keep only the CIDs.

    :param name: Compound name passed to ``pc.get_compounds`` with the
        ``'name'`` namespace.
    :return: ``{name: [{'cid': ...}, ...]}`` with one entry per compound
        returned, or an empty dict when the search returned nothing.
    """
    matches = pc.get_compounds(name, 'name')
    if not matches:
        return {}

    cid_records = []
    for compound in matches:
        cid_records.append({'cid': compound.cid})
    return {name: cid_records}

In [55]:
c = search_for_cid('ethanol')
c

{'ethanol': [{'cid': 702}]}

In [56]:
json.dumps(c)

'{"ethanol": [{"cid": 702}]}'

In [14]:
bcg = BCG()

In [26]:
def hash_md5(stringable, strict=True):
    """Return the hexadecimal MD5 digest of ``str(stringable)``.

    :param stringable: Object whose ``str()`` form is hashed.
    :param strict: When true, refuse objects that are not ``Hashable``.
    :return: The digest as a hex string.
    :raises TypeError: If ``strict`` is true and ``stringable`` is not hashable.
    """
    if strict:
        if not isinstance(stringable, Hashable):
            message = ('Argument of type {} is not hashable. '
                       'Use a hashable object, or set strict=False')
            raise TypeError(message.format(type(stringable)))

    encoded = str(stringable).encode()
    return md5(encoded).hexdigest()

# A dict is not Hashable, so hash_md5 would raise in strict mode; strict=False
# makes it hash str(cfg) instead.
cfg = {'identifier': 'Acetaldehyde', 'namespace': 'name'}
hash_md5(cfg, False)

'b55c6dc96a547cb453a159426f7781a0'

In [62]:
def getfn(fn, kwargs, flush=False, basepath='./cache/'):
    """Call ``fn(**kwargs)`` and cache its JSON-serializable result on disk.

    The result is stored in ``<basepath>/<md5>.json``. The digest is built from
    the callable's module and qualified name plus the keyword arguments
    (serialized with sorted keys, so argument order does not matter).

    NOTE: the cache key no longer depends on ``fn.__doc__`` (functions sharing
    a docstring, or having none, used to collide), so cache files written with
    the old key are not reused.

    :param fn: Callable invoked on a cache miss.
    :param kwargs: Dict of keyword arguments, expanded as ``fn(**kwargs)``.
    :param flush: Refresh cache by forcing a call to ``fn``.
    :param basepath: Directory holding the cache files; created if missing.
    :return: The cached value on a hit, otherwise the fresh result of ``fn``.
    :raises TypeError: If the fresh result cannot be serialized to JSON
        (raised before anything is written to disk).
    """
    # Identify the callable by where it is defined rather than by its docstring.
    fn_module = getattr(fn, '__module__', None)
    fn_name = getattr(fn, '__qualname__', type(fn).__qualname__)
    # default=str keeps key generation from failing on non-JSON argument values.
    arg_repr = json.dumps(kwargs, sort_keys=True, default=str)
    key_source = '{}.{}:{}'.format(fn_module, fn_name, arg_repr)

    # Same digest hash_md5() yields for a str argument.
    digest = md5(key_source.encode()).hexdigest()
    cache_path = os.path.join(basepath, digest + '.json')

    # Try to read the cached file. An unreadable (corrupt) file is treated as
    # a miss and overwritten below. Returning directly lets a cached ``null``
    # count as a hit.
    if not flush and os.path.exists(cache_path):
        try:
            with open(cache_path, 'r') as f:
                return json.load(f)
        except ValueError:
            pass

    data = fn(**kwargs)

    # Serialize before touching the file so a non-serializable result cannot
    # leave a truncated cache file behind.
    payload = json.dumps(data)
    if basepath:
        os.makedirs(basepath, exist_ok=True)
    with open(cache_path, 'w') as f:
        f.write(payload)

    return data

In [65]:
# getfn expands this dict as fn(**kwargs), so the key has to match
# search_for_cid's parameter name (`name`).
cfg = {'name': 'Aspirin'}
getfn(search_for_cid, kwargs=cfg)

{'Aspirin': [{'cid': 2244}]}

In [11]:
pc.get_substances('Acetaldehyde', 'name') # slow

[Substance(2088),
 Substance(3384),
 Substance(73431),
 Substance(587302),
 Substance(8143150),
 Substance(10534779),
 Substance(11528336),
 Substance(14709360),
 Substance(17390069),
 Substance(48413129),
 Substance(48414938),
 Substance(48416790),
 Substance(48421273),
 Substance(48422755),
 Substance(48424773),
 Substance(51071761),
 Substance(53790039),
 Substance(57570848),
 Substance(85083189),
 Substance(85285454),
 Substance(87563068),
 Substance(93166073),
 Substance(103429387),
 Substance(104294072),
 Substance(111677786),
 Substance(117397293),
 Substance(126677984),
 Substance(127322526),
 Substance(127322527),
 Substance(127322528),
 Substance(127322529),
 Substance(127322530),
 Substance(127322531),
 Substance(127322532),
 Substance(127322533),
 Substance(127322534),
 Substance(127322535),
 Substance(127322536),
 Substance(127322537),
 Substance(127322538),
 Substance(127322539),
 Substance(127322540),
 Substance(127322541),
 Substance(127322542),
 Substance(127322543),
 

In [19]:
c = pc.Compound.from_cid(177)
c

Compound(177)

In [20]:
c.to_dict()

{'atom_stereo_count': 0,
 'atoms': [{'aid': 1, 'element': 'O', 'number': 8, 'x': 3.732, 'y': 0.56},
  {'aid': 2, 'element': 'C', 'number': 6, 'x': 2, 'y': 0.56},
  {'aid': 3, 'element': 'C', 'number': 6, 'x': 2.866, 'y': 0.06},
  {'aid': 4, 'element': 'H', 'number': 1, 'x': 2.31, 'y': 1.0969},
  {'aid': 5, 'element': 'H', 'number': 1, 'x': 1.4631, 'y': 0.87},
  {'aid': 6, 'element': 'H', 'number': 1, 'x': 1.69, 'y': 0.0231},
  {'aid': 7, 'element': 'H', 'number': 1, 'x': 2.866, 'y': -0.56}],
 'bond_stereo_count': 0,
 'bonds': [{'aid1': 1, 'aid2': 3, 'order': 2},
  {'aid1': 2, 'aid2': 3, 'order': 1},
  {'aid1': 2, 'aid2': 4, 'order': 1},
  {'aid1': 2, 'aid2': 5, 'order': 1},
  {'aid1': 2, 'aid2': 6, 'order': 1},
  {'aid1': 3, 'aid2': 7, 'order': 1}],
 'cactvs_fingerprint': '100000000100000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [26]:
# Take the first 20 entries of the 'chemical' column of frames[1] as a sample
# of names to look up, and display them as a plain list.
test = frames[1]['chemical'].head(20)
[n for n in test]

['Acetaldehyde',
 'Acetamide',
 'Acetate Solvent',
 'Acetic Acid',
 'Acetic Acid 20%',
 'Acetic Acid 80%',
 'Acetic Acid, Glacial',
 'Acetic Anhydride',
 'Acetone',
 'Acetyl Bromide',
 'Acetyl Chloride (dry)',
 'Acetylene',
 'Acrylonitrile',
 'Adipic Acid',
 'Alcohols:Amyl',
 'Alcohols:Benzyl',
 'Alcohols:Butyl',
 'Alcohols:Diacetone',
 'Alcohols:Ethyl',
 'Alcohols:Hexyl']

In [27]:
# Look up every sample name with pubchempy and print the name next to its matches.
# NOTE: this rebinds the notebook-level name `c` on each iteration.
for n in test:
    c = pc.get_compounds(n, 'name')
    print(n, c)

Acetaldehyde [Compound(177)]
Acetamide [Compound(178)]
Acetate Solvent []
Acetic Acid [Compound(176)]
Acetic Acid 20% []
Acetic Acid 80% []
Acetic Acid, Glacial [Compound(176)]
Acetic Anhydride [Compound(7918)]
Acetone [Compound(180)]
Acetyl Bromide [Compound(10482)]
Acetyl Chloride (dry) []
Acetylene [Compound(6326)]
Acrylonitrile [Compound(7855)]
Adipic Acid [Compound(196)]
Alcohols:Amyl []
Alcohols:Benzyl []
Alcohols:Butyl []
Alcohols:Diacetone []
Alcohols:Ethyl []
Alcohols:Hexyl []


In [28]:
pc.get_compounds('Amyl alcohol', 'name')

[Compound(6276)]

In [29]:
pc.get_compounds('alcohol, Amyl', 'name')

[]