In [1]:
import os
import pickle
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from chemspipy import ChemSpider
# NOTE(review): credentials should not live in the notebook. The key is read
# from the CHEMSPIDER_API_KEY environment variable when it is set; the literal
# is kept only as a fallback so existing runs keep working and should be
# rotated and removed.
cs = ChemSpider(os.environ.get('CHEMSPIDER_API_KEY',
                               '0201ba66-585d-4135-9e6b-d28ba4724fcf'))
from rdkit import Chem

In [26]:
def sameChemical(results):
    '''
    Returns an rdkit molecule when every usable chemical in a chemspipy result
    list describes the same compound, i.e. they all share:
    -the same molecular weight, and
    -the same smiles representation (as re-written by rdkit)
    Returns None otherwise, including when the result list is empty or when no
    entry could be turned into an rdkit molecule.

    Entries whose smiles cannot be fetched or parsed are skipped (best effort).
    '''
    if results.count == 0:
        return None

    # Imported after the empty-result check so that path does not need rdkit.
    # Descriptors has to be imported here: relying on a notebook-level import
    # made every MolWt call fail with a NameError that was silently swallowed,
    # so the function could never return a molecule on a fresh kernel.
    from rdkit import Chem
    from rdkit.Chem import Descriptors

    smiles = set()
    mws = set()
    for chemical in results:
        try:
            chemBase = Chem.MolFromSmiles(chemical.smiles)
            if chemBase is None:
                # rdkit signals an unparsable smiles by returning None
                continue
            smilesTemp = Chem.MolToSmiles(chemBase)
            mwTemp = Descriptors.MolWt(chemBase)
        except Exception:
            # best effort: ignore entries that cannot be fetched or processed
            continue
        # Record both values together so the two collections stay in sync.
        smiles.add(smilesTemp)
        mws.add(mwTemp)

    if len(smiles) == 1 and len(mws) == 1:
        # Build the molecule from the agreed smiles rather than from whatever
        # the loop variable happened to hold last (which may be None).
        return Chem.MolFromSmiles(next(iter(smiles)))
    return None

def chemSearch(femaDictEntry, priorityList):
    '''
    Returns an rdkit molecule after searching the chemspider database based on
    the items in the priority list, or None when no search gives a clear match.

    Inputs:
    -femaDictEntry: dict-like entry; looked up with .get(field)
    -priorityList: sequence of (search prefix, field name) tuples, tried in order

    The first field whose search results all agree (see sameChemical) wins.
    '''
    import os
    from chemspipy import ChemSpider
    # NOTE(review): credentials should not live in the notebook. The key is
    # read from CHEMSPIDER_API_KEY when set; the literal is only a fallback so
    # existing runs keep working and should be rotated and removed.
    cs = ChemSpider(os.environ.get('CHEMSPIDER_API_KEY',
                                   '0201ba66-585d-4135-9e6b-d28ba4724fcf'))

    for tup in priorityList:
        prefix, field = tup[0], tup[1]

        try:
            t = femaDictEntry.get(field)
        except AttributeError:
            # entry is not dict-like, so this field cannot be looked up
            continue

        if not t:
            continue

        results = cs.search(prefix + t)
        # Evaluate the results once: sameChemical walks every compound in the
        # result list, so calling it twice doubled the work for every hit.
        mol = sameChemical(results)
        if mol is not None:
            return mol
    return None

def applyFunctions(results, functionList, noZeros=True):
    '''
    Applies a list of functions to a list of objects

    Inputs:
    -results: list of tuples in the form: (fema number, object)
    -functionList: list of tuples in the form (function name, function)
    -noZeros: when True, columns that contain only zeros are dropped

    returns:
    -panda dataframe with fema numbers as rows, function names as columns,
    and the results of applying function to the fema number molecule as entry.
    A function call that raises is recorded as 0.
    If noZeros is True function will delete all columns that only have zeros.
    '''
    import pandas as pd

    index = [chem[0] for chem in results]
    columns = [function[0] for function in functionList]

    # Collect each column as a plain list and build the frame once. Writing
    # floats cell by cell into a frame pre-filled with integer zeros relies on
    # implicit upcasting, which newer pandas warns about or rejects; with the
    # old blanket "except" such a failure silently left the value at 0.
    data = {}
    for function in functionList:
        name, func = function[0], function[1]
        values = []
        for chem in results:
            try:
                values.append(func(chem[1]))
            except Exception:
                # best effort: a descriptor that fails for a molecule counts as 0
                values.append(0)
        data[name] = values

    df = pd.DataFrame(data, index=index, columns=columns)

    # Delete columns with only zeros if noZeros is True
    if noZeros:
        df = df.loc[:, (df != 0).any(axis=0)]

    return df

def getFunctions(module):
    '''
    returns a list of tuples with (function name, function) for all the
    functions found in a module, sorted by name.

    Only plain Python functions are included (inspect.isfunction); classes,
    constants and builtins are left out.
    '''
    from inspect import getmembers, isfunction

    # Inspect the module that was passed in. The previous body ignored the
    # argument and always looked at a global named Descriptors.
    return getmembers(module, isfunction)

def dictSearch(dictionary):
    '''
    returns:
    Copy of the Dictionary mutated so that it includes a 'rdkit Mol' subentry
    for each key where a clear Molecule could be found in chemspider and
    created with rdKit.

    The copy is shallow: the per-key entries are shared with the input, so the
    'rdkit Mol' subentries also appear in the original dictionary.

    Side effects: prints progress, and every 50 keys pickles the descriptor
    table of the molecules found so far to 'temp_data'.
    '''
    from rdkit.Chem import Descriptors

    copy = dictionary.copy()

    functionList = getFunctions(Descriptors)
    priorityList = [('fema ', 'FEMA'), ('jecfa ', 'JECFA'), ('', 'CAS'), ('', 'name')]
    results = []
    total = len(dictionary)

    for counter, key in enumerate(copy.keys(), start=1):

        # See if a molecule can be generated for the key based on the priority list
        mol = chemSearch(dictionary[key], priorityList)
        if mol is not None:
            # chemSearch returns the molecule itself, not a tuple; indexing it
            # with [1] raised a TypeError. applyFunctions expects
            # (key, molecule) pairs, so pair the molecule with its key here.
            copy[key]['rdkit Mol'] = mol
            results.append((key, mol))

        # To display periods and % completed as the function is doing its thing:
        if counter % 50 == 0:
            # the precision belongs to round(), not to format()
            print(' {}% done ' .format(round(counter / total * 100, 2)))
            df = applyFunctions(results, functionList)
            df.to_pickle('temp_data')

        elif counter % 2 == 0:
            print('.', end='')

    return copy

In [50]:
from rdkit.Chem import Descriptors
def getDictLists(sourceKey, dictionary=femaDict):
    '''
    Generator yielding (key, value) pairs, where value is what the entry stored
    under key holds for sourceKey.

    Entries are skipped (nothing is yielded for them) when they are not
    dict-like, do not contain sourceKey, or hold a falsy value for it.
    '''
    for key, entry in dictionary.items():
        try:
            ans = entry.get(sourceKey)
        except AttributeError:
            # entry has no .get, i.e. it is not dict-like; same handling as
            # chemSearch instead of a blanket "except" that hides every error
            continue
        if ans:
            yield key, ans
    
# (name, function) pairs to evaluate for every entry.
functionList = getFunctions(Descriptors)
# (key, value) pairs for every entry that has a truthy 'rdkit Mol' subentry.
results = list(getDictLists('rdkit Mol'))

# One row per key, one column per function name.
dfData = applyFunctions(results, functionList)

In [51]:
# Convert every index label with int(); presumably so the labels match the
# index of the label frame that is joined further down -- TODO confirm.
dfData.index = dfData.index.map(int)

In [54]:
# Work on a copy and put every column under a top-level 'data' label.
tdf = dfData.copy()
l2 = [name for name in tdf.columns]
l1 = ['data' for _ in l2]
z = [pair for pair in zip(l1, l2)]
tdf.columns = pd.MultiIndex.from_tuples(z, names=['type', 'names'])
tdf.head(3)

type,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data,data
names,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,...,fr_piperzine,fr_priamide,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
3811,1.509104,1247.709777,31.584693,23.255986,23.255986,20.352682,13.450686,13.450686,10.372505,10.372505,...,0,0,0,0,0,0,0,0,0,0
2368,3.007845,154.814331,12.062632,11.094671,11.094671,7.80806,6.96469,6.96469,4.445344,4.445344,...,0,0,0,0,0,0,0,0,7,0
3906,3.152941,255.189433,7.560478,5.795,5.795,4.715214,3.07015,3.07015,2.143437,2.143437,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df25 = pd.read_pickle('df25')
# Work on a copy and put every column under a top-level 'lbl' label.
tdf25 = df25.copy()
l2 = [name for name in tdf25.columns]
l1 = ['lbl' for _ in l2]
z = [pair for pair in zip(l1, l2)]
tdf25.columns = pd.MultiIndex.from_tuples(z, names=['type', 'names'])
tdf25.head(3)

type,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl
names,labels,acid,alcohol,almond,anis,appl,apricot,aris,aromat,asparagus,...,violet,walnut,warm,watermelon,wax,wet,wine,wood,wool,yeast
2401,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4263,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2209,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Side-by-side join of the 'lbl' and 'data' frames; join='inner' keeps only the
# index labels present in both.
dfAll = pd.concat(objs=[tdf25, tdf], axis='columns', join='inner')
dfAll.shape

(1886, 409)

In [57]:
# Preview the first three rows of the joined frame.
dfAll.head(3)

type,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,lbl,...,data,data,data,data,data,data,data,data,data,data
names,labels,acid,alcohol,almond,anis,appl,apricot,aris,aromat,asparagus,...,fr_piperzine,fr_priamide,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
2368,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0
3906,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4212,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Persist the joined frame to the file 'df25All'.
dfAll.to_pickle('df25All')

In [62]:
# Take the columns under the top-level 'lbl' label, without the 'labels' column.
df = dfAll['lbl'].drop(columns='labels')
df.head()

names,acid,alcohol,almond,anis,appl,apricot,aris,aromat,asparagus,astring,...,violet,walnut,warm,watermelon,wax,wet,wine,wood,wool,yeast
2368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
from sklearn.cluster import KMeans
# Cluster the rows of df (built from dfAll['lbl'] above) for a range of cluster counts.
X = df.values
# NOTE(review): totalLabels is not used anywhere in this cell.
totalLabels = len(df.columns)
# Parallel lists: nums[i] is a cluster count, unities[i] the matching "unity".
nums = []
unities = []
# Try cluster counts from 35 down to 6 (the range stop of 5 is exclusive).
for num in range(35, 5, -1):
    # random_state is fixed so repeated runs use the same seed.
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)
    # One-column frame holding the cluster id assigned to each row of df.
    dfTest = pd.DataFrame(kmeans.labels_, index = df.index)
    # NOTE(review): meatIndices is not defined in any visible cell, and the
    # recorded output of this cell is a NameError. It presumably holds the index
    # labels of a group of interest and must be defined before this cell runs --
    # TODO confirm.
    labels = dfTest.loc[meatIndices].values.tolist()
    # Flatten the list of single-element rows into a flat list of cluster ids.
    labels = [label for lst in labels for label in lst]
    # Number of distinct clusters the selected rows were assigned to.
    unity = len(set(labels))
    nums.append(num)
    unities.append(unity)
    # Progress output on every fifth cluster count.
    if num%5 == 0:
        print(num)

NameError: name 'meatIndices' is not defined