# Support functions for dictMaker

In [6]:
import pickle
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from chemspipy import ChemSpider
# Module-level ChemSpider client; chemSearch() below calls cs.search() on it.
# NOTE(review): the string passed here looks like an API credential hard-coded in the source —
# consider loading it from configuration or an environment variable before sharing this file.
cs = ChemSpider('0201ba66-585d-4135-9e6b-d28ba4724fcf')
from rdkit import Chem
from rdkit.Chem import Descriptors
from inspect import getmembers, isfunction
import nltk
import re

def linkToSoup(link):
    '''
    support function for dictMaker and searchAndFilter.
    Makes a BeautifulSoup object from link. Sends a browser-like User-Agent header
    so the request is not mistaken for a bot.

    input:
    link: address to download and parse

    returns:
    -Soup object if one can be made
    -None otherwise (malformed link, network/HTTP error, parser failure)
    '''
    try:
        req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
        # the with-block closes the HTTP response even if read() fails
        with urlopen(req) as response:
            page = response.read()
        return BeautifulSoup(page, 'lxml')
    except Exception:
        # best effort: any failure means "no soup"; unlike a bare except this
        # still lets KeyboardInterrupt / SystemExit stop a long scraping run
        return None

def searchAndFilter(number,
                    searchPrefix='http://www.femaflavor.org/search/apachesolr_search/',
                    subString='/flavor/library/'):
    '''
    support function for dictMaker
    Searches the Fema website for the number given, follows every result link that
    contains subString and returns the first page whose title ends with that number.

    Inputs:
    -number: Fema number to search for
    -searchPrefix: web address prefix to search in
    -subString: to filter results

    Returns:
    -(pageHeadings, name, link) for the first matching page, where pageHeadings is the
     list of 'field field-type-header' divs found on that page
    or
    -None if the search page cannot be read, has no usable results, or no result page
     carries the requested number
    '''
    soup = linkToSoup(searchPrefix + str(number))
    if not soup:
        return None
    searchBlock = soup.find_all('dl', class_='search-results apachesolr_search-results')

    #See if there are any results and extract only the links to flavor compounds
    try:
        titles = searchBlock[0].find_all('dt', class_='title')
        links = [title.find('a').get('href') for title in titles] #extracts all search result links
        linksChecked = [link for link in links if subString in link] #selects only links with flavor compound substring
    except (IndexError, AttributeError, TypeError):
        # no result block, a title without an anchor, or an anchor without an href
        return None

    wanted = str(number)
    for link in linksChecked:
        soup = linkToSoup(link)
        if not soup:
            continue
        pageTitle = soup.find('h2', class_='pageTitle')
        if pageTitle is None:
            # page without the expected heading: skip it instead of crashing
            continue
        # the code expects a title of the form "<name> | ... | <number>"
        title = [word.strip() for word in pageTitle.text.split('|')]
        if title[-1] == wanted:
            pageHeadings = soup.find_all('div', class_='field field-type-header')
            return pageHeadings, title[0], link
    return None

def sameChemical(results):
    '''
    returns an rdkit molecule if all the chemicals in a chemspipy result list that can
    be parsed have:
    -the same molecular weight, and
    -the same (rdkit-normalised) smiles representation
    returns None otherwise, including when there are no results or none can be parsed

    input:
    -results: object exposing a .count attribute and iteration over chemicals that
     have a .smiles attribute
    '''
    if results.count < 1:
        return None

    smiles = set()
    mws = set()
    for chemical in results:
        try:
            chemBase = Chem.MolFromSmiles(chemical.smiles)
            smilesTemp = Chem.MolToSmiles(chemBase)
            mwTemp = Descriptors.MolWt(chemBase)
        except Exception:
            # entries that cannot be fetched or parsed are ignored
            continue
        smiles.add(smilesTemp)
        mws.add(mwTemp)

    if len(smiles) == 1 and len(mws) == 1:
        # rebuild from the agreed smiles string rather than from whatever the last
        # loop iteration left behind (which may belong to an entry that failed)
        return Chem.MolFromSmiles(next(iter(smiles)))
    return None

def chemSearch(femaDictEntry, priorityList):
    '''
    returns a rdkit molecule after searching the chemspider database based on the items
    in the priority list.

    inputs:
    -femaDictEntry: dictionary for one chemical (anything without a .get method is skipped)
    -priorityList: sequence of (prefix, key) pairs, tried in order; for each pair the
     search string is prefix + femaDictEntry[key], when that value is present

    returns:
    -the first molecule for which sameChemical() accepts the search results
    -None if no pair produces one
    '''
    for tup in priorityList:
        prefix, entryKey = tup[0], tup[1]

        try:
            value = femaDictEntry.get(entryKey)
        except AttributeError:
            continue
        if not value:
            continue

        # evaluate the results once and reuse the molecule
        mol = sameChemical(cs.search(prefix + value))
        if mol:
            return mol
    return None

In [3]:
#Find individual FEMA website links for each chemical, which have the flavor information in them 

def _flavorTokens(content):
    '''
    support function for dictMaker
    Lowercases content, replaces every run of non-word characters with a space, stems
    each word, and returns the stems that nltk tags as 'NN' or 'JJ'.
    '''
    cleaned = re.sub(r'[\W_]+', ' ', content.lower())
    stemmer = nltk.stem.SnowballStemmer('english')
    stems = ' '.join(stemmer.stem(word) for word in cleaned.split())
    tagged = nltk.pos_tag(nltk.word_tokenize(stems))
    return [word for word, tag in tagged if tag in ('NN', 'JJ')]


def dictMaker(numberIter):
    '''
    returns a dictionary of chemicals found in the femaflavor.org website with FEMA numbers in
    the given numberIter

    inputs:
    -numberIter: an iterable object with the fema numbers to be searched. If it has a
     length, progress is printed as a percentage; otherwise the running count is printed.

    returns:
    dicty with fema number as primary key and the following subkeys:
    'link', 'name', 'FEMA', and, when available on the page,
    'descriptors', 'tokens', 'CAS', 'JECFA', 'CFR';
    'rdkit Mol' is added when chemSearch finds a molecule.

    Progress markers are printed while running: '.' per number, ' <n>nLink' when no page
    was found and ' <n>nMol' when no molecule was found.
    '''

    dicty = {}
    count = 0
    priorityList = [('fema ', 'FEMA'), ('jecfa ', 'JECFA'), ('', 'CAS'), ('', 'name')]
    #page label -> subkey for the fields that are copied verbatim
    copiedLabels = {'CAS': 'CAS', 'JECFA NUMBER': 'JECFA', 'CFR': 'CFR'}

    try:
        total = len(numberIter)
    except TypeError:
        total = None #generators and other unsized iterables

    for number in numberIter:
        #pageNameLink is (pageHeadings, name, link) if there is a FEMA website for number. None otherwise
        pageNameLink = searchAndFilter(number)

        if pageNameLink:
            pageHeadings, name, link = pageNameLink

            #Add all information from FEMA webpage to dicty[number][subentries]
            entry = {'link': link, 'name': name, 'FEMA': str(number)}
            dicty[number] = entry
            for item in pageHeadings:
                try:
                    label = list(item.find('h3', class_='field-label').stripped_strings)[0]
                    content = list(item.find('div', class_='field-item').stripped_strings)[0]
                except (AttributeError, IndexError):
                    #heading without a label/content pair
                    continue

                if label == 'FLAVOR PROFILE':
                    entry['descriptors'] = content
                    entry['tokens'] = _flavorTokens(content)
                elif label in copiedLabels:
                    entry[copiedLabels[label]] = content

            #Add rdkit molecule to dicty[number]['rdkit Mol']
            test = chemSearch(entry, priorityList)
            if test:
                entry['rdkit Mol'] = test
            else:
                print(' {}nMol' .format(number), end='')

        else:
            print(' {}nLink' .format(number), end='')

        count += 1
        if count%10 == 0:
            if total:
                print(' {:.2f}%' .format(count/total*100), end='')
            else:
                print(' {}' .format(count), end='')
        else:
            print('.', end='')
    return dicty

In [4]:
# Scrape FEMA numbers 4500 through 4999; dictMaker prints its progress markers while it runs.
dicty4500to5000 = dictMaker(range(4500,5000))

.. 4502nMol..... 4507nMol.. 2.00%........ 4518nMol. 4.00%......... 6.00%......... 8.00%....... 4547nMol.. 10.00%......... 12.00%.. 4562nMol....... 14.00%......... 16.00%......... 18.00%......... 20.00%......... 22.00%......... 24.00%......... 26.00%......... 28.00%......... 30.00%......... 32.00%........ 4668nMol. 34.00%......... 4679nMol 36.00%......... 4689nMol 38.00% 4690nMol. 4691nMol........ 40.00%..... 4705nMol.... 42.00%....... 4717nMol.. 44.00%....... 4727nMol.. 46.00%. 4731nMol..... 4736nMol. 4737nMol. 4738nMol. 48.00%... 4743nMol. 4744nMol..... 50.00%.. 4752nMol.. 4754nMol. 4755nMol. 4756nMol. 4757nMol.. 52.00%......... 54.00% 4770nMol....... 4777nMol. 4778nMol. 56.00%.. 4782nMol. 4783nMol...... 58.00%... 4793nMol... 4796nMol... 60.00%. 4801nMol... 4804nMol. 4805nMol. 4806nMol. 4807nMol.. 62.00%. 4811nMol. 4812nMol..... 4817nLink. 4818nLink. 4819nLink 64.00% 4820nLink. 4821nLink. 4822nLink. 4823nLink. 4824nLink. 4825nLink. 4826nLink. 4827nLink. 4828nLink. 4829nLink 66.00% 483

In [38]:
# Persist totalDict to totalDict.pickle (totalDict is not built in any cell visible here).
with open('totalDict.pickle', 'wb') as f:
    pickle.dump(totalDict, f)

In [36]:
# NOTE(review): the output recorded below this cell is 2763, which looks like a length rather
# than the dictionary itself — presumably this cell was len(totalDict); confirm.
totalDict

2763

In [81]:
from collections import Counter

def getDictLists(sourceKey, dictionary=None):
    '''
    Generator yielding (key, result) for every entry of dictionary whose sourceKey
    value is available.
    Entries are skipped (nothing is yielded for them) when:
    -the entry has no usable .get method (e.g. top-level list entries), or
    -the value under sourceKey is missing or empty

    inputs:
    -sourceKey: subkey to look up in each entry
    -dictionary: dictionary to scan; the global femaDict is used when omitted
    '''
    if dictionary is None:
        # resolved at call time, so the current femaDict is always the one used
        dictionary = femaDict
    for key, entry in dictionary.items():
        try:
            ans = entry.get(sourceKey)
        except (AttributeError, TypeError):
            continue
        if ans:
            yield key, ans

def phraseToSubWord(phraseList, function1, function2):
    '''
    Splits every phrase in phraseList on whitespace and returns one flat list with
    function2(function1(word)) for each word, in order.

    inputs:
    -phraseList: iterable of strings
    -function1: applied to each word first
    -function2: applied to the result of function1
    '''
    newList = []
    for phrase in phraseList:
        newList.extend(function2(function1(word)) for word in phrase.split())
    return newList

def dictKeyConverter(function1, function2, sourceKey, newKey, dictionary=None):
    '''
    Applies phraseToSubWord() to the sourceKey list of every entry in dictionary and
    stores the result under newKey in the same entry.

    inputs:
    -function1, function2: word-level functions handed to phraseToSubWord
    -sourceKey: subkey holding the list of phrases
    -newKey: subkey that receives the processed word list
    -dictionary: dictionary to update in place; the global femaDict is used when omitted

    returns:
    the same dictionary object
    '''
    if dictionary is None:
        dictionary = femaDict

    # pass the dictionary along so a caller-supplied dictionary is the one scanned
    for key, result in getDictLists(sourceKey, dictionary):
        newList = phraseToSubWord(result,
                                  function1,
                                  function2)
        # debugging aid kept from the original
        if key == 'single descriptor counts':
            print(key, result,newList)
        dictionary[key][newKey] = newList
    return dictionary

def dictSingles(function1, function2, sourceKey, listKey, indexKey, countKey, dictionary=None):
    '''
    returns input dictionary with three new top-level keys:
    -listKey with a sorted list of the unique processed words taken from the short
    phrases (one or two whitespace-separated words) found in the sourceKey lists
    -indexKey with a dictionary mapping each word in the listKey list to its index
    -countKey with a list of (word, count) tuples, most common first

    inputs:
    -function1, function2: word-level functions handed to phraseToSubWord
    -dictionary: dictionary to update in place; the global femaDict is used when omitted

    NOTE(review): phrases of two words are accepted as well as single words — confirm
    that this is the intended meaning of "single descriptor".
    '''
    if dictionary is None:
        dictionary = femaDict

    #create single descriptor list from the sourceKey entries in dictionary
    singleDescriptors = []
    for _, result in getDictLists(sourceKey, dictionary):
        singleDescriptors.extend(phrase for phrase in result
                                 if 0 < len(phrase.split()) < 3)

    #process singleDescriptors with two functions, count the occurrences and assign count to countKey
    singleDescriptors = phraseToSubWord(singleDescriptors,
                                        function1,
                                        function2)
    dictionary[countKey] = Counter(singleDescriptors).most_common()

    #narrow down to a set, sort, and assign singleDescriptor list to listKey
    singleList = sorted(set(singleDescriptors))
    dictionary[listKey] = singleList

    #create index dictionary of singleList elements for indexKey
    dictionary[indexKey] = {word: i for i, word in enumerate(singleList)}
    return dictionary

In [82]:
from nltk import stem
import re

# Matches every run of non-word characters and underscores; raw string so that \W
# reaches the regex engine without an invalid-escape warning.
pattern = re.compile(r'[\W_]+')


def function1(x):
    '''Removes all non-word characters and underscores from x.'''
    return pattern.sub('', x)


stemmer = stem.SnowballStemmer('english')
function2 = stemmer.stem

# Add the stemmed word list of each chemical under 'stemmed descriptors'.
femaDict = dictKeyConverter(function1, function2,
                            sourceKey='descriptors',
                            newKey='stemmed descriptors')

# Add the dictionary-wide descriptor list, index lookup and counts.
femaDict = dictSingles(function1, function2,
                       sourceKey='descriptors',
                       listKey='single descriptors',
                       indexKey='descriptor index pairs',
                       countKey='single descriptor counts')

print(femaDict['4713'])
# print(femaDict['single descriptor counts'])

{'CAS': '26446-38-8', 'rejected descriptors': ['waxi', 'also', 'help', 'to', 'emulsifi', 'other', 'compon', 'within', 'the', 'flavor', 'formul'], 'name': 'SUCROSE MONOPALMITATE', 'descriptor indices': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'stemmed descriptors': ['fatti', 'waxi', 'can', 'also', 'help', 'to', 'emu

In [None]:
def descriptorSelector(listKey, sourceKey, selectKey, rejectKey, dictionary=None):
    '''
    returns an input dictionary with up to two new keys per entry, based on the entry's
    sourceKey list:
    selectKey has a list containing the sourceKey items found in the listKey list
    rejectKey has a list containing the sourceKey items not found in the listKey list
    A key is only written when its list is not empty.

    inputs:
    -listKey: top-level key holding the list of accepted words
    -dictionary: dictionary to update in place; the global femaDict is used when omitted
    '''
    if dictionary is None:
        dictionary = femaDict

    # build the lookup once instead of scanning the list for every word
    accepted = set(dictionary[listKey])

    for key, result in getDictLists(sourceKey, dictionary):
        selectList = [word for word in result if word in accepted]
        rejects = [word for word in result if word not in accepted]
        if selectList:
            dictionary[key][selectKey] = selectList
        if rejects:
            dictionary[key][rejectKey] = rejects
    return dictionary

def selectIndicesMaker(listKey, sourceKey, indexKey,  newKey, dictionary=None):
    '''
    returns an input dictionary with a new key per entry, based on the entry's sourceKey list:
    newKey has a list of 0/1 flags, as long as the listKey list, with a 1 at the index
    (looked up in the indexKey dictionary) of every word in the sourceKey list.
    Words without an index are printed together with the entry key and otherwise ignored.

    inputs:
    -dictionary: dictionary to update in place; the global femaDict is used when omitted
    '''
    if dictionary is None:
        dictionary = femaDict

    width = len(dictionary[listKey])
    for key, result in getDictLists(sourceKey, dictionary):
        newList = [0] * width
        for word in result:
            idx = dictionary[indexKey].get(word)
            if idx is not None: #can't use simple if because it will discount 0 index
                newList[idx] = 1
            else:
                print(key, word)
        dictionary[key][newKey] = newList
    return dictionary

In [None]:
# Split each chemical's stemmed descriptors into those present in the
# 'single descriptors' list and those that are not.
femaDict = descriptorSelector(listKey ='single descriptors', 
                              sourceKey='stemmed descriptors', 
                              selectKey='selected descriptors', 
                              rejectKey='rejected descriptors')

# Turn each chemical's selected descriptors into a 0/1 list aligned with
# the 'single descriptors' list.
femaDict = selectIndicesMaker(listKey='single descriptors',
                              sourceKey='selected descriptors',
                              indexKey='descriptor index pairs', 
                              newKey='descriptor indices')
# Spot-check one entry.
print(femaDict['4713'])

In [None]:
import numpy as np
import pandas as pd

def makeDataFrame(listKey, indexKey, dictionary=None):
    '''
    Returns a pandas DataFrame from lists found in dictionary.
    Every entry that has an indexKey list becomes one row, indexed by int(entry key).
    Uses the listKey list as the column index in the DataFrame, so each indexKey list
    must have the same length as the listKey list.

    inputs:
    -dictionary: dictionary to read; the global femaDict is used when omitted
    '''
    if dictionary is None:
        dictionary = femaDict

    columns = dictionary[listKey]
    index = []
    rows = []
    for key, result in getDictLists(indexKey, dictionary):
        index.append(int(key))
        rows.append(list(result))

    # build the array once; the reshape keeps the column count when there are no rows
    data = np.array(rows, dtype=int).reshape(len(rows), len(columns))
    return pd.DataFrame(data, index=np.array(index, dtype=int), columns=columns)

In [None]:
df = makeDataFrame('single descriptors', 'descriptor indices')

#compares each row in the dataFrame to its counterpart in the dictionary
for key, result in getDictLists('descriptor indices'):
    if (list(df.loc[int(key)]) != result):
        #first mismatching key
        print(key)
        break
else:
    #reached only when the loop finished without finding a mismatch
    print('All rows in DataArray match!')

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def makeWordCould(text, title):
    '''
    Generates a word cloud from text and shows it in a 14x7 matplotlib figure
    with the given title and the axes hidden.
    '''
    cloud = WordCloud().generate(text)
    plt.figure(figsize=(14, 7))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, size=30)
    plt.axis("off")
    plt.show()

def listToText(wordList):
    '''
    input: list of strings (words)
    returns: one string in which every word is preceded by a single space
    (so a non-empty result starts with a space; an empty list gives '')
    '''
    return ''.join(' ' + word for word in wordList)

In [None]:
count = femaDict['single descriptor counts']
# every (word, n) pair contributes n copies of ' word' to the text
text = ''.join((' ' + item[0]) * item[1] for item in count)

makeWordCould(text, 'Stemmed descriptors used for selection')

In [None]:
# index values of the rows flagged with each of the three meat-related columns
meatList, tallowList, beefList = (
    list(df[df[column] == 1].index) for column in ('meat', 'tallow', 'beef')
)
# union of the three lists without duplicates
meatIndices = list(set(meatList + tallowList + beefList))
len(meatIndices)

In [None]:
# gather the stemmed descriptors of every meaty chemical
meatDescriptors = []
for key in meatIndices:
    meatDescriptors.extend(femaDict[str(key)]['stemmed descriptors'])
# reuse the helper instead of repeating the concatenation loop
text = listToText(meatDescriptors)

makeWordCould(text, 'Stemmed descriptors in meaty chemicals')

In [None]:
# gather the selected descriptors of every meaty chemical
meatDescriptors = []
for key in meatIndices:
    meatDescriptors.extend(femaDict[str(key)]['selected descriptors'])
# reuse the helper instead of repeating the concatenation loop
text = listToText(meatDescriptors)

makeWordCould(text, 'Selected descriptors in meaty chemicals')

In [None]:
from sklearn.cluster import KMeans
X = df.values
totalLabels = len(df.columns)
nums = []
unities = []
# sweep the cluster count from half the number of columns down to 1 and record,
# for each count, how many distinct clusters the meaty chemicals fall into
for num in range(totalLabels // 2, 0, -1):
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)
    dfTest = pd.DataFrame(kmeans.labels_, index=df.index)
    labels = [label
              for row in dfTest.loc[meatIndices].values.tolist()
              for label in row]
    unity = len(set(labels))
    nums.append(num)
    unities.append(unity)
    # progress marker
    if num % 5 == 0:
        print(num)

In [None]:
ratios = np.array(unities)/np.array(nums)

plt.figure(1)

# (subplot position, series plotted against nums, axis limits)
panels = [
    (221, ratios, [0, 40, 0, 1]),
    (222, unities, [0, 40, 0, 11]),
    (223, ratios, [22, 27, 0.2, 0.5]),
    (224, unities, [22, 27, 5, 10]),
]
for position, series, limits in panels:
    plt.subplot(position)
    plt.plot(nums, series)
    plt.axis(limits)

plt.show()

In [None]:
# cluster into 25 groups and show which cluster labels the meaty chemicals received
X = df.values
kmeans = KMeans(n_clusters=25, random_state=0).fit(X)
dfTest = pd.DataFrame(kmeans.labels_, index=df.index)
labels = [label
          for row in dfTest.loc[meatIndices].values.tolist()
          for label in row]
set(labels)

In [None]:
# To create word clouds for each of the labels generated at 25 total labels
X = df.values
kmeans = KMeans(n_clusters=25, random_state=0).fit(X)
dfLabels = pd.DataFrame(kmeans.labels_, index = df.index, columns=['labels'])

# Determine the labels associated with Meat (meatIndices), read from this cell's own
# clustering rather than from a DataFrame left over from an earlier cell
labels = dfLabels.loc[meatIndices, 'labels'].tolist()
labelCount = Counter(labels).most_common()
meatPerLabel = dict(labelCount) #label -> number of meaty chemicals with that label
labels = set(labels)

for label in labels:
    #find the associated fema numbers
    femaNums = dfLabels[dfLabels['labels'] == label].index.tolist()
    #compile a list of the selected descriptors of each fema number
    descriptors = []
    for num in femaNums:
        descriptors.extend(femaDict[str(num)]['selected descriptors'])
    #make a wordCloud
    title = 'Label: ' + str(label) + '; Number of chemicals: ' + str(meatPerLabel[label]) + '/' +  str(len(femaNums))
    makeWordCould(listToText(descriptors), title)

In [None]:
def findEmptyKeys(dictionary, descriptorKey):
    '''
    Prints up to five keys of dictionary whose entry has a non-empty descriptorKey value
    and up to five keys whose entry has it missing or empty.
    Top-level entries without a .get method (e.g. lists) are ignored.

    returns: None
    '''
    notEmpty = []
    empty = []

    for key, entry in dictionary.items():
        try:
            test = entry.get(descriptorKey)
        except AttributeError:
            #not a per-chemical dictionary; skip instead of crashing
            continue
        if test:
            notEmpty.append(key)
        else:
            empty.append(key)

    print('Not empty:\n')
    for item in notEmpty[:5]:
        print(item)
    print('\nEmpty:')
    for item in empty[:5]:
        print(item)

# NOTE(review): no cell visible above writes a 'select descriptor indices' key (they write
# 'selected descriptors' and 'descriptor indices') — confirm the intended key.
findEmptyKeys(femaDict, 'select descriptor indices')