In [1]:
from chemproject.dictools import *

In [2]:
# Build a small sample dictionary for two entries with dictionary_maker
# (presumably provided by the star import from chemproject.dictools -- confirm).
test = dictionary_maker([4300, 4332])

https://www.femaflavor.org/flavor/library/cis-and-trans-l-mercapto-p-menthan-3-one
.https://www.femaflavor.org/flavor/library/1-methyl-1h-pyrrole-2-carboxaldehyde
.

In [3]:
# Display the sample dictionary built in the previous cell.
test

{4300: {'CAS': '29725-66-4',
  'FEMA': '4300',
  'JECFA': '1673',
  'descriptors': 'Fruit',
  'link': 'https://www.femaflavor.org/flavor/library/cis-and-trans-l-mercapto-p-menthan-3-one',
  'name': 'CIS- AND TRANS-L-MERCAPTO-P-MENTHAN-3-ONE',
  'rdkit Mol': <rdkit.Chem.rdchem.Mol at 0x112da5d00>,
  'tokens': ['fruit']},
 4332: {'CAS': '1192-58-1',
  'FEMA': '4332',
  'JECFA': '2152',
  'descriptors': 'Nuts',
  'link': 'https://www.femaflavor.org/flavor/library/1-methyl-1h-pyrrole-2-carboxaldehyde',
  'name': '1-METHYL-1H-PYRROLE-2-CARBOXALDEHYDE',
  'rdkit Mol': <rdkit.Chem.rdchem.Mol at 0x112658bc0>,
  'tokens': ['nut']}}

In [38]:
# Persist totalDict to disk as a pickle file.
# NOTE(review): neither totalDict nor pickle is defined in the cells visible
# here -- presumably both come from earlier kernel state; confirm.
with open('totalDict.pickle', 'wb') as pickleFile:
    pickle.dump(totalDict, pickleFile)

In [81]:
from collections import Counter

def getDictLists(sourceKey, dictionary=femaDict):
    '''
    Generator yielding (key, value) pairs for each entry of dictionary that
    holds a truthy value under sourceKey.

    Nothing is yielded for an entry that:
    - has no .get method (for example a top-level list stored next to the
      per-key dictionaries),
    - does not contain sourceKey, or
    - stores a falsy value (None, empty list, empty string, ...) under it.
    '''
    for key, entry in dictionary.items():
        try:
            ans = entry.get(sourceKey)
        except AttributeError:
            # Only the "not a mapping" case is skipped; a bare except would
            # also hide KeyboardInterrupt and genuine programming errors.
            continue
        if ans:
            yield key, ans

def phraseToSubWord(phraseList, function1, function2):
    '''
    Splits every phrase on whitespace and returns one flat list holding
    function2(function1(word)) for each word, in the original order.

    phraseList: an iterable of strings; a single bare string is treated as
                one phrase rather than being iterated character by character.
    function1:  callable applied to each raw word first.
    function2:  callable applied to the result of function1.

    Returns an empty list when phraseList is empty or contains only
    whitespace.
    '''
    if isinstance(phraseList, str):
        phraseList = [phraseList]
    return [
        function2(function1(word))
        for phrase in phraseList
        for word in phrase.split()
    ]

def dictKeyConverter(function1, function2, sourceKey, newKey, dictionary=femaDict):
    '''
    Applies phraseToSubWord() to the sourceKey list of every entry in
    dictionary and stores the resulting word list under newKey in that entry.

    Entries are selected with getDictLists(), so only entries that hold a
    truthy sourceKey value receive a newKey.
    The dictionary is modified in place and also returned.
    '''
    # Pass dictionary through explicitly: without it getDictLists falls back
    # to its own default and a caller-supplied dictionary is never read.
    for key, result in getDictLists(sourceKey, dictionary):
        dictionary[key][newKey] = phraseToSubWord(result, function1, function2)
    return dictionary

def dictSingles(function1, function2, sourceKey, listKey, indexKey, countKey, dictionary=femaDict):
    '''
    Adds three top-level keys to dictionary, derived from the short phrases
    found in the sourceKey lists of its entries, and returns the dictionary:
    -countKey: list of (word, count) tuples from Counter.most_common()
    -listKey: alphabetically sorted list of the distinct processed words
    -indexKey: dictionary mapping each word in the listKey list to its position

    Every kept phrase is split into words and each word is processed with
    function1 and then function2 (see phraseToSubWord).

    NOTE(review): the filter keeps phrases of one OR two words
    (0 < word count < 3), although they are called "single" descriptors;
    confirm that two-word phrases are meant to be included.
    '''
    # Collect the short phrases first; the top-level keys are only added
    # after this iteration over the dictionary has finished.
    singleDescriptors = [
        phrase
        for _, result in getDictLists(sourceKey, dictionary)
        for phrase in result
        if 0 < len(phrase.split()) < 3
    ]

    # Process the words, then record how often each processed word occurs.
    singleDescriptors = phraseToSubWord(singleDescriptors,
                                        function1,
                                        function2)
    dictionary[countKey] = Counter(singleDescriptors).most_common()

    # Distinct words in alphabetical order.
    singleList = sorted(set(singleDescriptors))
    dictionary[listKey] = singleList

    # Position of every word inside singleList.
    dictionary[indexKey] = {word: i for i, word in enumerate(singleList)}
    return dictionary

In [82]:
from nltk import stem
import re

# Remove every run of non-word characters and underscores from a token.
# Raw string: '\W' inside a plain literal is an invalid escape sequence.
pattern = re.compile(r'[\W_]+')
function1 = lambda x: pattern.sub('', x)
# Reduce each cleaned token to its English Snowball stem.
stemmer = stem.SnowballStemmer('english')
function2 = stemmer.stem

# Store the cleaned, stemmed words of every entry under 'stemmed descriptors'.
femaDict = dictKeyConverter(function1, function2, 
                            sourceKey='descriptors', 
                            newKey='stemmed descriptors')

# Build the vocabulary list, its index lookup and the word counts.
femaDict = dictSingles(function1, function2,
                       sourceKey='descriptors',
                       listKey='single descriptors', 
                       indexKey='descriptor index pairs', 
                       countKey='single descriptor counts')

print(femaDict['4713'])

{'CAS': '26446-38-8', 'rejected descriptors': ['waxi', 'also', 'help', 'to', 'emulsifi', 'other', 'compon', 'within', 'the', 'flavor', 'formul'], 'name': 'SUCROSE MONOPALMITATE', 'descriptor indices': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'stemmed descriptors': ['fatti', 'waxi', 'can', 'also', 'help', 'to', 'emu

In [None]:
def descriptorSelector(listKey, sourceKey, selectKey, rejectKey, dictionary=femaDict):
    '''
    Splits the sourceKey list of every entry into two lists stored on that entry:
    selectKey holds the items that also appear in dictionary[listKey]
    rejectKey holds the items that do not appear in dictionary[listKey]

    A key is only written when its list is non-empty, so an entry may end up
    with one, both or neither of the two keys.
    The dictionary is modified in place and also returned.
    '''
    # Build the lookup once; testing against the list itself would rescan
    # the whole vocabulary for every single word.
    allowed = set(dictionary[listKey])

    for key, result in getDictLists(sourceKey, dictionary):
        selectList = [word for word in result if word in allowed]
        rejects = [word for word in result if word not in allowed]
        if selectList:
            dictionary[key][selectKey] = selectList
        if rejects:
            dictionary[key][rejectKey] = rejects
    return dictionary

def selectIndicesMaker(listKey, sourceKey, indexKey,  newKey, dictionary=femaDict):
    '''
    Stores a 0/1 indicator list under newKey for every entry that has a
    sourceKey list.

    The indicator list is as long as dictionary[listKey]; position i is set
    to 1 when a word of the sourceKey list maps to i in dictionary[indexKey].
    Words missing from dictionary[indexKey] are printed together with the
    entry key and otherwise ignored.
    The dictionary is modified in place and also returned.
    '''
    width = len(dictionary[listKey])
    indexLookup = dictionary[indexKey]
    for key, result in getDictLists(sourceKey, dictionary):
        newList = [0] * width
        for word in result:
            idx = indexLookup.get(word)
            if idx is not None:  # a plain truth test would discard index 0
                newList[idx] = 1
            else:
                print(key, word)
        dictionary[key][newKey] = newList
    return dictionary

In [None]:
# Split each entry's stemmed words into vocabulary hits and misses.
femaDict = descriptorSelector(
    listKey='single descriptors',
    sourceKey='stemmed descriptors',
    selectKey='selected descriptors',
    rejectKey='rejected descriptors',
)

# Turn the vocabulary hits into a 0/1 indicator list per entry.
femaDict = selectIndicesMaker(
    listKey='single descriptors',
    sourceKey='selected descriptors',
    indexKey='descriptor index pairs',
    newKey='descriptor indices',
)
print(femaDict['4713'])

In [None]:
import numpy as np
import pandas as pd

def makeDataFrame(listKey, indexKey, dictionary=femaDict):
    '''
    Returns a pandas DataFrame built from the indexKey lists found in dictionary.

    Each entry with a truthy indexKey list becomes one row; the row label is
    int(key), so the entry keys must be convertible to int.
    dictionary[listKey] supplies the column labels, so every indexKey list
    must have the same length as the listKey list.
    When no entry has an indexKey list, an empty DataFrame with the listKey
    columns is returned.
    '''
    columns = dictionary[listKey]
    index = []
    rows = []
    for key, result in getDictLists(indexKey, dictionary):
        index.append(int(key))
        rows.append(result)

    # Build the array once instead of re-allocating it for every row.
    if rows:
        data = np.array(rows)
    else:
        data = np.empty((0, len(columns)), dtype=int)
    df = pd.DataFrame(data, index=index, columns=columns)
    return df

In [None]:
df = makeDataFrame('single descriptors', 'descriptor indices')

# Compare each row in the DataFrame to its counterpart in the dictionary.
# The success message sits in the loop's else branch so that it is only
# printed when no mismatch triggered the break.
for key, result in getDictLists('descriptor indices'):
    if list(df.loc[int(key)]) != result:
        print(key)
        break
else:
    print('All rows in DataArray match!')

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def makeWordCould(text, title):
    '''
    Generates a word cloud from text with default WordCloud settings and
    shows it in a 14x7 figure, axes hidden, with title drawn at size 30.
    '''
    cloud = WordCloud().generate(text)
    _, ax = plt.subplots(figsize=(14, 7))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(title, size=30)
    plt.show()

def listToText(wordList):
    '''
    input: list of strings (words)
    returns: one string in which every word is preceded by a single space,
             so a non-empty result starts with a space; an empty list
             gives an empty string
    '''
    return ''.join(' ' + word for word in wordList)

In [None]:
count = femaDict['single descriptor counts']
# Repeat every word as many times as it was counted, each preceded by a space.
text = ''.join((' ' + item[0]) * item[1] for item in count)

makeWordCould(text, 'Stemmed descriptors used for selection')

In [None]:
# Index values of the rows flagged with 1 in each meat-related column.
meatList = list(df.index[df['meat'] == 1])
tallowList = list(df.index[df['tallow'] == 1])
beefList = list(df.index[df['beef'] == 1])
# Rows flagged in at least one of the three columns, without duplicates.
meatIndices = list(set(meatList + tallowList + beefList))
len(meatIndices)

In [None]:
# Gather the stemmed descriptors of every chemical listed in meatIndices.
meatDescriptors = [
    word
    for key in meatIndices
    for word in femaDict[str(key)]['stemmed descriptors']
]
text = listToText(meatDescriptors)

makeWordCould(text, 'Stemmed descriptors in meaty chemicals')

In [None]:
# Gather the selected descriptors of every chemical listed in meatIndices.
meatDescriptors = [
    word
    for key in meatIndices
    for word in femaDict[str(key)]['selected descriptors']
]
text = listToText(meatDescriptors)

makeWordCould(text, 'Selected descriptors in meaty chemicals')

In [None]:
from sklearn.cluster import KMeans
X = df.values
totalLabels = len(df.columns)
nums = []
unities = []
# Sweep the cluster count from half the number of columns down to 1 and
# record how many distinct clusters the meatIndices rows are spread over.
for num in range(totalLabels // 2, 0, -1):
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)
    dfTest = pd.DataFrame(kmeans.labels_, index=df.index)
    labels = dfTest.loc[meatIndices].values.ravel().tolist()
    unity = len(set(labels))
    nums.append(num)
    unities.append(unity)
    if num % 5 == 0:
        print(num)

In [None]:
ratios = np.array(unities)/np.array(nums)

# One (series, axis limits) pair per panel of the 2x2 grid, in subplot order:
# the top row shows the full range, the bottom row zooms in on 22-27 clusters.
panels = [
    (ratios, [0, 40, 0, 1]),
    (unities, [0, 40, 0, 11]),
    (ratios, [22, 27, 0.2, 0.5]),
    (unities, [22, 27, 5, 10]),
]

plt.figure(1)
for position, (series, limits) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(nums, series)
    plt.axis(limits)

plt.show()

In [None]:
# Fit 25 clusters and show which cluster labels the meatIndices rows fall into.
X = df.values
kmeans = KMeans(n_clusters=25, random_state=0).fit(X)
dfTest = pd.DataFrame(kmeans.labels_, index=df.index)
labels = dfTest.loc[meatIndices].values.ravel().tolist()
set(labels)

In [None]:
# To create word clouds for each of the labels generated at 25 total labels
X = df.values
kmeans = KMeans(n_clusters=25, random_state=0).fit(X)
dfLabels = pd.DataFrame(kmeans.labels_, index=df.index, columns=['labels'])

# Determine the labels associated with Meat (meatIndices).
# Read them from dfLabels, the frame fitted in this cell; the dfTest frame
# left behind by other cells may hold labels from a different cluster count.
labels = dfLabels.loc[meatIndices, 'labels'].tolist()
labelCount = Counter(labels).most_common()
meatCounts = dict(labelCount)  # label -> number of meatIndices rows carrying it
labels = set(labels)

for label in labels:
    # All chemicals (index values) assigned to this cluster label.
    femaNums = dfLabels[dfLabels['labels'] == label].index.tolist()
    # Pool the selected descriptors of every chemical in the cluster.
    descriptors = []
    for num in femaNums:
        descriptors.extend(femaDict[str(num)]['selected descriptors'])
    # Title reports meat chemicals in the cluster / total chemicals in it.
    title = 'Label: ' + str(label) + '; Number of chemicals: ' + str(meatCounts[label]) + '/' + str(len(femaNums))
    makeWordCould(listToText(descriptors), title)

In [None]:
def findEmptyKeys(dictionary, descriptorKey):
    '''
    Prints up to five keys whose entry holds a truthy value under
    descriptorKey ("Not empty") and up to five keys whose entry lacks
    descriptorKey or holds a falsy value under it ("Empty").

    Top-level values without a .get method (for example lists stored next
    to the per-key dictionaries) are skipped and reported in neither group.
    Returns None.
    '''
    notEmpty = []
    empty = []

    for key, entry in dictionary.items():
        try:
            value = entry.get(descriptorKey)
        except AttributeError:
            # Not a mapping, so there is no descriptorKey to inspect.
            continue
        if value:
            notEmpty.append(key)
        else:
            empty.append(key)

    print('Not empty:\n')
    for item in notEmpty[:5]:
        print(item)
    print('\nEmpty:')
    for item in empty[:5]:
        print(item)

    return

# NOTE(review): no cell visible here writes a 'select descriptor indices' key
# (the cells above write 'selected descriptors' and 'descriptor indices');
# confirm this is the key that was meant to be inspected.
findEmptyKeys(femaDict, 'select descriptor indices')