In [1]:
# for basic data science
import pandas as pd
import math

# for downloading files off the internet
import urllib.request
import json
import time

In [2]:
# JSON files are at addresses of this form
def flavordb_entity_url(x):
    """
    Build the web address of the FlavorDB entity JSON page for the given id.

    `x` is converted with str() and appended as the value of the `id` query parameter.
    """
    base_url = "https://cosylab.iiitd.edu.in/flavordb2/entities_json?id="
    return f"{base_url}{x!s}"


# translates the JSON file at the specified web address into a dictionary
def get_flavordb_entity(x):
    """
    Download the FlavorDB entity page for id `x` and parse its body as JSON.

    Returns the object produced by json.loads for the response body.

    Any error raised while opening the URL (for example urllib.error.HTTPError
    when the page does not exist) or while parsing the JSON propagates to the
    caller; this function never returns None on failure.
    """
    # source: https://stackoverflow.com/questions/12965203/how-to-get-json-from-webpage-into-python-script
    with urllib.request.urlopen(flavordb_entity_url(x)) as url:
        # the body is bytes; decode() uses UTF-8 by default
        return json.loads(url.read().decode())


In [3]:
# "subcolumns" in the "molecules" column that we are interested in
def molecules_df_cols():
    """
    Names of the fields of each "molecules" entry that we keep, in column order.

    A new list is built on every call, so callers may slice or modify it freely.
    """
    cols = [
        'pubchem_id',
        'flavor_profile',
        'bitterdb_id',
        'bitter',
        "supersweetdb_id",
        "super_sweet",
    ]
    return cols


In [4]:
def clean_flavordb_dataframes(molecules_df):  # TODO: Recheck
    """
    Normalizes the 'flavor_profile' column of the molecules DataFrame.

    Every entry of 'flavor_profile' is iterated as a collection of strings; each
    string is stripped of surrounding whitespace and lowercased, and the results
    are stored back as a set. All other columns are left untouched.

    The DataFrame is modified in place and is also returned.
    """
    molecules_df['flavor_profile'] = [
        {flavor.strip().lower() for flavor in profile}
        for profile in molecules_df['flavor_profile']
    ]

    return molecules_df
    
# generate dataframes from some of the JSON objects
def get_flavordb_dataframes(start, end):
    """
    Downloads JSON data, converts it to a DataFrame, and cleans it.

    The entities with ids in range(start, end) are fetched one at a time. Every
    molecule listed in an entity's 'molecules' field is recorded once, keyed by
    its 'pubchem_id'; later occurrences of the same id are ignored.

    Returns a tuple (molecules_df, missing): the cleaned molecules DataFrame and
    the list of entity ids whose JSON page answered with HTTP 404.

    Raises RuntimeError (chained to the original HTTPError) when a page fails
    with any HTTP status other than 404.
    """
    # fields copied verbatim from each molecule entry; 'pubchem_id' is used as the
    # key and 'flavor_profile' is split into a set, so both are handled separately
    plain_cols = molecules_df_cols()[2:]

    # make intermediate values to make dataframes from
    molecules_dict = {}
    missing = []  # numbers of the missing JSON files during iteration

    for i in range(start, end):
        # we use a try-except here because some of the JSON pages are missing
        try:
            # gets the ith food entity, as a JSON dict
            fdbe = get_flavordb_entity(i)

            # update the molecules data with the entries of the 'molecules' field
            for m in fdbe['molecules']:
                if m['pubchem_id'] not in molecules_dict:
                    # flavor names are '@'-separated in the source data
                    row = [set(m['flavor_profile'].split('@'))]
                    row.extend(m[col] for col in plain_cols)
                    molecules_dict[m['pubchem_id']] = row

        except urllib.error.HTTPError as e:
            if e.code == 404:  # if the JSON file is missing
                missing.append(i)
            else:
                # report the id that was actually being fetched
                raise RuntimeError(
                    'Error while fetching JSON object from ' + flavordb_entity_url(i)
                ) from e

    molecules_df = pd.DataFrame(
        [[k, *v] for k, v in molecules_dict.items()],
        columns=molecules_df_cols()
    )

    # clean up the dataframe columns
    molecules_df = clean_flavordb_dataframes(molecules_df)

    return molecules_df, missing


In [5]:
# updates & saves the download progress of your dataframes
def update_flavordb_dataframes(df1, ranges):
    """
    Adds more data to the specified DataFrame, and saves it as a CSV file.

    `ranges` is an iterable of (a, b) pairs; each pair is passed to
    get_flavordb_dataframes and the rows it returns are appended to `df1`
    (the index is renumbered on every append).

    If successful, returns a tuple of the updated DataFrame and the list of
    missing JSON file ids collected over all ranges. If any range fails, the
    error propagates unchanged.

    Whether or not an error occurs, everything gathered so far is written to
    'molecules.csv' and the elapsed time is printed.
    """
    df1_old = df1
    missing_old = []

    # time how long it took to download the files
    start = time.time()

    # for each range in ranges, save your progress.
    # don't continue with the program unless everything succeeds!
    try:
        for a, b in ranges:
            df1_new, missing_new = get_flavordb_dataframes(a, b)
            df1_old = pd.concat([df1_old, df1_new], ignore_index=True)
            missing_old.extend(missing_new)

        return df1_old, missing_old
    finally:
        # runs on success and on error alike, so partial progress is always saved;
        # an exception raised above still propagates after this block
        df1_old.to_csv('molecules.csv')

        end = time.time()
        mins = (end - start) / 60.0
        print('Downloading took: ' + str(mins) + ' minutes')


In [6]:
# start from an empty DataFrame that already has the expected columns
df1 = pd.DataFrame(columns=molecules_df_cols())

# cover ids 0..999 in consecutive half-open chunks of 50:
# (0, 50), (50, 100), ..., (950, 1000)
ranges = [(lo, lo + 50) for lo in range(0, 1000, 50)]
# download every chunk and save the result as a csv file; the returned
# (DataFrame, missing ids) tuple is shown as the cell output
update_flavordb_dataframes(df1, ranges)


Downloading took: 3.540716548760732minutes


(     pubchem_id                                     flavor_profile  \
 0          6322                                            {faint}   
 1          6736  {animal, indole, mothball, fecal, civet, very ...   
 2         31252  {cocoa, roasted nut, medical, roasted nuts, me...   
 3          7909  {herbal, dairy, green, solvent, fruity, sharp,...   
 4          7284              {cocoa, almond, musty, nutty, coffee}   
 ...         ...                                                ...   
 7635     107905                                           {bitter}   
 7636    5280343                                           {bitter}   
 7637      65084                                           {bitter}   
 7638        994                                         {odorless}   
 7639     445154                                           {bitter}   
 
      bitterdb_id bitter supersweetdb_id super_sweet  
 0                     0                              
 1                     0            