In [89]:
import json
from pathlib2 import Path
from pprint import pprint
import numpy as np
import pandas as pd

ingredient_file = 'ewg_ingredients.json'
product_file = 'ewg_products.json'


def load_crawl_output(path):
    """Return the parsed JSON content of one crawler output file.

    Returns an empty list when the file cannot be opened or read (IOError),
    so a missing crawler output is simply skipped. Malformed JSON is not
    caught and still raises.
    """
    try:
        # `with` closes the handle as soon as parsing is done instead of
        # leaving it open until garbage collection.
        with open(path) as f:
            return json.load(f)
    except IOError:
        return []


# Raw output of the three crawler instances.
c1 = load_crawl_output('ewg_skindeep_ingredients1.json')
c2 = load_crawl_output('ewg_skindeep_ingredients2.json')
c3 = load_crawl_output('ewg_skindeep_ingredients3.json')

# Filled in by the collection cell; the duplicate counters stay None unless
# that cell actually parses the crawl output.
products = {}
ingredients = {}
dup_prods = None
dup_ing = None

In [3]:
# Collect crawled data
# Three crawler instances were run together asynchronously, so their outputs
# overlap: concatenate the results and drop duplicate ids.

def dedupe_by_id(records, id_key):
    """Index ``records`` by ``record[id_key]``, keeping the first occurrence.

    Records that do not contain ``id_key`` are ignored.

    Returns a ``(by_id, n_duplicates)`` tuple, where ``n_duplicates`` counts
    the records whose id had already been seen.
    """
    by_id = {}
    n_duplicates = 0
    for record in records:
        if id_key not in record:
            continue
        # Membership is tested on the dict itself: under Python 2,
        # `in by_id.keys()` would build and scan a list for every record.
        if record[id_key] in by_id:
            n_duplicates += 1
        else:
            by_id[record[id_key]] = record
    return by_id, n_duplicates


# Only parse ingredients and products if this step hasn't already been done;
# otherwise reuse the cached JSON files.
# NOTE(review): JSON object keys are always strings, so if the crawled ids are
# not strings the cached dicts are keyed differently from freshly parsed
# ones -- confirm the id type.
if not Path(product_file).is_file():
    products, dup_prods = dedupe_by_id(c1 + c2 + c3, 'product_id')
    with open(product_file, 'w') as f:
        json.dump(products, f)
else:
    with open(product_file) as f:
        products = json.load(f)

if not Path(ingredient_file).is_file():
    ingredients, dup_ing = dedupe_by_id(c1 + c2 + c3, 'ingredient_id')
    with open(ingredient_file, 'w') as f:
        json.dump(ingredients, f)
else:
    with open(ingredient_file) as f:
        ingredients = json.load(f)

print("Crawled data contains {} ingredients in {} unique products".format(len(ingredients), len(products)))
# The duplicate counters are None when the data came from the cached files.
if dup_ing is not None:
    print("Duplicate Ingredients: {}".format(dup_ing))
if dup_prods is not None:
    print("Duplicate Products: {}".format(dup_prods))

Crawled data contains 8927 ingredients in 72309 unique products
Duplicate Ingredients: 8773
Duplicate Products: 15418


In [4]:
# Count the products that carry an ingredient list and the total number of
# ingredients listed across them.
count = 0
num_ingredients = 0
for v in products.values():
    if "ingredient_list" in v:
        count += 1
        num_ingredients += len(v["ingredient_list"])

# The average is undefined when no product has an ingredient list (e.g. when
# `products` is empty); report NaN instead of raising ZeroDivisionError.
avg_ingredients = 1.0 * num_ingredients / count if count else float('nan')
print("{} out of {} product entries have ingredient lists. The average number of ingredients is {}".format(count, len(products), avg_ingredients))

72304 out of 72309 product entries have ingredient lists. The average number of ingredients is 21.1214040717


In [8]:
# Cross-check every product's ingredient list against the ingredients
# dictionary. For each listed key that is not in `ingredients`, record the key
# in `error_keys` and the product's url in `error_prod` (parallel lists).
error_keys = []
error_prod = []
for product_id in products:
    entry = products[product_id]
    if 'ingredient_list' not in entry:
        continue
    unknown = [ing for ing in entry['ingredient_list'] if ing not in ingredients]
    if unknown:
        error_keys.extend(unknown)
        error_prod.extend([entry['url']] * len(unknown))

In [10]:
# Report how many listed ingredient keys were missing from `ingredients`.
missing_key_count = len(error_keys)
print("Number of keys not in ingredients dict: {}".format(missing_key_count))


Number of keys not in ingredients dict: 0


In [74]:
# Check ingredient data availability
# metric_keys: every key that appears on at least one ingredient record.
# metric_dict: key -> number of ingredients whose value for that key is not None.
# A single pass over the records; this also works when `ingredients` is empty
# (both results are then empty) rather than indexing into the first record.
metric_keys = []
metric_dict = {}
seen_keys = set()
for ingredient in ingredients.values():
    for key, value in ingredient.items():
        if key not in seen_keys:
            seen_keys.add(key)
            metric_keys.append(key)
        if value is not None:
            metric_dict[key] = metric_dict.get(key, 0) + 1


In [75]:
# For each key counted in `metric_dict`, print the share of ingredients
# that have a non-None value for it.
print("Percent of ingredients with given key:\n")
for metric in metric_dict:
    share = 100.0 * metric_dict[metric] / len(ingredients)
    print("{}: {:.2f}%".format(metric, share))

Percent of ingredients with given key:

use_restrict_score: 99.99%
cancer_score: 99.99%
allergy_imm_tox_score: 99.99%
function_list: 68.06%
ingredient_score: 99.99%
url: 100.00%
overall_hazard_score: 99.99%
ingredient_name: 99.99%
ingredient_id: 100.00%
data_availability: 99.99%
synonym_list: 64.36%
dev_reprod_tox_score: 99.99%


In [81]:
# Check product data availability
# metric_keys: every key that appears on at least one product record.
# metric_dict: key -> number of products whose value for that key is not None.
# A single pass over the records; this also works when `products` is empty
# (both results are then empty) rather than indexing into the first record.
metric_keys = []
metric_dict = {}
seen_keys = set()
for product in products.values():
    for key, value in product.items():
        if key not in seen_keys:
            seen_keys.add(key)
            metric_keys.append(key)
        if value is not None:
            metric_dict[key] = metric_dict.get(key, 0) + 1

        

In [82]:
# For each key counted in `metric_dict`, print the share of products
# that have a non-None value for it.
print("Percent of products with given key:\n")
for metric in metric_dict:
    share = 100.0 * metric_dict[metric] / len(products)
    print("{}: {:.2f}%".format(metric, share))

Percent of products with given key:

use_restrict_score: 100.00%
cancer_score: 100.00%
product_type: 100.00%
product_id: 100.00%
ingredient_list: 99.99%
url: 100.00%
product_score: 98.36%
overall_hazard_score: 100.00%
data_availability: 100.00%
allergy_imm_tox_score: 100.00%
product_name: 100.00%
dev_reprod_tox_score: 100.00%
