In [26]:
import os
import tqdm
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

In [27]:
data = [os.path.join('meta', x) for x in os.listdir('meta')]

In [72]:
grid = []
nutrition_desc = dict()

for d in tqdm.tqdm(data):
    row = dict()

    with open(d) as file:
        obj = json.load(file)
        row['src'] = obj['source'].get('sourceRecipeUrl', None)
        row['ingredients'] = ', '.join(obj['ingredientLines'])
        
        nutrition = obj['nutritionEstimates']
        for n in nutrition:
            nutrition_desc[n['attribute']] = {
                'unit': n['unit']['plural'],
                'description': n['description'],
            }
            row[n['attribute']] = n['value']
    
    grid.append(row)

100%|██████████| 27638/27638 [00:08<00:00, 3233.21it/s]


In [28]:
df = pd.DataFrame(data=grid)
df.to_csv('data.csv', index=False)

In [73]:
df = pd.DataFrame(data=grid)
df = df[df['SUGAR'].notna()]
df.rename(columns = {'SUGAR':'sugar'}, inplace = True)
df = df[['src', 'ingredients', 'sugar']]
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,src,ingredients,sugar
0,http://www.myrecipes.com/recipe/stuffed-cherry...,"2 pints cherry tomatoes, 1 avocado, peeled and...",3.49
1,http://paleomg.com/simple-baked-halibut-with-b...,1 pound halibut (cut into 2 eight ounce pieces...,0.17
2,http://allrecipes.com/Recipe/honey-whole-wheat...,"1 cup water (70 degrees to 80 degrees), 1/4 cu...",2.15
3,http://eatwithmarco.com/en/pasta-alla-norma/,"3 melanzane / 3 eggplants, 1 spicchio d’aglio ...",3.78
4,http://www.myrecipes.com/recipe/plum-upside-do...,"2 teaspoons butter, 6 large red plums, pitted ...",38.17


In [77]:
df = df[df['sugar'] < 100]
label = df['sugar']
label = np.array(label)
mean = label.mean()
std = label.std()
mean, std

(10.85810766787474, 13.3627190349059)

In [78]:
label = (label - mean) / std
label.mean(), label.std()

(-2.8934427485446114e-17, 0.9999999999999999)

In [79]:
df['sugar'] = label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sugar'] = label


In [83]:
df.head()

Unnamed: 0,src,ingredients,sugar
0,http://www.myrecipes.com/recipe/stuffed-cherry...,"2 pints cherry tomatoes, 1 avocado, peeled and...",-0.551393
1,http://paleomg.com/simple-baked-halibut-with-b...,1 pound halibut (cut into 2 eight ounce pieces...,-0.799845
2,http://allrecipes.com/Recipe/honey-whole-wheat...,"1 cup water (70 degrees to 80 degrees), 1/4 cu...",-0.651672
3,http://eatwithmarco.com/en/pasta-alla-norma/,"3 melanzane / 3 eggplants, 1 spicchio d’aglio ...",-0.529691
4,http://www.myrecipes.com/recipe/plum-upside-do...,"2 teaspoons butter, 6 large red plums, pitted ...",2.043887


In [84]:
df.to_csv('sugar.csv', index=False)

In [101]:
from huggingface_hub import notebook_login

notebook_login()#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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [93]:
dataset = load_dataset("csv", data_files="sugar.csv")

Found cached dataset csv (/Users/ziqin/.cache/huggingface/datasets/csv/default-11981611f38d66e1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [94]:
dataset = dataset['train'].shuffle(seed=42).train_test_split(test_size=0.08)

In [95]:
dataset

DatasetDict({
    train: Dataset({
        features: ['src', 'ingredients', 'sugar'],
        num_rows: 22592
    })
    test: Dataset({
        features: ['src', 'ingredients', 'sugar'],
        num_rows: 1965
    })
})

In [102]:
dataset.push_to_hub("ziq/ingredient_to_sugar_level")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/527 [00:00<?, ?B/s]