In [2]:
import torch
from torch import nn
from torch.optim import Adam
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from PIL import Image
import torchvision.models as models
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn.functional as F
from datetime import datetime
import timm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [3]:
# Location of the cafe-1 dish metadata CSV read by read_and_parse_file.
# The path is relative, so it is resolved against the current working directory.
metadata_cafe1 = '../nutrition5k_dataset/metadata/dish_metadata_cafe1.csv'

In [None]:
def parse_file(data):
    """Turn the raw text of a dish-metadata CSV into flat dish/ingredient records.

    Every non-blank line is split on commas. The first six fields describe
    the dish (id, calories, mass, fat, carbs, protein); the remaining fields
    are consumed in groups of seven, one group per ingredient (id, name,
    mass, calories, fat, carbs, protein). Trailing fields that do not fill a
    whole group of seven are ignored.

    Two progress lines are printed: the number of newline-separated pieces
    in ``data`` (blank ones included) and the number of dishes parsed.

    Args:
        data: Full contents of the metadata file as a single string.

    Returns:
        A ``(df_data, dishes)`` tuple. ``df_data`` is a list of dicts, one per
        (dish, ingredient) pair, holding the dish-level fields followed by the
        ingredient-level fields. ``dishes`` holds the dish id of every parsed
        line, including dishes that have no ingredient groups.

    Raises:
        IndexError: If a non-blank line has fewer than six fields.
        ValueError: If a numeric field cannot be converted to ``float``.
    """
    dish_width = 6        # leading fields that describe the dish itself
    ingredient_width = 7  # fields per ingredient group

    # Numeric columns, in the order they appear after the id (and name).
    dish_numeric_keys = (
        'total_calories', 'total_mass', 'total_fat',
        'total_carbs', 'total_protein',
    )
    ingredient_numeric_keys = (
        'ingredient_mass', 'ingredient_calories', 'ingredient_fat',
        'ingredient_carbs', 'ingredient_protein',
    )

    rows = data.split('\n')
    print("lines length: ", len(rows))

    dishes = []
    df_data = []
    for raw_row in rows:
        stripped = raw_row.strip()
        if not stripped:
            # Blank lines (e.g. the trailing newline) carry no dish.
            continue
        fields = stripped.split(',')

        # Dish-level columns shared by every ingredient row of this dish.
        dish_record = {'dish_id': fields[0]}
        for position, key in enumerate(dish_numeric_keys, start=1):
            dish_record[key] = float(fields[position])
        dishes.append(fields[0])

        group_count = (len(fields) - dish_width) // ingredient_width
        for group in range(group_count):
            start = dish_width + group * ingredient_width
            ingr_id, ingr_name, *amounts = fields[start:start + ingredient_width]

            # Copy the dish columns first so they precede the ingredient ones.
            record = dict(dish_record)
            record['ingredient_id'] = ingr_id
            record['ingredient_name'] = ingr_name
            for key, amount in zip(ingredient_numeric_keys, amounts):
                record[key] = float(amount)
            df_data.append(record)

    print("total dishes: ", len(dishes))
    return df_data, dishes


def read_and_parse_file(file_path, encoding='utf-8'):
    """Read a dish-metadata CSV from disk and parse it with ``parse_file``.

    Args:
        file_path: Path of the metadata file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        Whatever ``parse_file`` returns for the file's contents: a
        ``(df_data, dishes)`` tuple.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return parse_file(file.read())


# Parse the cafe-1 metadata into flat (dish, ingredient) records plus the
# list of dish ids encountered in the file.
df_data, dishes = read_and_parse_file(metadata_cafe1)

# `dataset` has one row per (dish, ingredient) pair; `dishes_df` has one row
# per parsed dish line.
dataset = pd.DataFrame(df_data)
dishes_df = pd.DataFrame(dishes, columns=['dish_id'])

print("total shape", dataset.shape)
print("unique ingredient_ids", dataset['ingredient_id'].unique().shape)

# Sanity check: a dish without any ingredient group is listed in `dishes_df`
# but produces no row in `dataset`, so differing counts would reveal such dishes.
dish_ids_in_file = dishes_df['dish_id'].unique()
dish_ids_in_rows = dataset['dish_id'].unique()
print("unique dishes ids based only on ids from dataset: ",
      dish_ids_in_file.shape)
print("unique dishes ids based on all combination ingredient - dish ",
      dish_ids_in_rows.shape)

lines length:  4769
total dishes:  4768
total shape (27225, 13)
unique ingredient_ids (211,)
unique dishes ids based only on ids from dataset:  (4768,)
unique dishes ids based on all combination ingredient - dish  (4768,)


In [5]:
# Preview the first rows of the flattened (dish, ingredient) table.
dataset.head()

Unnamed: 0,dish_id,total_calories,total_mass,total_fat,total_carbs,total_protein,ingredient_id,ingredient_name,ingredient_mass,ingredient_calories,ingredient_fat,ingredient_carbs,ingredient_protein
0,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000508,soy sauce,3.398568,1.801241,0.020391,0.16653,0.275284
1,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000122,garlic,2.124105,3.164916,0.010621,0.700955,0.135943
2,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000026,white rice,8.49642,11.045346,0.025489,2.378998,0.229403
3,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000524,parsley,0.213397,0.07917,0.001707,0.013657,0.006189
4,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000094,onions,1.707173,0.682869,0.001707,0.153646,0.018779


In [10]:
# Summary statistics of how many rows each ingredient_id occurs in.
dataset.value_counts('ingredient_id').describe()

count     211.000000
mean      129.028436
std       208.086610
min         1.000000
25%        18.000000
50%        53.000000
75%       162.000000
max      1653.000000
Name: count, dtype: float64

We can see that some of the ingredients appear just once in the dataset.

For this project I will use only the top N ingredients ranked by total mass across all dishes (N = 75 below).

In [15]:
# Number of ingredients to keep, ranked by their total mass across all dishes.
TOP_N_INGREDIENTS = 75

# Ids of the TOP_N_INGREDIENTS ingredients with the largest summed mass.
top75_ingredients = (
    dataset.groupby('ingredient_id')['ingredient_mass']
    .sum()
    .sort_values(ascending=False)
    .head(TOP_N_INGREDIENTS)
    .index
)

# Keep only the rows that belong to those ingredients. The explicit copy makes
# this an independent frame, so adding or modifying columns on it later does
# not trigger pandas' SettingWithCopyWarning.
# NOTE(review): the total_* columns still describe the whole dish, including
# any ingredients that were filtered out here.
top75_ingredients_df = dataset[
    dataset['ingredient_id'].isin(top75_ingredients)
].copy()

top75_ingredients_df.head()

Unnamed: 0,dish_id,total_calories,total_mass,total_fat,total_carbs,total_protein,ingredient_id,ingredient_name,ingredient_mass,ingredient_calories,ingredient_fat,ingredient_carbs,ingredient_protein
2,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000026,white rice,8.49642,11.045346,0.025489,2.378998,0.229403
4,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000094,onions,1.707173,0.682869,0.001707,0.153646,0.018779
5,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000023,brown rice,68.0,75.48,0.612,15.64,1.768
7,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000054,apple,4.267931,2.219324,0.008536,0.59751,0.012804
8,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,ingr_0000000029,mixed greens,21.339657,5.975104,0.085359,1.131002,0.469472


In [17]:
# Summary statistics of how many rows each retained ingredient_id occurs in.
top75_ingredients_df.value_counts('ingredient_id').describe()

count      75.000000
mean      213.706667
std       218.622331
min        19.000000
25%        89.000000
50%       170.000000
75%       258.500000
max      1653.000000
Name: count, dtype: float64

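Each of the 75 retained ingredients now occurs about 214 times on average (and at least 19 times), which should give the model enough examples per ingredient during training.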

In [None]:
# Summary statistics of the dish-level totals in the filtered table.
# 'ingredient_id' holds strings, so describe() leaves it out of the numeric summary.
# NOTE(review): the table has one row per (dish, ingredient) pair, so each
# dish's totals are repeated once per retained ingredient and these statistics
# are weighted accordingly; deduplicate by dish_id first if per-dish
# statistics are intended.
top75_ingredients_df[['ingredient_id', 'total_mass', 'total_calories',
                      'total_fat', 'total_carbs', 'total_protein']].describe()

Unnamed: 0,total_mass,total_calories,total_fat,total_carbs,total_protein
count,16028.0,16028.0,16028.0,16028.0,16028.0
mean,266.208261,320.396939,16.102739,23.176464,24.023547
std,223.643283,298.253359,22.2493,29.08565,21.955824
min,5.0,1.15,0.0,0.0,0.072
25%,145.0,141.419998,5.737881,9.717452,5.944
50%,241.0,290.038407,13.122,18.466,19.712002
75%,357.0,436.992378,22.471354,31.491592,35.307003
max,7975.0,9485.81543,875.541016,844.568604,147.491821
