# One hot encode recipe ingredients

In [None]:
import ast

import numpy as np
import pandas as pd

## Load recipes and drop unimportant columns

In [None]:
# Load the recipes CSV and discard the columns this notebook treats as
# unimportant; the remaining columns are previewed below.
unused_columns = ["calorie_level", "name_tokens", "i", "id"]
recipes = pd.read_csv("foodDataset/PP_recipes.csv").drop(columns=unused_columns)
recipes.head()

Unnamed: 0,ingredient_tokens,steps_tokens,techniques,ingredient_ids
0,"[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[389, 7655, 6270, 1527, 3406]"
1,"[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,"[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,"[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,"[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[3484, 6324, 7594, 243]"


## Convert to numpy

In [None]:
# Expose the ingredient-id column as a NumPy array. Each entry is still the
# textual form of a list (see the printed output).
# NOTE(review): to_numpy() does not promise a copy, so this array may be a view
# onto the column's data - confirm before mutating it in place.
ingredient_column = recipes["ingredient_ids"]
ingredient_np = ingredient_column.to_numpy()
print(ingredient_np)

['[389, 7655, 6270, 1527, 3406]'
 '[2683, 4969, 800, 5298, 840, 2499, 6632, 7022, 1511, 3248, 4964, 6270]'
 '[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696, 7946, 5648, 7239, 7705, 7594, 1168, 2683]'
 ...
 '[2378, 7655, 3219, 2320, 5168, 5319, 4189, 2683, 2499, 6363, 840, 6711, 1168, 5180]'
 '[5627, 2807, 5412, 3399, 7979, 1093, 1257, 7803, 6696, 800, 1833, 3512]'
 '[6473, 800, 4807, 2683, 335, 1563, 1511, 3248, 2499]']


## Convert array entries to values instead of strings

In [None]:
def _parse_id_list(raw):
    """Return ``raw`` as a NumPy array, parsing it first when it is a string.

    Strings are parsed with ``ast.literal_eval``, which only accepts Python
    literals, so text read from the CSV cannot execute code the way ``eval``
    could. Values that are not strings go straight through ``np.asarray``,
    which keeps this cell safe to run more than once.
    """
    if isinstance(raw, str):
        return np.array(ast.literal_eval(raw))
    return np.asarray(raw)


# Fill a fresh object array instead of writing into the array returned by
# to_numpy(): that array may be a view or read-only, so in-place writes are
# not a dependable way to update it (or the DataFrame behind it).
parsed_ids = np.empty(len(ingredient_np), dtype=object)
for pos, raw in enumerate(ingredient_np):
    parsed_ids[pos] = _parse_id_list(raw)
ingredient_np = parsed_ids

# The one-hot cell further down iterates recipes["ingredient_ids"], so store
# the parsed arrays in the column explicitly rather than relying on aliasing.
recipes["ingredient_ids"] = pd.Series(ingredient_np, index=recipes.index)

print(ingredient_np)  # 1-D object array; each entry is an array of ingredient ids

[array([ 389, 7655, 6270, 1527, 3406])
 array([2683, 4969,  800, 5298,  840, 2499, 6632, 7022, 1511, 3248, 4964,
        6270])
 array([1257, 7655, 6270,  590, 5024, 1119, 4883, 6696, 7946, 5648, 7239,
        7705, 7594, 1168, 2683])
 ...
 array([2378, 7655, 3219, 2320, 5168, 5319, 4189, 2683, 2499, 6363,  840,
        6711, 1168, 5180])
 array([5627, 2807, 5412, 3399, 7979, 1093, 1257, 7803, 6696,  800, 1833,
        3512])
 array([6473,  800, 4807, 2683,  335, 1563, 1511, 3248, 2499])]


In [None]:
## Expand ingredient lists to their own columns to get one hot encoding

In [None]:
# Build one dict per recipe that maps "ig_<ingredient id>" to 1 for every
# ingredient the recipe contains. The dicts become DataFrame rows later on.
reducedRecipes = []
ingrIds = recipes["ingredient_ids"]
for value in ingrIds:
    # If the column still holds the raw text form (e.g. the parsing cell was
    # skipped or worked on a copy), iterating it would yield single characters
    # and create bogus columns such as "ig_[". Parse it here in that case.
    if isinstance(value, str):
        value = ast.literal_eval(value)
    reducedRecipes.append({"ig_" + str(x): 1 for x in value})


print(reducedRecipes[0])

{'ig_389': 1, 'ig_7655': 1, 'ig_6270': 1, 'ig_1527': 1, 'ig_3406': 1}


In [None]:
## Convert to dataframe and fill NA with false. This part takes at least 5-10 mins.

In [None]:
# Build the one-hot table by filling a boolean matrix directly. Constructing a
# DataFrame from the sparse dicts first would create a frame that is mostly
# NaN (which int8 cannot represent) and then need fillna + astype passes over
# every column.

# Columns appear in order of first occurrence, which is also the order
# pd.DataFrame(list_of_dicts) assigns them.
ingredient_columns = list(dict.fromkeys(key for row in reducedRecipes for key in row))
column_position = {name: pos for pos, name in enumerate(ingredient_columns)}

# Start with "ingredient absent" everywhere, then mark the ones present.
one_hot = np.zeros((len(reducedRecipes), len(ingredient_columns)), dtype=bool)
for row_pos, row in enumerate(reducedRecipes):
    for key in row:
        one_hot[row_pos, column_position[key]] = True

# Keep the nullable "boolean" dtype that the rest of the notebook produces.
reducedDF = pd.DataFrame(one_hot, columns=ingredient_columns).astype("boolean")
print(reducedDF.head())

#reducedDF = pd.read_feather("ingredientOneHot.fth")
#print(reducedDF.head())

   ig_389  ig_7655  ig_6270  ig_1527  ig_3406  ig_2683  ig_4969  ig_800  \
0    True     True     True     True     True    False    False   False   
1   False    False     True    False    False     True     True    True   
2   False     True     True    False    False     True    False   False   
3   False    False    False    False    False    False    False   False   
4   False    False    False    False    False    False    False   False   

   ig_5298  ig_840  ...  ig_6145  ig_2522  ig_7605  ig_6195  ig_1681  ig_750  \
0    False   False  ...    False    False    False    False    False   False   
1     True    True  ...    False    False    False    False    False   False   
2    False   False  ...    False    False    False    False    False   False   
3    False   False  ...    False    False    False    False    False   False   
4    False   False  ...    False    False    False    False    False   False   

   ig_5474  ig_3845  ig_7980  ig_7069  
0    False    False    False

In [None]:
## Save to feather format (small and fast). Use pd.read_feather("filename") to load it again; requires pyarrow.

In [None]:
!pip install pyarrow
reducedDF.to_feather("ingredientOneHot.fth")

