In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.tree import export_graphviz
import graphviz 

In [2]:
recipe_df = pd.read_csv('../dataset/recipes.csv')
recipe_with_ingred = pd.read_csv('../dataset/recipe_with_ingredient.csv')

In [3]:
recipe_df.head()

Unnamed: 0,recipe_id,recipe_name,calories,carbohydrate,fat,protein
0,45430986,Gluten-Free Rolls,177,31.06,4.62,3.0
1,42818205,French Fries,203,39.29,3.74,4.75
2,42384638,Chicken Stir Fry,306,39.22,8.51,20.0
3,42288128,Black Bean Salsa,110,13.69,4.77,4.59
4,8629075,Taco Soup,192,18.09,6.88,12.7


In [4]:
recipe_with_ingred.head()

Unnamed: 0,recipe_id,33919,39699,61271,3419,15845548,75372,570035,1071192,41011,...,40723,1350,72158,49809,2200644,50123,40637,28033,38403,40592
0,45430986,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,42818205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42384638,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42288128,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8629075,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def mg_apply_func(row):
    fat, carb, protein = row['fat'], row['carbohydrate'], row['protein']
    total = fat + carb + protein
    if total <= 0:
        return 0
    f_percent, c_percent, p_percent = fat / total, carb / total, protein / total
    if p_percent >= 0.35:
        if f_percent <= 0.3 and f_percent >= 0.15:
            return 1
    return 0

In [6]:
recipe_df['fl_mg'] = recipe_df.apply(mg_apply_func, axis=1)

In [7]:
len(recipe_df[recipe_df['fl_mg'] == 1])

466

### Merge with feature vectors

In [8]:
res_recipe_df = pd.merge(recipe_with_ingred, recipe_df[['recipe_id', 'fl_mg']], on='recipe_id', how='inner')

In [9]:
len(res_recipe_df)

3365

In [10]:
res_recipe_df.head()

Unnamed: 0,recipe_id,33919,39699,61271,3419,15845548,75372,570035,1071192,41011,...,1350,72158,49809,2200644,50123,40637,28033,38403,40592,fl_mg
0,45430986,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,42818205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42384638,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42288128,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8629075,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
positive_fl_mg = res_recipe_df[res_recipe_df['fl_mg'] == 1]
negative_fl_mg = res_recipe_df[res_recipe_df['fl_mg'] == 0]

In [12]:
len(positive_fl_mg)

466

In [44]:
len(negative_fl_mg )

2899

In [13]:
# undersampling
negative_fl_mg_rs = negative_fl_mg.sample(n=len(positive_fl_mg), random_state=1)

In [14]:
positive_fl_mg.isnull().sum(axis = 1)

13      0
30      0
35      0
44      0
46      0
       ..
3358    0
3359    0
3361    0
3362    0
3363    0
Length: 466, dtype: int64

In [15]:
final_df = pd.concat([positive_fl_mg, negative_fl_mg_rs], axis= 0)

In [16]:
len(final_df)

932

### Training

In [17]:
final_df.shape[1]

3804

In [18]:
X, y = final_df.iloc[:, 0:final_df.shape[1] - 1], final_df.iloc[:, final_df.shape[1] - 1]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
logistic = LogisticRegression()
logistic.fit(X_train,y_train) 

tree_model = DecisionTreeClassifier(criterion='entropy')
tree_model.fit(X_train, y_train )

svc_model = SVC()
svc_model.fit(X_train, y_train )

SVC()

In [22]:
LogisticPredict = logistic.predict(X_test)
TreePredict = tree_model.predict(X_test)
SvcPredict = svc_model.predict(X_test)

In [23]:
print(f'The accuracy of logistic regression is: {accuracy_score(y_test, LogisticPredict)}')
print(f'The accuracy of DT is: {accuracy_score(y_test, TreePredict)}')
print(f'The accuracy of SVC is: {accuracy_score(y_test, SvcPredict)}')

The accuracy of logistic regression is: 0.5347593582887701
The accuracy of DT is: 0.6042780748663101
The accuracy of SVC is: 0.47593582887700536


In [24]:
print(f'weights: {logistic.coef_}, bias_term: {logistic.intercept_}')

weights: [[-1.25370349e-08  0.00000000e+00  2.63984667e-16 ...  0.00000000e+00
  -5.19641016e-16 -1.03918650e-15]], bias_term: [5.79006634e-15]


In [34]:
a = list(logistic.coef_[0])

In [36]:
a.sort()

In [43]:
tree_model.feature_importances_

array([0.1399196 , 0.        , 0.        , ..., 0.        , 0.        ,
       0.00329464])

In [52]:
b = list(tree_model.feature_importances_)
b.sort()