In [1]:
import pandas as pd
import numpy as np
from prince import MCA
from imblearn.over_sampling import SMOTEN
from collections import Counter


from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import graphviz 
import matplotlib.pyplot as plt

In [125]:
# Load the three CSV inputs used by the rest of the notebook (relative paths).
recipe_df = pd.read_csv('../dataset/recipes.csv')
recipe_with_ingred = pd.read_csv('../dataset/recipe_with_ingredient.csv')
food_df = pd.read_csv('../dataset/food.csv')

-------------

# find spices





In [126]:
def mg_apply_func(row):
    """Return the 0/1 ``fl_mg`` label for one recipe row.

    The row must expose ``fat``, ``carbohydrate``, ``protein`` and ``calories``.
    Fat is weighted by 9 and carbohydrate/protein by 4 (presumably grams turned
    into kcal -- confirm the units of the source columns), and each weighted
    value is divided by ``calories``.

    Returns 1 only when the protein share is at least 0.35 and the fat share
    lies in [0.15, 0.3]; returns 0 otherwise, including when ``calories`` is
    zero or negative.
    """
    fat_energy = row['fat'] * 9
    carb_energy = row['carbohydrate'] * 4
    protein_energy = row['protein'] * 4
    calories = row['calories']

    # Guard against division by zero / meaningless negative totals.
    if calories <= 0:
        return 0

    fat_share = fat_energy / calories
    carb_share = carb_energy / calories  # computed as in the original; not part of the rule
    protein_share = protein_energy / calories

    meets_rule = protein_share >= 0.35 and 0.15 <= fat_share <= 0.3
    return 1 if meets_rule else 0

In [127]:
# Label every recipe row with the 0/1 flag computed by mg_apply_func.
recipe_df['fl_mg'] = recipe_df.apply(mg_apply_func, axis=1)

### Merge with feature vectors

In [128]:
# Attach the fl_mg label to the ingredient table; the inner join keeps only
# recipe_ids present in both frames, and fl_mg ends up as the last column.
res_recipe_df = pd.merge(recipe_with_ingred, recipe_df[['recipe_id', 'fl_mg']], on='recipe_id', how='inner')

# feature selection 1

In [129]:
# Load the ingredients to exclude (presumably spices, going by the file name -- confirm).
# NOTE(review): unlike the other inputs this path has no '../dataset/' prefix -- confirm the file location.
ingred_spices_df = pd.read_csv('collect_spices_ingred_df.csv')
ingred_spices_df.shape

In [131]:
# Drop the columns named after the food_id values in ingred_spices_df; the ids
# are cast to str to match the column labels.
# NOTE(review): this reassigns res_recipe_df, so re-running the cell on its own
# will fail once the columns are gone.
res_recipe_df = res_recipe_df.drop(labels = [str(x) for x in ingred_spices_df.food_id.values], axis = 1)
res_recipe_df.shape

(3365, 3302)

In [132]:
# Keep only columns whose column sum divided by the row count is at least 0.0005.
# NOTE(review): the mask is computed over every column, including recipe_id and
# fl_mg -- confirm that is intended.
res_recipe_df = res_recipe_df.loc[:, res_recipe_df.sum(axis=0)/res_recipe_df.shape[0] >= 0.0005]
res_recipe_df.shape

(3365, 1339)

## UnderSampling

In [133]:
# Split the rows by label value.
positive_fl_mg = res_recipe_df[res_recipe_df['fl_mg'] == 1]
negative_fl_mg = res_recipe_df[res_recipe_df['fl_mg'] == 0]

In [134]:
# Undersampling: draw (seeded) as many negative rows as there are positive rows.
# round() has no effect here because len() already returns an int.
negative_fl_mg_rs = negative_fl_mg.sample(n= round(len(positive_fl_mg)), random_state=0)

In [135]:
# Balanced dataset: all positive rows followed by the sampled negative rows
# (original row index labels are kept).
final_df = pd.concat([positive_fl_mg, negative_fl_mg_rs], axis= 0)

In [136]:
# Features are every column except the last; the target is the last column
# (fl_mg, appended by the earlier merge).
# NOTE(review): X appears to still contain the recipe_id column -- confirm
# whether an identifier should be used as a feature.
X, y = final_df.iloc[:, 0:final_df.shape[1] - 1], final_df.iloc[:, final_df.shape[1] - 1]
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [12]:
# mca = MCA(n_components = 4, n_iter=10, random_state=25)
# mca.fit(X_train)
# X_train = mca.transform(X_train)
# X_test = mca.transform(X_test)

# print(f"Original class counts in training set: {Counter(y_train)}")
# print(f"Original class counts in test set: {Counter(y_test)}")

In [137]:
# Oversample the training split with SMOTEN (seeded).
# NOTE(review): a later cell re-splits X / y with train_test_split and reassigns
# X_train / y_train, so this resampled data is not what the models below are
# trained on -- confirm whether this step is still wanted.
sampler = SMOTEN(random_state=2)
X_train, y_train = sampler.fit_resample(X_train, y_train)

In [None]:
# print(f"Original class counts: {Counter(y_train)}")

# mca.eigenvalues_

### Feature Selection Using Tuned Random Forest

In [138]:
def printAccruacy(model_name, actual_test, predictions_test, actual_train=None, predictions_train=None):
    """Print a model's test-set accuracy and, when supplied, its training-set accuracy.

    Parameters
    ----------
    model_name : label interpolated into the printed messages.
    actual_test, predictions_test : true and predicted test labels, passed
        straight to ``accuracy_score``.
    actual_train, predictions_train : optional true and predicted training
        labels; the training line is printed only when both are not None.

    Returns None; the function only prints.
    """
    test_accuracy = accuracy_score(actual_test, predictions_test)
    print(f'The accuracy of {model_name} is: {test_accuracy}')

    # The training line needs both halves of the pair.
    if actual_train is None or predictions_train is None:
        return

    train_accuracy = accuracy_score(actual_train, predictions_train)
    print(f'The accuracy of {model_name} on training set is: {train_accuracy}')
    
def printReport(model_name, actual_test, predictions_test):
    """Print a header naming the model, a divider line, then the classification report.

    ``actual_test`` and ``predictions_test`` are passed unchanged to
    ``classification_report``. Returns None; the function only prints.
    """
    report_text = classification_report(actual_test, predictions_test)
    print(f'For {model_name}:', '-----------------------------------------------------', sep='\n')
    print(report_text)

In [139]:
# Print the column count of final_df. This counts every column, including the
# fl_mg label, so the number of usable features is smaller than the value shown.
print('# of features:',final_df.shape[1])

# of features: 1339


In [140]:
# Rebuild features (all columns but the last) and target (last column) from final_df.
X, y = final_df.iloc[:, 0:final_df.shape[1] - 1], final_df.iloc[:, final_df.shape[1] - 1]

In [141]:
# Fresh seeded 80/20 split; this replaces any X_train / y_train assigned by earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [113]:
# Candidate hyper-parameter values for the random-forest search; the final line
# displays the assembled grid.

# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(200, 1000, 5)]

# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]
# Add the default as a possible value
rf_max_depth.append(None)

# Number of features to consider at every split
rf_max_features = ['sqrt', 'log2']

# Criterion to split on
rf_criterion = ['gini', 'entropy']

# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap}
rf_grid

{'n_estimators': [200, 400, 600, 800, 1000],
 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, None],
 'max_features': ['sqrt', 'log2'],
 'criterion': ['gini', 'entropy'],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_impurity_decrease': [0.0, 0.05, 0.1],
 'bootstrap': [True, False]}

In [None]:
# Create the model to be tuned (no random_state is set on the forest itself)
rf_base = RandomForestClassifier()

# Create the random search Random Forest: 200 sampled settings, 5-fold CV,
# scored by F1, using all available cores
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, scoring='f1',
                               n_iter = 200, cv = 5, verbose = 2, random_state = 0, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

# View the best parameters from the random search
rf_random.best_params_

In [147]:
# Fit a random forest with default hyper-parameters; SelectFromModel wraps this
# fitted estimator in a later cell.
rf_feature_select = RandomForestClassifier() 
# The settings listed below are commented out and are NOT applied:
# n_estimators=400, 
# min_samples_split=10,
# min_impurity_decrease=0.05,
# max_features='sqrt',
# max_depth=20,
# criterion='entropy',
# bootstrap=True
rf_feature_select = rf_feature_select.fit(X_train, y_train)

In [143]:
# Wrap the already-fitted forest (prefit=True) and read the boolean mask of the
# columns SelectFromModel keeps.
model = SelectFromModel(rf_feature_select, prefit=True)
new_features_index = model.get_support()


In [144]:
# print number of features after selection (count of True entries in the mask)
num_new_features = sum([1 for i in new_features_index if i])
print('# of features:',num_new_features)

# of features: 309


In [117]:
# Look up the food names of the selected feature columns.
# NOTE(review): [1:] skips the first selected column -- presumably recipe_id;
# confirm that column is always among the selected ones.
# NOTE(review): column labels were matched as strings earlier (str(x) in the
# drop call); confirm food_id's dtype is compatible with this isin lookup.
food_df[food_df['food_id'].isin(X.loc[:, new_features_index].columns[1:])]

Unnamed: 0,food_id,food_name
1,39536,Honey
2,3092,Egg
6,36492,Russet Potatoes (Flesh and Skin)
11,4881229,Skinless Chicken Breast
12,36320,Carrots
...,...,...
3786,36237,Alfalfa Seeds (Sprouted)
3790,36668,"Waterchestnuts (Solids and Liquids, Canned)"
3794,35751,"Apricot Nectar (Without Added Ascorbic Acid, C..."
3795,37492,"Beef Knuckle (Tip Side, Steak, Lean Only, Trim..."


In [145]:
# Test vs. training accuracy of the forest used for feature selection.
printAccruacy('random forest', y_test, rf_feature_select.predict(X_test), y_train, rf_feature_select.predict(X_train))

The accuracy of random forest is: 0.7075471698113207
The accuracy of random forest on training set is: 1.0


In [148]:
# Refit a default random forest on only the selected columns and report accuracy.
# The split below is unseeded, so the rows in each split (and the scores) vary per run.
# NOTE(review): the column mask came from a forest fitted on an earlier split of
# these same rows, so this test split is not independent of the feature selection.
new_X = X.loc[:, new_features_index]
X_train, X_test, y_train, y_test = train_test_split(new_X , y, test_size=0.20)
rf_feature_select = RandomForestClassifier() 
rf_feature_select = rf_feature_select.fit(X_train, y_train)
printAccruacy('random forest', y_test, rf_feature_select.predict(X_test), y_train, rf_feature_select.predict(X_train))
X_train.shape

The accuracy of random forest is: 0.7264150943396226
The accuracy of random forest on training set is: 1.0


(420, 309)

In [146]:
# Keep only the columns selected by the random forest, then carve out
# train / validation / test sets. Both splits are seeded so a
# Restart & Run All reproduces the same partitions.
new_X = X.loc[:, new_features_index]
X_train, X_test, y_train, y_test = train_test_split(new_X , y, test_size=0.20, random_state=1)
# The cost-complexity pruning cells further down score candidate trees on
# X_val / y_val, so the validation split must exist; without it those cells
# raise NameError on a fresh kernel.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=1)

In [None]:
# feature_importance = RFECV(SVC(kernel="linear"), scoring='accuracy')
# feature_importance.fit(X_train, y_train)

------------------

### Training - Decision Tree

In [None]:
# Compute the cost-complexity pruning path on the training data, then fit one
# seeded decision tree per candidate alpha on that path.
clf = DecisionTreeClassifier()
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
clfs = []

for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

In [None]:
# Plot the depth of each fitted tree against its pruning alpha.
tree_depths = [clf.tree_.max_depth for clf in clfs]
plt.figure(figsize=(10,  6))
plt.plot(ccp_alphas, tree_depths)
plt.xlabel("effective alpha")
plt.ylabel("total depth")

In [None]:
# Score every pruned tree on the validation data and plot accuracy against alpha.
# X_val / y_val must come from a validation split made before this cell runs.
acc_scores = [accuracy_score(y_val, clf.predict(X_val)) for clf in clfs]

# tree_depths is recomputed here but is not used by the plot below.
tree_depths = [clf.tree_.max_depth for clf in clfs]
plt.figure(figsize=(10,  6))
plt.grid()
plt.plot(ccp_alphas, acc_scores)
plt.xlabel("effective alpha")
plt.ylabel("Accuracy scores")

In [None]:
def getMaxInd(lst):
    """Return the position of the largest value in ``lst``.

    When several entries share the maximum, the position of the LAST one is
    returned (the comparison is ``>=``). An empty input yields 0.

    The running maximum starts at negative infinity rather than -1, so inputs
    whose values are all below -1 are handled correctly instead of always
    reporting position 0.
    """
    best_index = 0
    best_value = float('-inf')
    for index, value in enumerate(lst):
        # '>=' (not '>') so that later ties replace earlier ones.
        if value >= best_value:
            best_index, best_value = index, value
    return best_index


In [None]:
# Display the alpha whose tree scored highest in acc_scores.
ccp_alphas[getMaxInd(acc_scores)]

In [None]:
# Candidate hyper-parameter values for the decision-tree search.

# Maximum number of levels in tree (5 through 49)
dt_max_depth = [int(x) for x in range(5,50)]
# Add the default as a possible value
dt_max_depth.append(None)

# Number of features to consider at every split
dt_max_features = ['sqrt', 'log2']

# Criterion to split on
dt_criterion = ['gini', 'entropy']

# Minimum number of samples required to split a node
dt_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
dt_min_impurity_decrease = [0.0, 0.05, 0.1]

# Create the grid
dt_grid = {'max_depth': dt_max_depth,
               'max_features': dt_max_features,
               'criterion': dt_criterion,
               'min_samples_split': dt_min_samples_split,
               'min_impurity_decrease': dt_min_impurity_decrease}
# dt_grid

In [None]:
# Create the model to be tuned
dt_base = DecisionTreeClassifier()

# Create the exhaustive grid search over the decision tree (despite the
# variable name, this is GridSearchCV, not a randomized search)
dt_random = GridSearchCV(estimator = dt_base, param_grid = dt_grid, scoring='f1',
                              cv = 5, verbose = 2, n_jobs = -1)

# Fit the grid search model
dt_random.fit(X_train, y_train)

# View the best parameters from the grid search
dt_random.best_params_

In [None]:
# Decision tree with hand-entered hyper-parameters (presumably copied from the
# grid-search result above -- confirm they are still current).
dt_model = DecisionTreeClassifier(min_samples_split=10,
                                  min_impurity_decrease=0,
                                  max_features='log2',
                                  max_depth=42,
                                  criterion='entropy'
                                 )
dt_model.fit(X_train, y_train)

In [None]:
# Report test and training accuracy through the shared helper so every model in
# the notebook is reported the same way; the printed lines are unchanged.
printAccruacy('DT', y_test, dt_model.predict(X_test), y_train, dt_model.predict(X_train))

In [None]:
# Per-class precision / recall / F1 of the tuned decision tree on the test split.
printReport('decision tree', y_test, dt_model.predict(X_test))


In [None]:
# Sum of the test labels equal to 1, i.e. the number of positive test rows.
y_test[y_test == 1].sum()

In [None]:
# Refit a tree with the alpha that scored highest in acc_scores (ties resolve to
# the last such alpha, per getMaxInd) and predict the test split.
# No random_state is set on this tree.
tree_model = DecisionTreeClassifier(ccp_alpha=ccp_alphas[getMaxInd(acc_scores)])
tree_model.fit(X_train, y_train)
TreePredict = tree_model.predict(X_test)

In [None]:
# Report test and training accuracy of the pruned tree through the shared helper
# so every model is reported the same way; the printed lines are unchanged.
printAccruacy('DT', y_test, TreePredict, y_train, tree_model.predict(X_train))

### SVM and Tuning

In [None]:
# 50 evenly spaced candidates for C in [0.01, 10]
Cs = np.linspace(0.01, 10, 50)

# 50 evenly spaced candidates for gamma in [0.01, 1]
gammas = np.linspace(0.01, 1, 50)
# np.linspace(0.01, 0.02, 30)

# 50 x 50 = 2,500 (C, gamma) combinations
svc_grid = {'C': Cs, 'gamma' : gammas}

In [None]:
# Exhaustive 5-fold grid search over svc_grid for an RBF-kernel SVC, scored by
# F1, using all available cores; the last line displays the best combination.
svc_grid_search = GridSearchCV(SVC(kernel='rbf'), svc_grid, cv=5, n_jobs = -1, verbose = 2, scoring='f1')
svc_grid_search.fit(X_train, y_train)
svc_grid_search.best_params_

In [None]:
# SVC with hand-entered C and gamma (presumably taken from the grid search
# above -- confirm). No kernel is passed, so SVC's default kernel applies.
svc_model = SVC(C=5.1, gamma=0.01)
svc_model.fit(X_train, y_train)
SvcPredict = svc_model.predict(X_test)
printAccruacy('SVC model', y_test, SvcPredict, y_train, svc_model.predict(X_train))


### Training

In [None]:
# Fit baseline classifiers on the current training split. All use default
# settings except the tree, which uses the entropy criterion.
# Note: tree_model and svc_model reuse names assigned in earlier cells.
logistic = LogisticRegression()
logistic.fit(X_train,y_train) 

tree_model = DecisionTreeClassifier(criterion='entropy')
tree_model.fit(X_train, y_train )

rf = RandomForestClassifier()
rf.fit(X_train, y_train )

svc_model = SVC()
svc_model.fit(X_train, y_train)



In [None]:
# Multi-layer perceptron capped at 300 iterations; no random_state is set.
mlp = MLPClassifier(max_iter=300)
mlp.fit(X_train, y_train)

In [None]:
# Test-split predictions from each fitted baseline model.
LogisticPredict = logistic.predict(X_test)
TreePredict = tree_model.predict(X_test)
SvcPredict = svc_model.predict(X_test)
mlpPredict = mlp.predict(X_test)
rfPredict = rf.predict(X_test)

In [None]:
# Test-split accuracy of each baseline model.
print(f'The accuracy of logistic regression is: {accuracy_score(y_test, LogisticPredict)}')
print(f'The accuracy of DT is: {accuracy_score(y_test, TreePredict)}')
print(f'The accuracy of SVC is: {accuracy_score(y_test, SvcPredict)}')
print(f'The accuracy of MLP is: {accuracy_score(y_test, mlpPredict)}')
print(f'The accuracy of RF is: {accuracy_score(y_test, rfPredict)}')

In [None]:
# Per-class precision / recall / F1 of the baseline decision tree on the test split.
printReport('decision tree', y_test, TreePredict)


In [None]:
# Show the fitted logistic-regression coefficients and intercept.
print(f'weights: {logistic.coef_}, bias_term: {logistic.intercept_}')

In [None]:
# Per-feature importances of the fitted tree and forest; entry i corresponds to
# column i of the frame the models were trained on.
tree_f_importance = tree_model.feature_importances_
rf_f_importance = rf.feature_importances_

In [None]:
def mapWithIndex(lst):
    """Pair every element of ``lst`` with its position.

    Returns a new list of ``(position, element)`` tuples in the original order;
    an empty input gives an empty list.
    """
    return [(position, lst[position]) for position in range(len(lst))]

def filterNonImportant(lst, threshold):
    """Keep the pairs whose second element is at least ``threshold``.

    ``lst`` is an iterable of indexable pairs (such as the output of
    ``mapWithIndex``). Returns a new list preserving the input order.
    """
    return [pair for pair in lst if pair[1] >= threshold]

def getIndex(lst):
    """Return the first element of every pair in ``lst``, in order."""
    return [pair[0] for pair in lst]

In [None]:
# Replace the raw importance arrays with (position, importance) pairs.
# Note: the names are reassigned in place, so re-running this cell on its own
# would wrap the existing pairs a second time.
tree_f_importance = mapWithIndex(tree_f_importance)
rf_f_importance = mapWithIndex(rf_f_importance)

In [None]:
# Drop features whose importance is below the per-model threshold
# (0.001 for the forest, 0.01 for the tree).
# NOTE(review): only the tree's list is used in the cells visible below --
# confirm whether the forest's filtered list is still needed.
rf_f_importance = filterNonImportant(rf_f_importance, 0.001)
tree_f_importance = filterNonImportant(tree_f_importance, 0.01)

In [None]:
# Number of features that passed the tree's importance threshold.
len(tree_f_importance)

In [None]:
# Order the surviving pairs by feature position (first tuple element), then keep
# just the positions.
tree_f_importance.sort(key=lambda x : x[0])
feature_index = getIndex(tree_f_importance)

In [None]:
# feature_index holds column positions within the frame the decision tree was
# trained on, i.e. X restricted to new_features_index -- not positions within
# the full X. Indexing the full X by these positions would pick unrelated
# columns, so apply the random-forest mask first and then take the positions.
# Rebuilding from X keeps the cell idempotent.
new_X = X.loc[:, new_features_index].iloc[:, feature_index]
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.2)

In [None]:
# Refit the same set of classifiers on the reduced feature set. These reuse the
# names from the earlier training cells, so the earlier fitted models are replaced.
logistic = LogisticRegression()
logistic.fit(X_train,y_train) 

tree_model = DecisionTreeClassifier(criterion='entropy')
tree_model.fit(X_train, y_train )

rf = RandomForestClassifier()
rf.fit(X_train, y_train )

svc_model = SVC()
svc_model.fit(X_train, y_train)

# Unlike the earlier MLP, this one is seeded and uses the logistic activation.
mlp = MLPClassifier(random_state=1, max_iter=300, activation='logistic')
mlp.fit(X_train, y_train)

In [None]:
# Test-split predictions from each model refitted on the reduced feature set.
LogisticPredict = logistic.predict(X_test)
TreePredict = tree_model.predict(X_test)
SvcPredict = svc_model.predict(X_test)
mlpPredict = mlp.predict(X_test)
rfPredict = rf.predict(X_test)

In [None]:
# Test-split accuracy of each model refitted on the reduced feature set.
print(f'The accuracy of logistic regression is: {accuracy_score(y_test, LogisticPredict)}')
print(f'The accuracy of DT is: {accuracy_score(y_test, TreePredict)}')
print(f'The accuracy of SVC is: {accuracy_score(y_test, SvcPredict)}')
print(f'The accuracy of MLP is: {accuracy_score(y_test, mlpPredict)}')
print(f'The accuracy of RF is: {accuracy_score(y_test, rfPredict)}')