# Build Classification Models

We will use the dataset saved in the previous lesson, which contains balanced, clean data about cuisines.

We will use this dataset with a variety of classifiers to predict a given national cuisine based on a group of ingredients

In [2]:
import pandas as pd
# Read the cleaned cuisines CSV from the working directory and preview the first rows
cuisines_df = pd.read_csv(filepath_or_buffer="./cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

In [4]:
# Separate the target from the features: the 'cuisine' column holds the class
# label for each row, so it becomes the labels (y) used for training.

cuisines_label_df = cuisines_df.loc[:, 'cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [5]:
# Everything except the 'Unnamed: 0' column and the 'cuisine' label is kept as
# the trainable feature set (X). drop() returns a new frame; cuisines_df is untouched.

cuisines_feature_df = cuisines_df.drop(columns=['Unnamed: 0', 'cuisine'])
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Split the data

In [6]:
# Split the data into training (70%) and testing (30%) groups.
# random_state pins the shuffle so the split -- and every score computed from it
# further down -- is the same on each "Restart Kernel -> Run All".

X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.3,
    random_state=0,
)

# Apply logistic regression

Since this is a multiclass problem, we need to choose which multiclass scheme to use and which solver to set.
Use LogisticRegression with the one-vs-rest (`ovr`) multiclass scheme and the `liblinear` solver to train.

In [9]:
# Fit a one-vs-rest logistic regression with the liblinear solver on the training
# split, then print its accuracy on the held-out test split.
# NOTE(review): recent scikit-learn releases deprecate the multi_class argument --
# confirm against the installed version before upgrading.

lr = LogisticRegression(solver='liblinear', multi_class='ovr')
model = lr.fit(X_train, np.ravel(y_train))

accuracy = model.score(X_test, y_test)
print(f"Accuracy is {accuracy}")

Accuracy is 0.8148457047539617


In [19]:
# See the model's input in action by inspecting one test row (positional index 100):
# list the ingredients present in that row, then show its true cuisine label.
# The values and the non-zero mask must come from the SAME row, so the row is
# fetched once and reused.
sample_row = X_test.iloc[100]
print(f'ingredients: {sample_row[sample_row != 0].keys()}')
print(f'cuisine: {y_test.iloc[100]}')

ingredients: Index(['chicken', 'egg', 'fish', 'potato', 'seaweed', 'starch',
       'vegetable_oil', 'wasabi'],
      dtype='object')
cuisine: japanese


In [18]:
# Check how confident the model is about its prediction for test row 100

# Select the row with a list indexer so it stays a one-row DataFrame. The model
# was fitted on a DataFrame, so keeping the column names avoids scikit-learn's
# "X does not have valid feature names" warning that a bare NumPy array triggers.
test = X_test.iloc[[100]]
# Predict the per-class probabilities for that single row
proba = model.predict_proba(test)
classes = model.classes_
# Build a one-row frame whose columns are the class labels
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so each class is a row, then rank by probability (column 0), highest first
topPrediction = resultdf.T.sort_values(by=[0], ascending=[False])
topPrediction.head()



Unnamed: 0,0
japanese,0.924206
korean,0.060194
chinese,0.012186
thai,0.002992
indian,0.000421


In [21]:
# Go beyond a single accuracy number: predict every test row and print the
# per-class classification report for those predictions.

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     chinese       0.74      0.74      0.74       235
      indian       0.94      0.89      0.91       257
    japanese       0.76      0.77      0.77       241
      korean       0.88      0.81      0.84       242
        thai       0.76      0.86      0.81       224

    accuracy                           0.81      1199
   macro avg       0.82      0.81      0.81      1199
weighted avg       0.82      0.81      0.82      1199

