# Build More Classification Models

In [2]:
import pandas as pd
cuisines_df = pd.read_csv("./cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [4]:
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Try different classifiers

In [5]:
# import the needed libraries

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

In [6]:
# split your training and test data

X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

# Linear SVC classifier

Support-Vector clustering (SVC) is a child of the Support-Vector machines family of ML techniques.
In this method, we can choose a 'kernel' to decide how to cluster the labels.
The 'C' parameter refers to 'regularization' which regulates the influence of parameters.
The kernel can be one of several; here we set it to 'linear' to ensure that we leverage linear SVC.
Probability defaults to 'false'; here we set it to 'true' to gather probability estimates.
We set the random state to '0' to shuffle the data to get probabilities.

In [10]:
# start with a Linear SVC

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0)
    
}


In [11]:
# train your model using the Linear SVC and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 77.8% 
              precision    recall  f1-score   support

     chinese       0.66      0.72      0.68       239
      indian       0.87      0.88      0.87       234
    japanese       0.80      0.72      0.76       248
      korean       0.84      0.77      0.80       245
        thai       0.75      0.82      0.78       233

    accuracy                           0.78      1199
   macro avg       0.78      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199



# K-Neighbors classifier

K-Neighbors is part of the "neighbors" family of ML methods, which can be used for both supervised and unsupervised learning. In this method, a predefined number of points is created and data are gathered around these points such that generalized labels can be predicted for the data.

In [12]:
# Apply the K-Neighbors classifier
# The previous classifier was good, and worked well with the data, but maybe we can get better accuracy.
# Try a K-Neighbors classifier.

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    
}

In [13]:
# train your model using two classifiers  and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 77.8% 
              precision    recall  f1-score   support

     chinese       0.66      0.72      0.68       239
      indian       0.87      0.88      0.87       234
    japanese       0.80      0.72      0.76       248
      korean       0.84      0.77      0.80       245
        thai       0.75      0.82      0.78       233

    accuracy                           0.78      1199
   macro avg       0.78      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199

Accuracy (train) for KNN classifier: 72.7% 
              precision    recall  f1-score   support

     chinese       0.59      0.72      0.65       239
      indian       0.87      0.79      0.83       234
    japanese       0.67      0.85      0.75       248
      korean       0.91      0.58      0.71       245
        thai       0.73      0.69      0.71       233

    accuracy                           0.73      1199
   macro avg       0.75      0.73      0.73    

The result is a little worse

# Support Vector Classifier

Support-Vector classifiers are part of the Support-Vector Machine family of ML methods that are used for classification and regression tasks. SVMs "map training examples to points in space" to maximize the distance between two categories. Subsequent data is mapped into this space so their category can be predicted.

In [14]:
# Let's try for a little better accuracy with a Support Vector Classifier.

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    
}

In [15]:
# train your model using three classifiers and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 77.8% 
              precision    recall  f1-score   support

     chinese       0.66      0.72      0.68       239
      indian       0.87      0.88      0.87       234
    japanese       0.80      0.72      0.76       248
      korean       0.84      0.77      0.80       245
        thai       0.75      0.82      0.78       233

    accuracy                           0.78      1199
   macro avg       0.78      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199

Accuracy (train) for KNN classifier: 72.7% 
              precision    recall  f1-score   support

     chinese       0.59      0.72      0.65       239
      indian       0.87      0.79      0.83       234
    japanese       0.67      0.85      0.75       248
      korean       0.91      0.58      0.71       245
        thai       0.73      0.69      0.71       233

    accuracy                           0.73      1199
   macro avg       0.75      0.73      0.73    

The result is quite good!

# Ensemble Classifiers
Let's try some 'Ensemble Classifiers, specifically Random Forest and AdaBoost

The ensemble method of Machine Learning "combines the predictions of several base estimators" to improve the model's quality.

Random Forest, an averaging method, builds a 'forest' of 'decision trees' infused with randomness to avoid overfitting. The n_estimators parameter is set to the number of trees.

AdaBoost fits a classifier to a dataset and then fits copies of that classifier to the same dataset. It focuses on the weights of incorrectly classified items and adjusts the fit for the next classifier to correct.

In [16]:
# Let's try for a better accuracy with a Random Forest and AdaBoost.

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [17]:
# train your model using five classifiers and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 77.8% 
              precision    recall  f1-score   support

     chinese       0.66      0.72      0.68       239
      indian       0.87      0.88      0.87       234
    japanese       0.80      0.72      0.76       248
      korean       0.84      0.77      0.80       245
        thai       0.75      0.82      0.78       233

    accuracy                           0.78      1199
   macro avg       0.78      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199

Accuracy (train) for KNN classifier: 72.7% 
              precision    recall  f1-score   support

     chinese       0.59      0.72      0.65       239
      indian       0.87      0.79      0.83       234
    japanese       0.67      0.85      0.75       248
      korean       0.91      0.58      0.71       245
        thai       0.73      0.69      0.71       233

    accuracy                           0.73      1199
   macro avg       0.75      0.73      0.73    



Accuracy (train) for ADA: 72.0% 
              precision    recall  f1-score   support

     chinese       0.68      0.59      0.64       239
      indian       0.86      0.74      0.80       234
    japanese       0.71      0.67      0.69       248
      korean       0.73      0.79      0.76       245
        thai       0.65      0.81      0.72       233

    accuracy                           0.72      1199
   macro avg       0.73      0.72      0.72      1199
weighted avg       0.73      0.72      0.72      1199

