# Build Classification Models

In [1]:
import pandas as pd;

raw_cuisines = pd.read_csv("../data/cleaned_cuisines.csv")
print(raw_cuisines.head())
# split dataset to x and y.
cuisine_y = raw_cuisines['cuisine']
cuisine_X = raw_cuisines.drop(labels=['cuisine', 'Unnamed: 0'],axis=1)

   Unnamed: 0 cuisine  almond  angelica  anise  anise_seed  apple  \
0           0  indian       0         0      0           0      0   
1           1  indian       1         0      0           0      0   
2           2  indian       0         0      0           0      0   
3           3  indian       0         0      0           0      0   
4           4  indian       0         0      0           0      0   

   apple_brandy  apricot  armagnac  ...  whiskey  white_bread  white_wine  \
0             0        0         0  ...        0            0           0   
1             0        0         0  ...        0            0           0   
2             0        0         0  ...        0            0           0   
3             0        0         0  ...        0            0           0   
4             0        0         0  ...        0            0           0   

   whole_grain_wheat_flour  wine  wood  yam  yeast  yogurt  zucchini  
0                        0     0     0    0      0 

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

train_X, test_X, train_y, test_y = train_test_split(cuisine_X, cuisine_y, test_size=0.33, shuffle=True)

In [3]:
# how to choose solver,multi_class
#对于少量的样本数据，'liblinear' 是很好的选择，而 'sag' 和 'saga' 对大量样本训练速度更快。
#对于多分类问题，只有 'newton-cg'、'sag'、'saga' 和 'lbfgs' 能够处理多项损失，
# 而 'liblinear' 面对多分类问题，得先把一种类别作为一个类别，剩余的所有类别作为另外一个类别,
# 依次类推，遍历所有类别，进行分类(one over rest)。
# solver="liblinear",multi_class must be ovr or auto
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
#flatten to 1-D array when needed.
ravel = np.ravel(train_y)
model = lr.fit(train_X, ravel)
score = model.score(test_X, test_y)
print('score: ', score)

score:  0.8127369219105383


In [4]:
lr2 = LogisticRegression(solver='lbfgs', multi_class='auto')
#flatten the array.
ravel2 = np.ravel(train_y)
model2 = lr.fit(train_X, ravel2)
score2 = model2.score(test_X, test_y)
print('score2: ', score2)

score2:  0.8127369219105383


In [5]:
#probablity.
proba = model.predict_proba(test_X)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)
top_result = resultdf.T.sort_values(by=0, ascending=False)
top_result.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1309,1310,1311,1312,1313,1314,1315,1316,1317,1318
chinese,0.681026,0.006037,0.05475,0.021508,0.015203,0.266273,0.703008,0.41154,0.129276,0.078011,...,0.950626,0.009735,0.00022,0.72776,0.06295,0.076387,0.001871,0.747194,0.87985,0.191736
korean,0.216744,0.005421,0.897924,0.068959,0.975072,0.074033,0.128879,0.221046,0.273055,0.24934,...,0.003217,0.000776,0.000182,0.002461,0.718099,0.017224,0.56223,0.010475,0.03463,0.018997
japanese,0.090768,0.000131,0.047169,0.821702,0.001239,0.552636,0.081007,0.170013,0.291186,0.105532,...,0.012657,0.012072,0.004212,0.25769,0.118601,0.901539,0.435812,0.017994,0.048047,0.763615
thai,0.006809,0.001469,0.000152,0.033921,0.002742,0.087882,0.039698,0.197388,0.239277,0.161092,...,0.029187,0.026979,0.52381,0.011008,0.060608,0.003125,8e-05,0.224328,0.037366,0.017503
indian,0.004653,0.986942,5e-06,0.05391,0.005745,0.019175,0.047407,1.4e-05,0.067207,0.406025,...,0.004313,0.950438,0.471575,0.001081,0.039742,0.001725,7e-06,8e-06,0.000107,0.008149


In [6]:
y_pred = model.predict(test_X)
#every time execute the whole file, the report score is different,
#train_test_split default shuffle the dataset before the splitting.
print(classification_report(test_y,y_pred))

              precision    recall  f1-score   support

     chinese       0.74      0.70      0.72       262
      indian       0.95      0.89      0.92       274
    japanese       0.75      0.79      0.77       265
      korean       0.83      0.80      0.81       245
        thai       0.80      0.88      0.84       273

    accuracy                           0.81      1319
   macro avg       0.81      0.81      0.81      1319
weighted avg       0.81      0.81      0.81      1319

