# Build Classification Models

In [2]:
import pandas as pd

# Read the cleaned cuisines dataset from disk and preview its first rows.
cuisines = pd.read_csv("../data/cleaned_cuisines.csv")
cuisines.head()

Unnamed: 0.1,Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,...,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini,cuisine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,indian


In [3]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split,cross_validate
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report,precision_recall_curve
from sklearn.svm import SVC
import numpy as np

In [4]:
# Target labels: the 'cuisine' column, one cuisine name per row.
cuisine_label_df = cuisines["cuisine"]
cuisine_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [5]:
# Feature matrix: everything except the target label and the 'Unnamed: 0'
# column (which looks like a saved row counter -- verify against the CSV).
cuisines_feature_df = cuisines.drop(columns=['Unnamed: 0', 'cuisine'])
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split -- and therefore the score and the sample
# inspected at position 50 -- is identical on every Restart & Run All.
RANDOM_STATE = 42

# One-vs-rest logistic regression using the liblinear solver.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument; confirm against the installed version before upgrading.
lr = LogisticRegression(
    multi_class='ovr',
    solver='liblinear',
    random_state=RANDOM_STATE,  # liblinear shuffles the data internally
)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisine_label_df,
    test_size=0.3,
    random_state=RANDOM_STATE,
)
print(y_test.shape)
print(X_test.shape)

# np.ravel flattens the labels into a 1-D array before fitting.
model = lr.fit(X_train, np.ravel(y_train))
score = model.score(X_test, y_test)
print(f'score is {score}')

# Look at one test sample to see what the model is working with:
# the ingredients that are present (non-zero) and the true cuisine.
sample_row = X_test.iloc[50]
print(f'ingredients:{sample_row[sample_row!=0].keys()}')
print(f'cuisine :{y_test.iloc[50]}')
print(type(y_test))


(1199,)
(1199, 380)
score is 0.7964970809007507
ingredients:Index(['black_pepper', 'clam', 'onion', 'pumpkin', 'sake', 'soy_sauce',
       'soybean', 'wheat'],
      dtype='object')
cuisine :korean
<class 'pandas.core.series.Series'>


In [36]:
# Dig one step deeper: check what the model predicts for a single test sample
# and how confident it is about each cuisine.
sample_values = X_test.iloc[50].values
print(sample_values)                 # the sample as a 1-D feature vector
print(sample_values.reshape(1, -1))  # the same values as a single-row 2-D array

# Predict on a one-row DataFrame (note the double brackets) rather than on a
# bare ndarray: the estimator was fitted on a DataFrame, so keeping the column
# names avoids scikit-learn's "X does not have valid feature names" warning.
sample_frame = X_test.iloc[[50]]
predict = model.predict(sample_frame)
print(f'predict:{predict}')
predict_proba = model.predict_proba(sample_frame)
print(f'predict_proba:{predict_proba}')

# Show the per-class probabilities as a table with one column per cuisine.
pd.DataFrame(data=predict_proba, columns=model.classes_)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 



Unnamed: 0,chinese,indian,japanese,korean,thai
0,0.004116,0.000442,0.280796,0.714579,6.7e-05


In [37]:
# Print the classification report (precision / recall / f1 per cuisine)
# for the model's predictions on the held-out test set.
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

     chinese       0.76      0.67      0.71       254
      indian       0.91      0.87      0.89       234
    japanese       0.72      0.78      0.74       233
      korean       0.84      0.84      0.84       243
        thai       0.77      0.83      0.80       235

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199

