In [15]:
# 导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import sklearn
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from sklearn.feature_selection import RFE

import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB 
RANDOM_SEED = 42

# 读取数据

In [16]:
def unique_value(df: pd.DataFrame) -> np.array:
    df_specific = df.drop(list(df.columns[:2]), axis=1, inplace=False)
    temp = df_specific.to_numpy()
    return np.unique(temp)

In [17]:
train_df = pd.read_csv("../dataset/recipes_train.csv")
test_df = pd.read_csv("../dataset/recipes_test.csv")

In [18]:
train_df.shape, test_df.shape

((1469, 385), (979, 384))

In [19]:
train_df.head()

Unnamed: 0,id,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,1,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,chinese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,chinese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,korean,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,thai,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
test_df.head()

Unnamed: 0,id,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
food_index = {}
food_count = {} # key为类, value为对应类的出现次数
loc_index = {} # key为类, value为其所有数据的索引
for _, group in train_df.groupby(by='cuisine'):
    # name, group = specific data -> DataFrame
    name = group['cuisine'].head(1).item()
    group.drop(["id", 'cuisine'], axis=1, inplace=True)
    loc_index[name] = group.index
    food_index[name] = group.sum().to_dict()
    food_count[name] = len(group)
food_index.keys() # key为类, value为类中各元素总和 dict: (column:sum_value)

dict_keys(['chinese', 'indian', 'japanese', 'korean', 'thai'])

In [22]:
from sklearn.decomposition import PCA
features_names = train_df.columns[2:]
labels = train_df.loc[:, 'cuisine']
pca = PCA(n_components=100)
pca.fit(train_df.loc[:, features_names])
reduced_features = pca.transform(train_df.loc[:, features_names])
reduced_features_test = pca.transform(test_df.iloc[:, 1:])

In [23]:
# 将用于训练的内容提取出来
# train_x = train_df[result_indices].values
train_x = reduced_features
train_y = train_df["cuisine"].values
# 分割训练集验证集, 42更好。。
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# label: string -> int
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(train_y)
train_y_int = label_encoder.transform(train_y)
y_train_int = label_encoder.transform(y_train)
y_valid_int = label_encoder.transform(y_valid)

In [24]:
params_sklearn = {
    'learning_rate':0.01,
    # 'num_leaves':2^11-1,    
    # 'max_depth':11,
    
    'reg_alpha':0.1,
    'reg_lambda':0.1,   

    'subsample':0.9,
    'colsample_bytree':0.9,
    # 'class_weight':weight
}

In [25]:
clf1 = SVC(class_weight='balanced', gamma=0.1)
clf2 = RandomForestClassifier(random_state=RANDOM_SEED)
clf3 = lgb.LGBMClassifier(**params_sklearn)
clf4 = BernoulliNB()
lr = LogisticRegression()

In [28]:
sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4], 
                            meta_classifier=lr,  
                            random_state=RANDOM_SEED)

In [29]:
from sklearn.model_selection import cross_val_score
for clf, label in zip([clf1, clf2, clf3, clf4, sclf], \
                      ['SVC', 'Random Forest', 'LGBM', 'Naive bayes', 'StackingClassifier']):
    scores = cross_val_score(clf, train_x, train_y_int, cv=4, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.77 (+/- 0.02) [SVC]
Accuracy: 0.73 (+/- 0.01) [Random Forest]
Accuracy: 0.71 (+/- 0.01) [LGBM]
Accuracy: 0.64 (+/- 0.02) [Naive bayes]
Accuracy: 0.59 (+/- 0.01) [StackingClassifier]
