In [1]:
# 导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import sklearn
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from sklearn.feature_selection import RFE


# 读取数据

In [2]:
def unique_value(df: pd.DataFrame) -> np.array:
    df_specific = df.drop(list(df.columns[:2]), axis=1, inplace=False)
    temp = df_specific.to_numpy()
    return np.unique(temp)

In [3]:
train_df = pd.read_csv("../dataset/recipes_train.csv")
test_df = pd.read_csv("../dataset/recipes_test.csv")

In [4]:
train_df.shape, test_df.shape

((1469, 385), (979, 384))

In [5]:
train_df.head()

Unnamed: 0,id,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,1,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,chinese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,chinese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,korean,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,thai,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test_df.head()

Unnamed: 0,id,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 单model

In [7]:
food_index = {}
food_count = {} # key为类, value为对应类的出现次数
loc_index = {} # key为类, value为其所有数据的索引
for _, group in train_df.groupby(by='cuisine'):
    # name, group = specific data -> DataFrame
    name = group['cuisine'].head(1).item()
    group.drop(["id", 'cuisine'], axis=1, inplace=True)
    loc_index[name] = group.index
    food_index[name] = group.sum().to_dict()
    food_count[name] = len(group)
food_index.keys() # key为类, value为类中各元素总和 dict: (column:sum_value)

dict_keys(['chinese', 'indian', 'japanese', 'korean', 'thai'])

## 特征选择

In [8]:
cuisine_count = list(food_index.keys())
result_indices = np.array([])
for cuisine in cuisine_count:
    cur_features_names = [item[0] for item in food_index[cuisine].items() if item[1]!=0]
    cuisine_features = train_df.loc[loc_index[cuisine], cur_features_names]
    corr_matrix = cuisine_features.corr()
    # 只取相关度 > 0.7的特征
    high_corr_pairs = np.where(np.abs(corr_matrix) > 0.7)
    selected_feature_indices = np.unique(np.concatenate(high_corr_pairs))
    # 每次将不同cuisine获得的有意义的特征名称合并
    result_indices = np.union1d(result_indices, np.array(cur_features_names)[selected_feature_indices])

## 训练

In [9]:
# 将用于训练的内容提取出来
train_x = train_df[result_indices].values
train_y = train_df["cuisine"].values

In [10]:
# 分割训练集验证集, 42更好。。
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [14]:
# 使用lr模型
model = SVC(C=0.8, kernel='linear')

# 训练模型
model.fit(X_train, y_train)
print("train accuracy:", accuracy_score(model.predict(X_train), y_train))
print("valid accuracy:", accuracy_score(model.predict(X_valid), y_valid))

train accuracy: 0.9131914893617021
valid accuracy: 0.8129251700680272


# 集成model

In [17]:
class BaggingClassifier(object):
    def __init__(self
                 ,n_estimator:int = None 
                 ,base_estimator = None
                 ,ratio:float = 1.0 
                 ,bootstrap:bool = True
                 ,random_state:int = 42
                ):
        """
        :param n_estimator: 基分类器数
        :param base_estimator: 基分类器
        :param ratio: 样本抽取比例
        :param bootstrap: bootstrap抽样
        :param random_state: 设定随机种子
        """
        self.n_estimator = n_estimator
        self.base_estimator = base_estimator
        self.ratio = ratio
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.__estimators = None # 私有变量

    def __mode(self,x):
        """
        内置函数，用来计算一个数组的众数
        :param x: 一维数组
        :return: 返回数组的众数
        """
        freq = {i: x.tolist().count(i) for i in np.unique(x)}
        mode = max(freq, key=lambda k: freq[k])
        return mode

    def fit(self,x,y):
        """
        模型拟合函数
        :param x: 输入矩阵
        :param y: 标签
        :return: 
        """
        n = len(x)
        self.__estimators = []
        np.random.seed(self.random_state)
        for i in range(self.n_estimator):
            index = np.random.choice(n, int(n * self.ratio), replace=self.bootstrap)
            x_new = x[index]
            y_new = y[index]
            self.__estimators.append(self.base_estimator.fit(x_new,y_new))

    def predict(self, x):
        """
        获得结果
        :param x: 输入矩阵
        :return: 预测标签
        """
        pred = []
        for i in range(len(self.__estimators)):
            pred.append(self.__estimators[i].predict(x))
        pred = np.array(pred)
        results = []
        for i in range(pred.shape[1]):
            col = pred[:,i]
            results.append(self.__mode(col))
        return np.array(results)

In [19]:
base_model = SVC(C=0.8, kernel='linear')
bagging = BaggingClassifier(n_estimator=100, base_estimator=base_model, ratio=0.6)
bagging.fit(X_train, y_train)
print("train accuracy:", accuracy_score(bagging.predict(X_train), y_train))
print("valid accuracy:", accuracy_score(bagging.predict(X_valid), y_valid))

train accuracy: 0.8238297872340425
valid accuracy: 0.782312925170068
