In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


%matplotlib inline
# Use the "fivethirtyeight" matplotlib style for every figure in the notebook.
plt.style.use("fivethirtyeight")
# Let pandas display up to 100 columns and 100 rows before truncating.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

## 数据预处理
### 将数据导入,并且查看数据的基本信息

In [None]:
# Load the heart-disease CSV from the local data directory.
data = pd.read_csv('./data/heart-disease-uci/heart.csv')
# Take a look at the first few rows.
data.head()

In [None]:
# Column dtypes and non-null counts (author's note: there is no missing data).
data.info()

In [None]:
# Inspect the distinct class labels and how many samples each has
# (author's note: a binary classification problem with fairly balanced classes).
np.unique(data['target'], return_counts=True)

In [None]:
# Plot the number of samples per class as a bar chart.
data['target'].value_counts().plot(kind='bar')
plt.xlabel('target')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

### 用直方图画出各个变量的分布情况

In [None]:
# One 30-bin histogram per column, drawn on a single 15x15 inch figure.
data.hist(bins=30,figsize=(15,15))

### 类别型变量与Target之间的关系

In [None]:
# Categorical columns; this list is reused later when building dummy variables.
class_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
plt.figure(figsize=(15,15))
for i, column in enumerate(class_vars, 1):
    ax = plt.subplot(3, 3, i)
    # value_counts() orders categories by frequency, and that order differs
    # between the two classes. Re-index both series onto the same sorted
    # category list so that bars sharing an x position (and tick label)
    # really describe the same category.
    categories = sorted(data[column].unique())
    for target_value, color, label in (
        (0, 'blue', 'Have Heart Disease = NO'),
        (1, 'red', 'Have Heart Disease = YES'),
    ):
        counts = (
            data.loc[data.target == target_value, column]
            .value_counts()
            .reindex(categories, fill_value=0)
        )
        counts.plot(kind='bar', ax=ax, color=color, alpha=0.5, label=label)
    ax.legend(loc='best')
    ax.set_xlabel(column)


# numeric_vars = ['age', 'chol', 'oldpeak', 'thalach', 'trestbps']
# plt.figure(figsize=(6.4,4.8), dpi=120)
# sns.heatmap(data.corr())

### 连续型变量与Target之间的关系

In [None]:
# Continuous columns; this list is reused later when standardising the features.
continue_vars = ['age', 'chol', 'oldpeak', 'thalach', 'trestbps']
plt.figure(figsize=(10,10))
# Histogram options shared by both classes.
hist_style = dict(bins=30, alpha=0.5)
for position, feature in enumerate(continue_vars, start=1):
    plt.subplot(3, 3, position)
    # Overlay the distribution of the feature for target == 0 and target == 1.
    target0_values = data.loc[data.target == 0, feature]
    target1_values = data.loc[data.target == 1, feature]
    target0_values.hist(color='blue', label='Have Heart Disease = NO', **hist_style)
    target1_values.hist(color='red', label='Have Heart Disease = YES', **hist_style)
    plt.legend(loc='best',fontsize='x-small')
    plt.xlabel(feature)


### HeatMap

In [None]:
# Pairwise correlation between all columns (including the target).
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
# Annotated heatmap of the correlation matrix, values shown with two decimals.
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu");
# Widen the y-limits by half a cell on each side.
# NOTE(review): this looks like the workaround for heatmap rows being cut off
# in some matplotlib releases; on other versions it may just add blank
# margins -- confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Correlation of every feature with the label. The label's own entry is
# removed by name rather than by position, so the plot stays correct even if
# 'target' is not the last column of the frame.
corr_matrix['target'].drop('target').plot(kind='bar', grid=True, figsize=(8, 4),
                                          title="Correlation with target")

### 构造虚拟变量

In [None]:
def get_dummies(data, class_vars):
    """Return a copy of ``data`` with the given columns replaced by dummy columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    class_vars : list of str
        Names of the categorical columns to expand.

    Returns
    -------
    pandas.DataFrame
        The untouched columns first, followed by the indicator columns of each
        entry of ``class_vars`` in the order given.
    """
    # A single call encodes the columns in list order and appends them after
    # the remaining columns, which matches encoding them one at a time.
    return pd.get_dummies(data, columns=list(class_vars))

# Expand every categorical column listed in class_vars into dummy columns.
df_dummies = get_dummies(data, class_vars)

In [None]:
# Summary statistics of the frame after the dummy columns were added.
df_dummies.describe()

### 标准化数据集

In [None]:
from sklearn.preprocessing import StandardScaler
X = df_dummies.drop(columns=['target'])
y = df_dummies['target']

sc = StandardScaler()
X[continue_vars] = sc.fit_transform(X[continue_vars])
X.describe()

### 划分数据集

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## 训练
### 交叉验证选择最优参数

In [None]:
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and summarise all results together.

    Parameters
    ----------
    models : dict
        Maps a short name to an (unfitted) estimator.
    params : dict
        Maps the same names to the parameter grid passed to GridSearchCV.
        Every key of ``models`` must be present.

    Raises
    ------
    ValueError
        If some estimator in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        # Keep the order of `models` so the error message is deterministic.
        missing_params = [key for key in models if key not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}  # name -> fitted GridSearchCV
        self.best = {}           # name -> {'score': ..., 'params': ...}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are passed
        straight to GridSearchCV. The fitted searches are stored in
        ``self.grid_searches`` and each best score / parameter set in
        ``self.best``.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.best[key] = {'score': gs.best_score_, 'params': gs.best_params_}
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination).

        Each row holds the min / mean / max / std of the per-split test scores
        plus the parameter values of that candidate; parameters a candidate
        does not use are NaN. Rows are sorted by ``sort_by`` in descending
        order. Score columns are numeric.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid search results available; call fit() first.")

        records = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            # Use the number of splits recorded by the fitted search instead of
            # the `cv` argument, which may be None or a splitter object.
            per_split = [
                np.asarray(results["split{}_test_score".format(i)])
                for i in range(gs.n_splits_)
            ]
            # Shape (n_candidates, n_splits): one row of scores per candidate.
            score_table = np.column_stack(per_split)
            for candidate, scores in zip(results['params'], score_table):
                records.append({
                    **candidate,
                    'estimator': name,
                    'min_score': scores.min(),
                    'max_score': scores.max(),
                    'mean_score': scores.mean(),
                    'std_score': scores.std(),
                })

        df = pd.DataFrame(records).sort_values([sort_by], ascending=False)

        # Fixed summary columns first, then every parameter column.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

    def best_params(self):
        """Return the dict of best score and parameters per estimator name."""
        return self.best

In [None]:
# Fixed seed for every estimator with a stochastic component, so that the
# grid-search scores are reproducible from run to run.
RANDOM_STATE = 42

# Candidate estimators, keyed by the short name that is reused in `params`.
models = {
    'rf':RandomForestClassifier(random_state=RANDOM_STATE),
    'gbdt':GradientBoostingClassifier(random_state=RANDOM_STATE),
    'ada':AdaBoostClassifier(random_state=RANDOM_STATE),
    'xgb':XGBClassifier(random_state=RANDOM_STATE),
    'svm':SVC(),
    'lr':LogisticRegression(random_state=RANDOM_STATE),
    'knn':KNeighborsClassifier(),
    'dt': DecisionTreeClassifier(random_state=RANDOM_STATE),
}

# Parameter grid searched for each estimator (same keys as `models`).
params = {
    'rf':{'n_estimators' : [16, 32,64], "max_depth" : [2,4,6]},
    'gbdt':{"n_estimators" : [16, 32,64,96], "max_depth" : [2,4,6], "learning_rate" : [0.1,0.3,0.9]},
    'ada':{"n_estimators" : [16, 32,64], "learning_rate" : [0.5,1.0]},
    'xgb':{"max_depth" : [2,4,6], "learning_rate" : [0.1,0.3,0.9]},
    'svm': [
        {'kernel': ['linear'], 'C': [1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
    ],
    'lr':{'C' : [1, 0.5], 'penalty' : ['l1','l2'], 'solver':['liblinear']},
    'knn':{'n_neighbors' : [5,10,15]},
    'dt':{"max_depth" : [2,4,6],}
}

# Run a 5-fold grid search for every estimator on the training split and show
# all candidates ranked by mean cross-validation score.
Helper = EstimatorSelectionHelper(models, params)
Helper.fit(X_train, y_train, cv=5)
Helper.score_summary(sort_by='mean_score')

In [None]:
# Best cross-validation score and parameter set found for each estimator.
best = Helper.best_params()
best

### 设置最优参数


In [None]:
# Build one fresh, unfitted estimator per entry of `models`, configured with
# the best parameters found by the grid search. Deriving the list from
# `models` (instead of repeating the constructors by hand) keeps the two in
# sync; the order follows `models`, so the random forest stays first.
best_models = [
    (key, clone(estimator).set_params(**best[key]['params']))
    for key, estimator in models.items()
]

In [None]:
# Combine all tuned estimators in a hard-voting ensemble, train it on the
# training split and predict the labels of the test split.
vote_hard = VotingClassifier(estimators = best_models , voting = 'hard')
vote_hard.fit(X_train, y_train)
prediction = vote_hard.predict(X_test)

### 计算准确率

In [None]:
# Accuracy = number of test samples predicted correctly / number of test samples.
is_correct = prediction == y_test
correct = is_correct.sum()
accuracy = correct / y_test.shape[0]
print('accuracy {}'.format(accuracy))

## 模型可视化

### 混淆矩阵以及分类结果报告

In [None]:
# Text report of the per-class metrics on the test split.
cr = classification_report(y_test, prediction)
print(cr)

In [None]:
# Confusion matrix of the ensemble on the test split
# (rows: true label, columns: predicted label).
cm = confusion_matrix(y_test, prediction)
fig, ax = plt.subplots(figsize=(7,7))
# fmt='d' prints the counts as plain integers; the default format switches to
# scientific notation once a cell reaches three digits.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu', ax = ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');

### 通过RF查看特征的重要程度

In [None]:
# Take the tuned random forest (first entry of best_models) and fit it on the
# training split so that its feature importances can be read.
# NOTE(review): presumably it has to be fitted here because the voting
# ensemble trains its own copies of the estimators -- confirm.
rf = best_models[0][1]
rf.fit(X_train, y_train)
def feature_imp(df, model):
    """Tabulate the feature importances of a fitted model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus a 'target' column; the feature
        names are taken from its columns after dropping 'target'.
    model : object
        Fitted estimator exposing ``feature_importances_`` with one value per
        feature, in the same order as the feature columns of ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'importance', sorted by importance descending.
    """
    feature_names = df.drop(columns=['target']).columns
    table = pd.DataFrame({
        "feature": feature_names,
        "importance": model.feature_importances_,
    })
    return table.sort_values(by="importance", ascending=False)

# Horizontal bar chart of the random-forest feature importances. x='feature'
# labels every bar with the feature name; without it the bars are labelled
# with the integer row index of the table.
feature_imp(df_dummies, rf).plot(kind='barh', x='feature', y='importance', figsize=(12,7), legend=False)

## 复盘
**1.第一个出错的点,就是在画与target相关变量的饼图的时候,并没有考虑将0也算在内,某个条件患心脏病的人数多,并不能认为该条件下患心脏病的人数就高, 因此,直接拿来做为同一个特征是不合理的.**

**2.最初对所有变量(包括类别型变量)都做了缩放, 而没有只缩放连续型变量, 这很可能是导致结果不好的原因之一.**

**3.归一化与划分数据集的先后顺序问题. 参考知乎上的回答:**

1、整体归一化后，再划分。
实际上我是从一些数据竞赛中得到一些经验。整体归一化实际上是让你的模型更快地知道了比较稳定的统计量（比如均值），因为一个是部分训练集统计的均值，而另一部分是训练集全集的均值。还有在特征构建上，经常会计算某某字段距离该字段均值（或其他统计量，比如比较稳健的中位数）的差距或者距离。更多样本的均值自然会比较稳定。这样在本地测试集上的效果评价就会过于乐观，或者说我提到的泄露。你也知道，在数据算法竞赛中，线上和本地测试集的优化方向一致性是非常重要的。如果你不严格地做本地测试集，真正把它隔离出来，那在本地优化个半天的结果，传到线上，就会一塌糊涂，也就是过拟合本地的测试集了。另外数据算法竞赛竞争非常激烈，前几名可能都是丝毫分厘之差而已。做好这些细节是有助于提高名次了。当然在实际工程应用中，这些就比较随意和灵活，更讲求实用性。

**方案1通常被视为"data leakage"原则错误，在训练模型的时候应用了test的信息(如极值、均值方差等)；同时方案1也会使得训练集中各变量不严格服从[0,1]的分布，这和归一化的初衷是相违背的；
所以,正确的做法是对训练集进行标准化, 然后用训练集的标准去规范测试集.**

**4. 集成树模型可以输出特征的重要性**