In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

%matplotlib inline

## 数据预处理
### 将数据导入,并且查看数据的基本信息

In [None]:
# Load heart.csv from the local heart-disease-uci data directory.
data = pd.read_csv('./data/heart-disease-uci/heart.csv')
# Preview the first rows to get a feel for the basic information.
data.head()

In [None]:
# Column dtypes and non-null counts. Original note: the data has no missing values
# (confirm against the non-null counts printed below).
data.info()

In [None]:
# Distinct target labels and how often each occurs.
# Original note: this is a binary classification problem with fairly balanced classes.
np.unique(data['target'], return_counts=True)

In [None]:
# Pie chart of the target class counts.
data['target'].value_counts().plot(kind='pie')


In [None]:
# Summary statistics of the loaded table.
data.describe()

### 用直方图画出各个变量的分布情况

In [None]:
# One histogram per column, drawn on a 15x15 inch figure.
data.hist(figsize=(15,15))

### 变量热力图

In [None]:
# Columns treated as categorical variables; a later cell loops over this list.
class_vars = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'ca', 'thal',
]

# Pairwise correlations of all columns, rendered as a heatmap.
corr_matrix = data.corr()
plt.figure(figsize=(6.4, 4.8), dpi=120)
sns.heatmap(corr_matrix)

In [None]:
# Correlation of every column with the target column.
data.corr()['target']

### 类别型变量与target之间的关系饼图

In [None]:
# Pie charts of each categorical variable, restricted to the rows with
# target == 1 (the heart-disease group, per the original note).
data_target = data.loc[data.target == 1]
for col in class_vars:
    # Plot from the filtered frame: plotting `data[col]` would ignore the
    # target filter and only show the overall distribution of the column.
    data_target[col].value_counts().plot(kind='pie', title=col)
    plt.show()

通过分析饼图，我们可以发现：是否患心脏病与各个类别型变量的属性均有关系。

### 构造虚拟变量

In [None]:
# Disabled earlier experiment: a per-column value remapping that the previous
# (also commented-out) version of get_dummies applied via `.map` before encoding.
# NOTE(review): the 'ca' entry repeats key 0 ({0:0, 0:1, ...}); in a dict
# literal the later value wins, so re-check it before re-enabling this block.
# MapDict = {
#     'sex':None,
#     'fbs':None,
#     'exang':None,
#     'thal':{2:1, 3:1, 1:0, 0:0},
#     'slope':{2:1,1:1,0:0},
#     'restecg':{2:0, 0:1, 1:1},
#     'ca':{0:0, 0:1,2:0,3:0,4:0},
#     'cp':{0:1,2:1,1:0,3:0},
# }

# Columns that are one-hot encoded by get_dummies further down in this cell.
MapColumns = [ 'sex', 'fbs', 'exang', 'thal', 'slope', 'restecg', 'ca', 'cp']
# def get_dummies(data, **kwargs):
#     tmp_data = data.copy(deep = True)
#     for k, v in kwargs.items():
#         if v:
#             tmp_data[k] = tmp_data[k].map(v)
#             tmp_data = pd.get_dummies(tmp_data, columns = [k])
#         else:
#             tmp_data = pd.get_dummies(tmp_data, columns = [k])
#     return tmp_data

def get_dummies(data, MapColumns):
    """One-hot encode the listed columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    MapColumns : iterable of str
        Names of the columns to replace with dummy (indicator) columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the untouched columns in their original order,
        followed by the dummy columns of each entry of ``MapColumns`` in the
        order given.
    """
    # pd.get_dummies accepts the whole column list at once and returns a new
    # frame, so neither a per-column loop nor an up-front deep copy is needed.
    # list(...) keeps tuples and other iterables working as column lists.
    return pd.get_dummies(data, columns=list(MapColumns))

# Disabled call that belonged to the earlier MapDict-based experiment:
# df_dummies = get_dummies(data, **MapDict)
# Encode the columns listed in MapColumns as dummy variables.
df_dummies = get_dummies(data, MapColumns)

In [None]:
# Summary statistics of the frame after dummy encoding.
df_dummies.describe()

### 分离训练集与测试集

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label column.
X = df_dummies.drop(columns=['target'])
y = df_dummies['target']
# A fixed seed gives the same 80/20 split on every Restart & Run All;
# stratifying on y keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### 对特征进行标准化处理

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

## 训练
### 交叉验证选择最优参数

In [None]:
class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and collect the results.

    Parameters
    ----------
    models : dict
        Maps a short name to an estimator instance.
    params : dict
        Maps the same names to the parameter grid (a dict or a list of dicts)
        that is handed to ``GridSearchCV`` for that estimator.

    Raises
    ------
    ValueError
        If a key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # name -> fitted GridSearchCV object, filled by fit()
        self.grid_searches = {}
        # name -> {'score': best CV score, 'params': best parameter dict}
        self.best = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Run a grid search for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are passed
        straight through to ``GridSearchCV``. After each search the best score
        and best parameters are stored in ``self.best`` and the fitted search
        object in ``self.grid_searches``.
        """
        # Imported locally so the class works on its own and does not rely on
        # an import executed by a later notebook cell.
        from sklearn.model_selection import GridSearchCV

        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.best[key] = {'score': gs.best_score_, 'params': gs.best_params_}
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination).

        Each row holds the min / mean / max / std of the per-split test scores
        plus the parameter values of that combination. Rows are sorted by
        ``sort_by`` in descending order.

        Raises
        ------
        ValueError
            If no grid search has been run yet.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            params = gs.cv_results_['params']
            # `cv` may be None or a splitter object, which range() cannot use;
            # a fitted search exposes the real number of folds as `n_splits_`.
            n_splits = getattr(gs, 'n_splits_', gs.cv)
            # One column per CV split, one row per parameter combination.
            scores = [
                gs.cv_results_["split{}_test_score".format(i)].reshape(len(params), 1)
                for i in range(n_splits)
            ]
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        if not rows:
            raise ValueError("No grid search results to summarize; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed summary columns first, followed by the parameter columns.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

    def best_params(self):
        """Return the dict of best score / best params per estimator name."""
        return self.best

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# One seed for all candidate estimators so that the grid-search ranking is
# reproducible across kernel restarts.
RANDOM_STATE = 42

# Candidate estimators, keyed by a short name.
models = {
    'rf':RandomForestClassifier(random_state=RANDOM_STATE),
    'gbdt':GradientBoostingClassifier(random_state=RANDOM_STATE),
    'ada':AdaBoostClassifier(random_state=RANDOM_STATE),
    'xgb':XGBClassifier(random_state=RANDOM_STATE),
    'svm':SVC(random_state=RANDOM_STATE),
    'lr':LogisticRegression(random_state=RANDOM_STATE),
}

# Parameter grid per estimator (same keys as `models`).
params = {
    'rf':{'n_estimators' : [32,64,96], "max_depth" : [4,6,8]},
    'gbdt':{"n_estimators" : [32,64,96], "max_depth" : [2,4,6], "learning_rate" : [0.1,0.3,0.9]},
    'ada':{"n_estimators" : [32,64,96], "learning_rate" : [0.5,1.0]},
    'xgb':{"max_depth" : [2,4,6], "learning_rate" : [0.1,0.3,0.9]},
    'svm': [
        {'kernel': ['linear'], 'C': [1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
    ],
    'lr':{'C' : [1, 10]},
}

# Grid-search every estimator with 5-fold CV on the standardized training data,
# then show all parameter combinations ranked by mean CV score.
Helper = EstimatorSelectionHelper(models, params)
Helper.fit(X_train_sc, y_train, cv=5)
Helper.score_summary(sort_by='mean_score')

In [None]:
# Best CV score and best parameter set recorded for each estimator name.
best = Helper.best_params()
best

### 设置最优参数


In [None]:
# Seed the final estimators so that the fitted ensemble is reproducible.
RANDOM_STATE = 42

# Fresh (name, estimator) pairs in the format VotingClassifier expects.
best_models = [
    ('rf',RandomForestClassifier(random_state=RANDOM_STATE)),
    ('gbdt',GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('ada',AdaBoostClassifier(random_state=RANDOM_STATE)),
    ('xgb',XGBClassifier(random_state=RANDOM_STATE)),
    ('svm',SVC(random_state=RANDOM_STATE)),
    ('lr',LogisticRegression(random_state=RANDOM_STATE)),
]

# Apply the best parameters found by the grid search to each estimator.
for key, clf in best_models:
    clf.set_params(**best[key]['params'])

In [None]:
from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble over the (name, estimator) pairs in best_models.
vote_hard = VotingClassifier(estimators = best_models , voting = 'hard')
vote_hard.fit(X_train_sc, y_train)
# Predict labels for the standardized test features.
prediction = vote_hard.predict(X_test_sc)

### 计算准确率

In [None]:
# Number of test samples whose predicted label equals the true label.
correct = (prediction == y_test).sum()
# Accuracy = correct predictions / number of test samples.
accuracy = correct / len(y_test)

In [None]:
# Report the test accuracy computed in the previous cell.
print(f'accuracy {accuracy}')

### 编写一个pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# Candidate numbers of principal components to cross-validate.
params = {'pca__n_components': [6, 12, 18]}

# PCA followed by the hard-voting ensemble, searched as a single estimator.
steps = [('pca', PCA()), ('vote_clf', vote_hard)]
pipeline = Pipeline(steps)

# 5-fold grid search over the PCA dimensionality on the standardized training data.
gs = GridSearchCV(pipeline, params, cv=5)
gs.fit(X_train_sc, y_train)
print(f"best_params{gs.best_params_}, best_score{gs.best_score_}")

### 设置PCA超参数

In [None]:
from sklearn.decomposition import PCA
# Fit PCA on the training features with the component count picked by the grid search.
pca = PCA(n_components=gs.best_params_['pca__n_components'])
X_train_d = pca.fit_transform(X_train_sc)
# Project the test features with the PCA fitted on the training features.
X_test_d = pca.transform(X_test_sc)

In [None]:
# Refit the voting ensemble on the PCA-reduced training features
# (this replaces the earlier fit of vote_hard on the unreduced features).
vote_hard.fit(X_train_d, y_train)
prediction = vote_hard.predict(X_test_d)
# Fraction of test labels predicted correctly.
correct = (prediction == y_test).sum()
accuracy = correct / len(y_test)
print(f'accuracy {accuracy}')