In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz as eg
import graphviz
import matplotlib.pyplot as plt
import numpy as np

## 전처리

범주형 변수들을 get_dummies() 함수를 이용해 변환하여 기존 heart 데이터프레임에 저장합니다. <br/>
age(나이)의 경우 한 살, 한 살의 영향이 크지 않을 것이고 연령대별로 판단하는 것이 더 좋은 방법이라 생각하여 범주형변수로 변환 후 분석하였습니다.

In [2]:
heart=pd.read_csv('heart.csv')

In [3]:
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## age의 label을 먼저 만들어 준 뒤 pandas의cut 메소드를 사용, 범주형 변수로 변환합니다.

In [4]:
labels = ["{0} - {1}".format(i, i + 9) for i in range(20, 80, 10)]

In [5]:
labels

['20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79']

In [6]:
heart['age'] = pd.cut(heart.age, range(20, 90, 10), right=False, labels=labels)

In [7]:
heart.dtypes

age         category
sex            int64
cp             int64
trestbps       int64
chol           int64
fbs            int64
restecg        int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
thal           int64
target         int64
dtype: object

In [8]:
cat_list=['age','sex','cp','fbs','restecg','exang','slope','ca','thal']
for i in cat_list:
    heart[i]=pd.Categorical(heart[i])

In [9]:
heart=pd.get_dummies(heart)

In [10]:
RANDOM_STATE=11

## Train, test 데이터를 만듭니다.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(heart.drop(['target'],axis=1),heart.target,test_size=0.3,random_state=RANDOM_STATE)

In [12]:
X_train.head()

Unnamed: 0,trestbps,chol,thalach,oldpeak,age_20 - 29,age_30 - 39,age_40 - 49,age_50 - 59,age_60 - 69,age_70 - 79,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
20,135,234,161,0.5,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
265,112,212,132,0.1,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
45,120,325,172,0.2,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0
230,108,243,152,0.0,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,1,0
29,130,197,152,1.2,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0


In [13]:
y_train.head()

20     1
265    0
45     1
230    0
29     1
Name: target, dtype: int64

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier

In [15]:
rcf=RidgeClassifier().fit(X_train,y_train)

In [16]:
rcf.score(X_test,y_test)

0.8021978021978022

In [17]:
s_pipe=Pipeline([('scaler',StandardScaler()),('rcf',RidgeClassifier())])

In [18]:
s_pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rcf', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001))])

In [19]:
s_pipe.score(X_test,y_test)

0.7912087912087912

In [20]:
m_pipe=Pipeline([('scaler',MinMaxScaler()),('rcf',RidgeClassifier())])

In [21]:
m_pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('rcf', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001))])

In [22]:
m_pipe.score(X_test,y_test)

0.8021978021978022

In [23]:
params = {'random_state':np.arange(0,20),'alpha':[10**i for i in [1,0,-1,-2,-3]], 'max_iter':np.arange(10,100,10)}

In [24]:
rcf_grid=GridSearchCV(RidgeClassifier(), params)

In [25]:
rcf_grid.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'random_state': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19]), 'alpha': [10, 1, 0.1, 0.01, 0.001], 'max_iter': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
rcf_grid.score(X_test,y_test)

0.8131868131868132

In [27]:
rcf_grid.best_params_

{'alpha': 10, 'max_iter': 10, 'random_state': 0}

In [28]:
rcf_grid.best_estimator_

RidgeClassifier(alpha=10, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=10, normalize=False, random_state=0, solver='auto',
        tol=0.001)

## Ensemble
https://chrisalbon.com/machine_learning/model_selection/model_selection_using_grid_search/

In [50]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import KFold

In [51]:
kf = KFold(n_splits=5)

In [67]:
# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])


search_space = [{'classifier': [GradientBoostingClassifier()],
                 'classifier__learning_rate': [0.1, 0.01, 0.001],
                 'classifier__max_depth': [3,5,10]},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100],
                 'classifier__max_depth': [3, 5, 10]},
               {'classifier': [AdaBoostClassifier()],
                 'classifier__learning_rate': [0.1, 0.01, 0.001],
                 'classifier__n_estimators': [50,70]}]

splits=[3,5,10]

In [70]:
clf = GridSearchCV(pipe, search_space, cv=5)

In [71]:
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'classifier': [GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.001, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samp...tate=None)], 'classifier__learning_rate': [0.1, 0.01, 0.001], 'classifier__n_estimators': [50, 70]}],
       pre_di

In [72]:
clf.score(X_test,y_test)

0.8131868131868132

In [73]:
clf.best_estimator_ 

Pipeline(memory=None,
     steps=[('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])