In [1]:
#앞의 전처리 진행을 모두 함수로 변경

#Null 처리 함수
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age is filled with the column mean, Cabin and Embarked with the
    placeholder string 'N', and Fare with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the gaps filled.
    """
    # Assign the filled Series back to the column instead of calling
    # fillna(inplace=True) on df[col]: the in-place call on a column selection
    # is chained assignment, which pandas deprecates and which silently leaves
    # df unchanged when Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

In [2]:
#머신러닝 시 불필요한 칼럼 제거
def drop_col(df):
    """Drop the columns that are not used for modelling.

    Removes PassengerId, Name and Ticket from ``df`` in place and returns
    the same DataFrame object.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

In [3]:
#레이블인코딩
from sklearn.preprocessing import LabelEncoder

def encoding(df):
    """Label-encode the Cabin, Sex and Embarked columns of ``df`` in place.

    Cabin is first cut down to its first character. Each of the three
    columns is then replaced by the output of a fresh LabelEncoder fitted
    on that column. The same DataFrame object is returned.
    """
    df['Cabin'] = df['Cabin'].str[:1]
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A new encoder per column, fitted and applied in one step.
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [4]:
#앞의 3개 함수를 한번에 가져오는 함수
def transform_features(df):
    """Run the full preprocessing pipeline: fillna -> drop_col -> encoding.

    The helpers modify the frame they receive, so the pipeline runs on a
    copy. The caller's DataFrame is left untouched, which keeps the calling
    cell safe to re-run (otherwise a second run fails because the columns
    have already been dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded from the CSV.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed DataFrame.
    """
    df = df.copy()
    df = fillna(df)
    df = drop_col(df)
    df = encoding(df)
    return df

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
# Load the training CSV (the path is relative to the working directory) and preview the first rows.
df=pd.read_csv('01/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Apply the preprocessing pipeline
new_df=transform_features(df)
new_df.head() # new_df is the frame used for the rest of the notebook

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,3
1,1,1,0,38.0,1,0,71.2833,2,0
2,1,3,0,26.0,0,0,7.925,7,3
3,1,1,0,35.0,1,0,53.1,2,3
4,0,3,1,35.0,0,0,8.05,7,3


## 본격 머신러닝! y:'Survived', x: 그 외 숫자형 변수들

In [8]:
# Features are every column except Survived; Survived is the target.
X_df = new_df.drop(columns='Survived')
y_df = new_df['Survived']

##### train/test split

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state =11)

##### ML 알고리즘(분류): 비교를 위해, 결정트리 / rf / logistic 3개를 사용

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [11]:
# Create one instance of each classifier (decision tree, random forest, logistic regression)
dt_clf=DecisionTreeClassifier(random_state=11)
rf_clf=RandomForestClassifier(random_state=11)
# Pin the solver explicitly: the default changed from liblinear to lbfgs in
# scikit-learn 0.22, and lbfgs typically warns about non-convergence on these
# unscaled features at the default max_iter.
lr_clf=LogisticRegression(solver='liblinear')

- 사실 정확하게 하려면 train/test로 나눈 후 교차검증까지 해야 하지만, 단순한 성능 비교를 위해 바로 학습 후 정확도를 측정

In [12]:
## Decision tree: fit on the training split, then print accuracy on the held-out split
dt_clf.fit(X_train,y_train)
dt_pred=dt_clf.predict(X_test)
print(accuracy_score(y_test, dt_pred))

0.7877094972067039


In [13]:
## Random forest: fit on the training split, then print accuracy on the held-out split
rf_clf.fit(X_train,y_train)
rf_pred=rf_clf.predict(X_test)
print(accuracy_score(y_test,rf_pred))

0.8324022346368715




In [14]:
## Logistic regression: fit on the training split, then print accuracy on the held-out split
lr_clf.fit(X_train,y_train)
lr_pred=lr_clf.predict(X_test)
print(accuracy_score(y_test,lr_pred)) # Of the three algorithms, logistic regression scores highest on this hold-out split.

0.8659217877094972




##### 교차검증
- 1) kfold
- 2) cross_val_score()
- 3) GridSearchCV()

In [15]:
##kfold(5-fold)
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5, X=None, y=None):
    """Run K-fold cross-validation and print per-fold and mean accuracy.

    Parameters
    ----------
    clf : estimator
        Object with ``fit`` and ``predict``. It is fitted in place once per
        fold, so after the call it holds the model from the last fold.
    folds : int, default 5
        Number of splits passed to ``KFold(n_splits=...)``; all other KFold
        settings are left at their defaults.
    X, y : array-like, optional
        Features and labels. When omitted, the notebook-level ``X_df`` and
        ``y_df`` are used, which is the original behaviour.

    Returns
    -------
    None
        Results are printed only.
    """
    # Fall back to the notebook globals so existing calls keep working.
    features = np.asarray(X_df if X is None else X)
    labels = np.asarray(y_df if y is None else y)

    kfold = KFold(n_splits=folds)
    scores = []

    for iter_count, (train_index, test_index) in enumerate(kfold.split(features)):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        clf.fit(X_train, y_train)  # train on this fold's training rows
        pred = clf.predict(X_test)  # predict this fold's held-out rows
        accuracy = accuracy_score(y_test, pred)
        scores.append(accuracy)
        print("교차 검증 {0} 정확도: {1:.4f}".format(iter_count, accuracy))
    mean_score = np.mean(scores)  # mean accuracy across all folds
    print(' ')
    print('평균 정확도: {0:.4f}'.format(mean_score))


In [16]:
exec_kfold(dt_clf)

교차 검증 0 정확도: 0.7542
교차 검증 1 정확도: 0.7809
교차 검증 2 정확도: 0.7865
교차 검증 3 정확도: 0.7697
교차 검증 4 정확도: 0.8202
 
평균 정확도: 0.7823


In [17]:
exec_kfold(rf_clf)

교차 검증 0 정확도: 0.7989
교차 검증 1 정확도: 0.7640
교차 검증 2 정확도: 0.8202
교차 검증 3 정확도: 0.7921
교차 검증 4 정확도: 0.8315
 
평균 정확도: 0.8013


In [18]:
exec_kfold(lr_clf)

교차 검증 0 정확도: 0.7933
교차 검증 1 정확도: 0.7921
교차 검증 2 정확도: 0.7753
교차 검증 3 정확도: 0.7472
교차 검증 4 정확도: 0.8427
 
평균 정확도: 0.7901




In [19]:
## cross_val_score(): for classifiers it automatically uses stratified k-fold, so the scores differ slightly from the plain k-fold above
from sklearn.model_selection import cross_val_score

In [20]:
# Decision tree: mean accuracy over 5 cross-validation folds
scores = cross_val_score(dt_clf, X_df, y_df, cv=5, scoring='accuracy')
scores.mean()

0.7835081515022234

In [30]:
# Random forest: mean accuracy over 5 cross-validation folds
# (scoring is spelled out; accuracy is already the default score for classifiers)
scores = cross_val_score(rf_clf, X_df, y_df, cv=5, scoring='accuracy')
scores.mean()

0.807078761622775

In [31]:
# Logistic regression: mean accuracy over 5 cross-validation folds
# (scoring is spelled out; accuracy is already the default score for classifiers)
scores = cross_val_score(lr_clf, X_df, y_df, cv=5, scoring='accuracy')
scores.mean()



0.7879080937673661

In [32]:
##gridsearchcv
from sklearn.model_selection import GridSearchCV

In [36]:
# Grid search is run for the decision tree only (reason: the hyperparameters to specify differ from model to model).

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=11)

# Hyperparameter grid to search
params={'max_depth': [2,3,5,10], 'min_samples_split': [2,3,5], 'min_samples_leaf': [1,5,8]}

# Fit: every grid combination is scored by accuracy with cv=5 on the training split
grid_dclf=GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=11,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 3, 5, 10], 'min_samples_split': [2, 3, 5], 'min_samples_leaf': [1, 5, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [37]:
# Results
print(grid_dclf.best_params_) # best hyperparameter combination found
print(grid_dclf.best_score_) # cross-validation score for that combination

{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.7991573033707865


In [38]:
# Estimator with the best hyperparameters found by the grid search
best_dclf=grid_dclf.best_estimator_

# Predict on the held-out test split
pred = best_dclf.predict(X_test)
# Accuracy on the held-out test split
accuracy = accuracy_score(y_test, pred)
print(accuracy) 

0.8715083798882681
