In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

In [3]:
# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

In [5]:
#다운로드 링크: 
train_df = pd.read_csv('train_preprocessed.csv')
test_df = pd.read_csv('test_preprocessed.csv')
combine = [train_df, test_df]

In [6]:
train_df.info()
print("\n===============================================\n")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived     891 non-null int64
Pclass       891 non-null int64
Sex          891 non-null int64
Age          891 non-null int64
Fare         891 non-null float64
Embarked     891 non-null int64
Title        891 non-null int64
IsAlone      891 non-null int64
Age*Class    891 non-null int64
dtypes: float64(1), int64(8)
memory usage: 62.7 KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null int64
Title          418 non-null int64
IsAlone        418 non-null int64
Age*Class      418 non-null int64
dtypes: float64(1), int64(8)
memory usage: 29.4 KB


### Training / Validation set 분리

In [7]:
from sklearn.model_selection import train_test_split

X = train_df.drop("Survived", axis=1)
Y = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()
#test데이터는 validation data로, 타당성을 검증해준다.(훈련데이터셋과는 다름)
#train data와 test data는 분포가 같다는 가정이나 train으로 모델을 만들고 test로 평가를 한다.

X_train, X_val, y_train, y_val = \
train_test_split(X, Y, test_size = 0.20, random_state=5)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((712, 8), (712,), (179, 8), (179,))

### 파라미터 비교 / 확인
#### class_weight
- 기본 파라미터 확인

In [32]:
# training과 validation을 8:2로 나눈 뒤 train으로 모델을 만들고 validation으로 예측
'''
training : 모델을 만들기 위한 데이터
validation : 예측을 위한 데이터

- 둘의 분포는 같다는 가정. ( 같은 모집단에서 생성된 데이터라는 가정)
'''
decision_tree = DecisionTreeClassifier()
decision_tree.get_params() #모양을 결정하는 파라미터들

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

 - class_weight = 0:1, 1:1

In [9]:
decision_tree=DecisionTreeClassifier(class_weight={0:1,1:1})
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  94.1    validation_acc:  83.8


 - class_weight = 0:1, 1:10

In [14]:
"""
class_weight는 모델의 성능을 나타내진 못하지만 모델의 특성을 나타내준다.
y값(target변수)이 0(죽었다)과 1(산다)이 있을 때, 산다고 예측하는 것이 더 중요한 경우,
산다고 예측하는 것에 대해서 비중을 높게 줄 수 있다. 키값을 주어, 0죽는다에는 1
1산다에는 10의 비중을 두어 중요성을 줄 수 있다.
"""
decision_tree=DecisionTreeClassifier(class_weight={0:1,1:10})
#앞의 것보다 성능은 떨어져도 살아남은 사람을 살아남았다고 예측한 확률은 높다.
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  83.99    validation_acc:  68.72


 - class_weight = 0:10, 1:1

In [13]:
decision_tree=DecisionTreeClassifier(class_weight={0:10,1:1})
#앞의 것보다 성능은 떨어져도 살아남은 사람을 살아남았다고 예측한 확률은 높다.
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  91.99    validation_acc:  82.12


 - gini impurity

In [11]:
#성능을 높게 하는데 쓰임
decision_tree=DecisionTreeClassifier(criterion="gini")
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  94.1    validation_acc:  83.8


 - entropy

In [12]:
#변수의 설명력을 높여줌
decision_tree=DecisionTreeClassifier(criterion="entropy")
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  94.1    validation_acc:  84.36


### 가지치기(Pruning) 관련 파라미터 비교
 - 기본 파라미터 확인

In [None]:
decision_tree = DecisionTreeClassifier()
decision_tree.get_params()

 - 가지치기 없음 

In [None]:
decision_tree=DecisionTreeClassifier()
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

 - 가지치기(max_depth 사용)

In [19]:
decision_tree=DecisionTreeClassifier(max_depth=19)
#max_depth를 높일수록 성능이 좋아짐, depth가 매우 높았을 때 특징은 train과 test의 차이가 커진다는 것이다.
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  94.1    validation_acc:  84.36


 - 가지치기(max_leaf_nodes 사용)

In [22]:
decision_tree=DecisionTreeClassifier(max_leaf_nodes=30)
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  88.48    validation_acc:  86.03


 - 가지치기(min_samples_split 사용)

In [24]:
decision_tree=DecisionTreeClassifier(min_samples_split=30)
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  88.06    validation_acc:  86.03


 - 가지치기(min_impurity_decrease 사용)

In [33]:
decision_tree=DecisionTreeClassifier(min_impurity_split=0.01)
decision_tree.fit(X_train,y_train)
#Y_pred=decision_tree.predict(X_test) 안써줘도 됨
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)

training_acc:  94.1    validation_acc:  83.24


 - 가지치기(min_impurity_split 사용)

In [None]:
decision_tree = DecisionTreeClassifier(min_impurity_split=0.01)
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_traing = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_validation = round(decision_tree.score(X_val, y_val) * 100, 2)
print("training_acc:", acc_traing, "   ", "validation_acc: ", acc_validation)

### 파라미터 최적화
#### Exhaustive Grid Search
 - 기본 파라미터 확인 

In [None]:
decision_tree = DecisionTreeClassifier()
decision_tree.get_params()

In [37]:
"""
Exhaustive Gird Search는 완결탐색으로 검사항목과 기억세트의 모든 항목과의 비교가 끝날 때까지 계속되는
탐색이다. Grid(격자무늬)
"""
from sklearn.model_selection import GridSearchCV

#고정할 파라미터 지정
#class_weight는 default 값(1) 사용
#presort: False
#splitter: best
decision_tree=DecisionTreeClassifier()
parameters={
    'criterion':['gini'],
    'max_depth':np.arange(2,10),
#     'max_leaf_nodes':np.arange(30,40),
#     'min_impurity_decrease': np.append(np.arange(0,0.05,0.005),0.0)
    'min_impurity_split': np.append(np.arange(0,0.02,0.005),0.0),
    'min_samples_leaf': np.append(np.arange(4,7),1),
    'min_samples_split': np.append(np.arange(18,22),2),
#     'presort': True,
    'min_weight_fraction_leaf': np.arange(0,0.5,0.05)}

parameters

clf=GridSearchCV(estimator=decision_tree,param_grid=parameters)
#decision_tree 모델의 최적의 파라미터를 찾아준다. 
clf.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini'], 'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9]), 'min_impurity_split': array([ 0.   ,  0.005,  0.01 ,  0.015,  0.   ]), 'min_samples_leaf': array([4, 5, 6, 1]), 'min_samples_split': array([18, 19, 20, 21,  2]), 'min_weight_fraction_leaf': array([ 0.  ,  0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,  0.45])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [None]:
## 파라미터 확인
clf.best_params_

In [None]:
#성능확인
acc_traing=round(decision_tree.score(X_train,y_train)*100,2)
acc_validation=round(decision_tree.score(X_val,y_val)*100,2)
print("training_acc: ",acc_traing,"  ","validation_acc: ",acc_validation)