In [1]:
# About scikit-learn: print the installed version
import sklearn

print(sklearn.__version__)

0.24.1


In [19]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

# load_iris() returns a Bunch object; the container type itself is not
# important here -- think of it simply as a bundle of values that we put
# into a DataFrame next.
iris = load_iris()

# Feature matrix of shape (150, 4) and target vector of shape (150,)
iris_data, iris_label = iris.data, iris.target

# Show the feature column names alongside the class names
print(iris.feature_names, iris.target_names)

# Wrap the feature matrix in a DataFrame, one column per feature
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] ['setosa' 'versicolor' 'virginica']


In [20]:
# Preview the first three rows of the feature DataFrame
iris_df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [21]:
# Append the targets to the DataFrame as a new column named 'label',
# then preview the first three rows again
iris_df['label'] = iris.target
iris_df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [24]:
# The target has three classes, named 'setosa', 'versicolor' and 'virginica';
# show how many rows carry each encoded label, together with the class names
iris_df['label'].value_counts(), iris.target_names

(0    50
 1    50
 2    50
 Name: label, dtype: int64,
 array(['setosa', 'versicolor', 'virginica'], dtype='<U10'))

In [29]:
# Split the full dataset into a training part and a held-out test part.
# train_test_split(data, label, test_size, random_state) returns, in order:
# train features, test features, train targets, test targets.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=11,
)


In [30]:
# Create the decision-tree classifier object (random_state fixed at 11)

dt_clf = DecisionTreeClassifier(random_state=11)

# Train the classifier on the training split
dt_clf.fit(X_train, y_train)


DecisionTreeClassifier(random_state=11)

In [31]:
# With the classifier trained, predict the labels of the test split
pred = dt_clf.predict(X_test)

In [32]:
# Measure accuracy by comparing the predictions with the true test labels
from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred)

0.9333333333333333

In [35]:
# The dataset object is dictionary-like, so listing its keys shows which
# pieces of information it contains
keys = iris.keys()
keys

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [41]:
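# A single train/test split can be biased: the model may score well on that
# particular split yet perform poorly on data it has never seen.
# Cross-validation addresses this by evaluating on several different splits.
# 1) KFold: divide the data into k folds, train on k-1 of them and test on the
#    remaining one, and repeat k times so every fold is used for testing once.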


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

# Reload the dataset and keep the feature matrix and the target vector
iris = load_iris()
features, label = iris.data, iris.target

# Classifier that is refit on every fold of the cross-validation loop
dt_clf = DecisionTreeClassifier(random_state=156)

In [72]:
# Split the data into 5 folds: each iteration trains on 4 folds and tests on
# the remaining one, for 5 iterations in total.
# n_splits is the number of folds.
kfold = KFold(n_splits=5)

# Empty list that collects the accuracy of every iteration
cv_accuracy = []

In [73]:

# split() returns a generator object; the index arrays are only produced
# when it is iterated
kfold.split(features)

<generator object _BaseKFold.split at 0x7f89d004c970>

In [74]:
# Run the 5-fold cross-validation by hand and report the accuracy per fold.
#
# The accuracy list is (re)created here instead of relying on the list built
# in an earlier cell, so re-running this cell never mixes in scores left over
# from a previous run.
cv_accuracy = []

# kfold.split() yields one (train_index, test_index) pair per fold; both are
# arrays of row positions. enumerate(start=1) supplies the 1-based trial number.
for n_iter, (train_index, test_index) in enumerate(kfold.split(features), start=1):

    # Select the feature rows and labels belonging to this fold
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    # Fit on the training part, then predict the held-out part
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Accuracy of this fold, rounded to 4 decimals
    accuracy = np.round(accuracy_score(y_test, pred), 4)

    # Row counts, to confirm the train/test sizes of the split
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]

    # Report accuracy together with the trial number and split sizes
    print('{0} th trial: accuracy:{1}, train_size:{2}, test_size:{3}'
          .format(n_iter, accuracy, train_size, test_size))

    # Also show which rows ended up in the test fold
    print('{0} th trial test index:{1}'.format(n_iter, test_index))

    cv_accuracy.append(accuracy)

# Average accuracy over all folds
print(np.mean(cv_accuracy))

1 th trial: accuracy:1.0, train_size:120, test_size:30
1 th trial test index:[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
2 th trial: accuracy:0.9667, train_size:120, test_size:30
2 th trial test index:[30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
 54 55 56 57 58 59]
3 th trial: accuracy:0.8667, train_size:120, test_size:30
3 th trial test index:[60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
 84 85 86 87 88 89]
4 th trial: accuracy:0.9333, train_size:120, test_size:30
4 th trial test index:[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119]
5 th trial: accuracy:0.7333, train_size:120, test_size:30
5 th trial test index:[120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
0.9


In [75]:
# Plain KFold can still produce biased folds.
# Example: if 1% of the rows are abnormal and 99% are normal, cutting the data
# into equal-sized pieces does not guarantee that each piece keeps that ratio.
# The folds should therefore follow the class distribution of the original
# data; StratifiedKFold looks at that distribution and splits train/test
# so that both resemble it.

import pandas as pd

iris = load_iris()

# Feature columns plus the target as an extra 'label' column
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)

# Class distribution of the target values
iris_df['label'].value_counts()

0    50
1    50
2    50
Name: label, dtype: int64

In [82]:
# Plain KFold with 3 splits on the class-ordered iris rows: each test fold
# holds exactly one class while its training folds hold only the other two,
# so the class being tested is never seen during training.
kfold = KFold(n_splits=3)
n_iter = 0

for train_index, test_index in kfold.split(iris_df):
    n_iter += 1
    # split() yields row positions, so select with .iloc (positional) rather
    # than Series[...] (label-based, correct only for a default RangeIndex).
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]

    print('train:\n', label_train.value_counts())
    print('test:\n', label_test.value_counts())

train:
 1    50
2    50
Name: label, dtype: int64
test:
 0    50
Name: label, dtype: int64
train:
 0    50
2    50
Name: label, dtype: int64
test:
 1    50
Name: label, dtype: int64
train:
 0    50
1    50
Name: label, dtype: int64
test:
 2    50
Name: label, dtype: int64


In [81]:
# Same check with StratifiedKFold: every fold keeps all three classes.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3)
n_iter = 0

# Unlike KFold, split() also needs the target so it can stratify on it:
# skf.split(data, target)
for train_index, test_index in skf.split(iris_df, iris_df['label']):
    n_iter += 1
    # split() yields row positions, so select with .iloc (positional) rather
    # than Series[...] (label-based, correct only for a default RangeIndex).
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]

    print(label_train.value_counts())
    print()
    print(label_test.value_counts())

2    34
0    33
1    33
Name: label, dtype: int64

0    17
1    17
2    16
Name: label, dtype: int64
1    34
0    33
2    33
Name: label, dtype: int64

0    17
2    17
1    16
Name: label, dtype: int64
0    34
1    33
2    33
Name: label, dtype: int64

1    17
2    17
0    16
Name: label, dtype: int64


In [84]:
# Conclusion

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold

# Reload the dataset; `feature` is the feature matrix, `label` the targets
iris = load_iris()
feature, label = iris.data, iris.target


In [87]:
# Plain KFold with 3 splits and no shuffling -- the worst case.
# The iris rows are ordered by class, so each test fold consists of a class
# that is absent from its training folds; the model cannot predict a class it
# never saw, and every answer is wrong (accuracy 0.0 on every fold).

kfold = KFold(n_splits=3)
dt_clf = DecisionTreeClassifier(random_state=11)

cv_accuracy = []

for train_index, test_index in kfold.split(feature):
    X_train, y_train = feature[train_index], label[train_index]
    X_test, y_test = feature[test_index], label[test_index]

    # fit() returns the estimator, so the prediction can be chained onto it
    pred = dt_clf.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, pred)
    cv_accuracy.append(accuracy)
    print(accuracy)

# Mean accuracy over the folds
print(np.mean(cv_accuracy))
    

0.0
0.0
0.0
0.0


In [88]:
# StratifiedKFold with 3 splits: every fold contains all classes, so accuracy
# is high and can no longer collapse to 0% the way plain KFold did.

skf = StratifiedKFold(n_splits=3)
dt_clf = DecisionTreeClassifier(random_state=11)

cv_accuracy = []

# The labels are passed to split() so the folds can be stratified on them
for train_index, test_index in skf.split(feature, label):
    X_train, y_train = feature[train_index], label[train_index]
    X_test, y_test = feature[test_index], label[test_index]

    # fit() returns the estimator, so the prediction can be chained onto it
    pred = dt_clf.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, pred)
    cv_accuracy.append(accuracy)
    print(accuracy)

# Mean accuracy over the folds
print(np.mean(cv_accuracy))
    

0.98
0.92
0.98
0.96


In [89]:
# GridSearchCV is a hyper-parameter tuning method. This is the grid it will
# search: every combination of the listed values
# (3 depths x 2 split thresholds = 6 candidates).
grid_parameters = dict(
    max_depth=[1, 2, 3],
    min_samples_split=[2, 3],
)

In [111]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split


# NOTE: here `iris_data` is the whole dataset object returned by load_iris(),
# not the bare feature array that carried the same name in the first cells.
iris_data = load_iris()

# Hold out 20% of the rows as the final test set
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data,
    iris_data.target,
    test_size=0.2,
    random_state=11,
)

# Base estimator whose hyper-parameters are tuned
dtree = DecisionTreeClassifier(random_state=11)

# Candidate values to try for each hyper-parameter
grid_parameters = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}


In [112]:
import pandas as pd

# Grid search over grid_parameters for the decision tree.
# cv=3 produces three test scores per candidate (split0..split2 in the results);
# refit=True keeps the result for the best parameters, which the later cells
# use to predict without calling fit again.
grid_dtree = GridSearchCV(dtree, grid_parameters, cv=3, refit=True)

In [113]:
# Evaluate every parameter combination on the training data; all results are
# recorded, and the best-parameter result is kept as well (refit=True)
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=11),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]})

In [114]:
# Turn the recorded cross-validation results into a DataFrame for inspection
scores_df = pd.DataFrame(grid_dtree.cv_results_)

In [115]:
# Preview the first rows of the results table (one row per parameter combination)
scores_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001706,0.000291,0.000567,8.5e-05,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.675,0.675,0.675,0.675,1.110223e-16,5
1,0.001596,0.000251,0.000519,9.3e-05,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.675,0.675,0.675,0.675,1.110223e-16,5
2,0.000786,3.1e-05,0.00027,1.1e-05,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3
3,0.000773,6.2e-05,0.00026,1.5e-05,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3
4,0.00069,4.6e-05,0.000234,1.8e-05,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.925,1.0,0.975,0.966667,0.03118048,1


In [116]:
# Keep only the columns of interest: the parameters, the mean test score and
# its rank, and the test score of each of the three splits
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.675,5,0.675,0.675,0.675
1,"{'max_depth': 1, 'min_samples_split': 3}",0.675,5,0.675,0.675,0.675
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.966667,1,0.925,1.0,0.975
5,"{'max_depth': 3, 'min_samples_split': 3}",0.966667,1,0.925,1.0,0.975


In [117]:
# Parameter combination selected as best by the grid search
grid_dtree.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [118]:
# Score of the best combination (equals its mean_test_score in the results table)
grid_dtree.best_score_

0.9666666666666667

In [106]:
# Estimator configured with the best parameters
grid_dtree.best_estimator_

DecisionTreeClassifier(max_depth=3, random_state=11)

In [121]:
# No need to call fit again: predict on the held-out test set directly with
# the fitted grid-search object
pred = grid_dtree.predict(X_test)

# Accuracy on the held-out test set
accuracy_score(y_test, pred)

0.9333333333333333

In [122]:
# Predict with the best estimator taken out of the grid-search object; the
# accuracy shown below is the same as when predicting via grid_dtree itself
estimator = grid_dtree.best_estimator_
pred2 = estimator.predict(X_test)

accuracy_score(y_test, pred2)

0.9333333333333333