# Decision Tree & Ensemble

### 데이터 준비

In [15]:
import pandas as pd
from sklearn import metrics

In [16]:
cars = pd.read_csv('automobile.csv')

In [17]:
variables = ['bore', 'city_mpg', 'compression_ratio', 'curb_weight', 'engine_size',
             'horsepower', 'peak_rpm', 'city_mpg', 'price']
X = cars[variables]
y = cars['doors']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

### 모형 평가 출력 함수

In [5]:
def model_performance(y_test, y_pred):    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label='four')))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label='four')))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label='four')))

## 1. Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [8]:
y_tree = tree.predict(X_test)

In [9]:
model_performance(y_test, y_tree)

confusion matrix
[[30 10]
 [16  8]]
accuracy : 0.59375
precision : 0.6521739130434783
recall : 0.75
F1 : 0.6976744186046512


In [21]:
varDic = {'var':variables, 'importance':tree.feature_importances_}
importance = pd.DataFrame(varDic)
importance

Unnamed: 0,importance,var
0,0.050379,bore
1,0.0,city_mpg
2,0.177097,compression_ratio
3,0.19102,curb_weight
4,0.0,engine_size
5,0.087669,horsepower
6,0.055751,peak_rpm
7,0.0,city_mpg
8,0.438085,price


## 2. Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rf = RandomForestClassifier(n_estimators=10, random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [24]:
y_rf = rf.predict(X_test)

In [25]:
model_performance(y_test, y_rf)

confusion matrix
[[32 11]
 [ 8 13]]
accuracy : 0.703125
precision : 0.8
recall : 0.7441860465116279
F1 : 0.7710843373493975


## 3. Gradient Boosting Tree

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
gb = GradientBoostingClassifier(n_estimators=10, random_state=0)
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=10, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

In [28]:
y_gb = gb.predict(X_test)

In [29]:
model_performance(y_test, y_gb)

confusion matrix
[[38  5]
 [15  6]]
accuracy : 0.6875
precision : 0.7169811320754716
recall : 0.8837209302325582
F1 : 0.7916666666666666


# 범주형 변수를 dummy 변수로 변환하여 모든 변수 사용

In [32]:
cate_var = cars.columns[cars.dtypes == 'object'].difference(['doors'])
cate_var

Index(['aspiration', 'body', 'cylinders', 'engine_location', 'engine_type',
       'fuel', 'fuel_system', 'maker', 'wheels'],
      dtype='object')

In [33]:
dummyVar = pandas.get_dummies(cars[cate_var])

In [35]:
X_all = pandas.concat([X, dummyVar], axis=1)

In [36]:
X_all.head()

Unnamed: 0,bore,city_mpg,compression_ratio,curb_weight,engine_size,horsepower,peak_rpm,city_mpg.1,price,aspiration_std,...,maker_plymouth,maker_porsche,maker_saab,maker_subaru,maker_toyota,maker_volkswagen,maker_volvo,wheels_4wd,wheels_fwd,wheels_rwd
0,3.19,24,10.0,2337,109,102,5500,24,13950,1,...,0,0,0,0,0,0,0,0,1,0
1,3.19,18,8.0,2824,136,115,5500,18,17450,1,...,0,0,0,0,0,0,0,1,0,0
2,3.19,19,8.5,2844,136,110,5500,19,17710,1,...,0,0,0,0,0,0,0,0,1,0
3,3.13,17,8.3,3086,131,140,5500,17,23875,0,...,0,0,0,0,0,0,0,0,1,0
4,3.5,23,8.8,2395,108,101,5800,23,16430,1,...,0,0,0,0,0,0,0,0,0,1


In [37]:
X_all_train, X_all_test, y_train, y_test = train_test_split(X_all, y, test_size=0.4)

### SVC

In [39]:
from sklearn.svm import SVC

In [40]:
model = SVC(kernel='rbf')
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

In [41]:
model_performance(y_test, y_pred)

confusion matrix
[[41  0]
 [22  1]]
accuracy : 0.65625
precision : 0.6507936507936508
recall : 1.0
F1 : 0.7884615384615385


### DecisionTree

In [42]:
model = DecisionTreeClassifier()
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

In [43]:
model_performance(y_test, y_pred)

confusion matrix
[[34  7]
 [ 9 14]]
accuracy : 0.75
precision : 0.7906976744186046
recall : 0.8292682926829268
F1 : 0.8095238095238095


### RandomForest

In [44]:
model = RandomForestClassifier()
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

In [45]:
model_performance(y_test, y_pred)

confusion matrix
[[37  4]
 [ 6 17]]
accuracy : 0.84375
precision : 0.8604651162790697
recall : 0.9024390243902439
F1 : 0.8809523809523809


### GradientBoosting

In [46]:
model = GradientBoostingClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

In [47]:
model_performance(y_test, y_pred)

confusion matrix
[[35  6]
 [ 9 14]]
accuracy : 0.765625
precision : 0.7954545454545454
recall : 0.8536585365853658
F1 : 0.8235294117647058
