# Import python packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

### Scikit learn - machine learning

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

### Scikit learn - 성능 평가

In [3]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Data set

### scikit learn 제공 toy data set 중 iris data 사용
- iris 붓꽃
- 꽃받침, 꽃잎의 길이, 너비 데이터를 사용해 '속(생물 분류)' 분류 (setosa, versicolor, virginica)

In [4]:
data = datasets.load_iris()
data_df = pd.DataFrame(data.data, columns=data.feature_names)
data_df['target'] = data.target
data_df['target_name'] = data.target_names[data.target]
data_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
5,5.4,3.9,1.7,0.4,0,setosa
6,4.6,3.4,1.4,0.3,0,setosa
7,5.0,3.4,1.5,0.2,0,setosa
8,4.4,2.9,1.4,0.2,0,setosa
9,4.9,3.1,1.5,0.1,0,setosa


In [5]:
type(data)

sklearn.datasets.base.Bunch

### Train, test data set

In [6]:
x = data.data
y = data.target_names[data.target]

In [7]:
type(x)

numpy.ndarray

In [8]:
trainset_size = int(round(len(x)*0.80))
trainset_size

120

In [9]:
x_train = x[:trainset_size]
y_train = y[:trainset_size]
len(x_train)

120

In [10]:
x_test = x[trainset_size:]
y_test = y[trainset_size:]
len(x_test)

30

# Classification

### Naive Bayes

- Gaussian Naive Bayes 분류 모델 생성 후
- x_train, y_train 적용해 훈련

In [11]:
clf_nb = GaussianNB()
clf_nb.fit(x_train, y_train)

GaussianNB(priors=None)

- 분류 모델에 x_test 적용해 분류 결과 추출

In [13]:
y_pred_nb = clf_nb.predict(x_test)

+ classification report와 accuracy

In [14]:
print('\n Classification Report \n')
print(classification_report(y_test, y_pred_nb))
print('\n Accuracy \n')
print(accuracy_score(y_test, y_pred_nb, normalize=True))


 Classification Report 

             precision    recall  f1-score   support

 versicolor       0.00      0.00      0.00         0
  virginica       1.00      0.93      0.97        30

avg / total       1.00      0.93      0.97        30


 Accuracy 

0.933333333333


  'recall', 'true', average, warn_for)


### Decision Tree

In [15]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [17]:
y_pred_dt = clf_dt.predict(x_test)

In [18]:
print('\n Classification Report \n')
print(classification_report(y_test, y_pred_dt))
print('\n Accuracy \n')
print(accuracy_score(y_test, y_pred_dt, normalize=True))


 Classification Report 

             precision    recall  f1-score   support

 versicolor       0.00      0.00      0.00         0
  virginica       1.00      0.83      0.91        30

avg / total       1.00      0.83      0.91        30


 Accuracy 

0.833333333333


  'recall', 'true', average, warn_for)


### Random Forest

In [19]:
clf_rf = RandomForestClassifier(n_estimators=100)
clf_rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [20]:
y_pred_rf = clf_rf.predict(x_test)

In [21]:
print('\n Classification Report \n')
print(classification_report(y_test, y_pred_rf))
print('\n Accuracy \n')
print(accuracy_score(y_test, y_pred_rf, normalize=True))


 Classification Report 

             precision    recall  f1-score   support

 versicolor       0.00      0.00      0.00         0
  virginica       1.00      0.73      0.85        30

avg / total       1.00      0.73      0.85        30


 Accuracy 

0.733333333333


  'recall', 'true', average, warn_for)


### SVM

In [22]:
clf_svm = SVC()
clf_svm.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
y_pred_svm = clf_svm.predict(x_test)

In [24]:
print('\n Classification Report \n')
print(classification_report(y_test, y_pred_svm))
print('\n Accuracy \n')
print(accuracy_score(y_test, y_pred_svm, normalize=True))


 Classification Report 

             precision    recall  f1-score   support

 versicolor       0.00      0.00      0.00         0
  virginica       1.00      0.83      0.91        30

avg / total       1.00      0.83      0.91        30


 Accuracy 

0.833333333333


  'recall', 'true', average, warn_for)
