# データ解析：分類、アンサンブル

# 乳がんデータセット

分類とアンサンブルの内容は、下記URLの内容の一部を参考にした：

https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/

In [7]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast-cancer dataset and wrap its feature
# matrix in a DataFrame whose columns carry the feature names.
cancer = load_breast_cancer()
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
# Preview the first five rows (rendered as the cell output).
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
# Attach the label as a categorical column: the integer targets are used as
# codes into the dataset's target names.
df['class'] = pd.Categorical.from_codes(
    codes=cancer.target,
    categories=cancer.target_names,
)
# Preview again to confirm the new 'class' column is present.
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [9]:
# Reproducible random hold-out split: seed the global NumPy RNG, draw one
# uniform number per row, and flag the row for training when the draw
# is <= 0.75.
np.random.seed(0)
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= .75
# The flag column is already boolean, so it can index the frame directly;
# its negation selects the validation rows.
train = df[df['is_train']]
valid = df[~df['is_train']]
print(len(train), len(valid))

(435, 134)


1.1. SVM分類

In [35]:
from sklearn.svm import SVC

# Features: every column except the label and the train/valid flag.
trainx = train.drop(['class','is_train'],axis=1)
# Encode labels with the categorical's own codes so the class -> integer
# mapping is fixed by the category order. pd.factorize numbers values by
# order of first appearance, so calling it separately on the training and
# validation subsets could assign different integers to the same class.
trainy = train['class'].cat.codes.values
# NOTE(review): the recorded validation output for this model shows a single
# predicted class; presumably the RBF kernel with gamma=1 on unscaled
# features is the cause -- confirm, and consider scaling the features.
clfsvc1 = SVC(C=1, gamma=1)
clfsvc1.fit(trainx, trainy)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [39]:
from sklearn.metrics import accuracy_score
# Hold-out features: every column except the label and the split flag.
validx = valid.drop(['class','is_train'],axis=1)
# Use the categorical's own codes for the true labels. pd.factorize assigns
# integers by order of first appearance within this subset, which is not
# guaranteed to match the encoding used when the model was fitted.
validy = valid['class'].cat.codes.values
preds = clfsvc1.predict(validx)
# Dump inputs, true labels and predictions for manual inspection.
print(validx)
print(validy)
print(preds)
print('accuracy:', accuracy_score(validy, preds))
# Confusion table: rows are true classes, columns are predicted classes.
pd.crosstab(validy, preds, rownames=['true class'], colnames=['pred class'])

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
7         13.710         20.83           90.20      577.9          0.11890   
8         13.000         21.82           87.50      519.8          0.12730   
10        16.020         23.24          102.70      797.8          0.08206   
13        15.850         23.95          103.70      782.7          0.08401   
17        16.130         20.68          108.10      798.8          0.11700   
18        19.810         22.15          130.00     1260.0          0.09831   
19        13.540         14.36           87.46      566.3          0.09779   
20        13.080         15.71           85.63      520.0          0.10750   
21         9.504         12.44           60.34      273.9          0.10240   
23        21.160         23.04          137.20     1404.0          0.09428   
27        18.610         20.25          122.10     1094.0          0.09440   
31        11.840         18.70           77.93      440.6       

pred class,1
true class,Unnamed: 1_level_1
0,48
1,86


1.2. SVM分類（別のハイパーパラメータ）

In [14]:
from sklearn.svm import SVC

# Features: every column except the label and the train/valid flag.
trainx = train.drop(['class','is_train'],axis=1)
# Encode labels with the categorical's own codes so the class -> integer
# mapping is fixed by the category order. pd.factorize numbers values by
# order of first appearance, so calling it separately on the training and
# validation subsets could assign different integers to the same class.
trainy = train['class'].cat.codes.values
# Second SVM variant: linear kernel instead of the default RBF kernel.
clfsvc2 = SVC(kernel="linear", C=1)
clfsvc2.fit(trainx, trainy)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
from sklearn.metrics import accuracy_score
# Hold-out features: every column except the label and the split flag.
validx = valid.drop(['class','is_train'],axis=1)
# Use the categorical's own codes for the true labels. pd.factorize assigns
# integers by order of first appearance within this subset, which is not
# guaranteed to match the encoding used when the model was fitted.
validy = valid['class'].cat.codes.values
preds = clfsvc2.predict(validx)
print('accuracy:', accuracy_score(validy, preds))
# Confusion table: rows are true classes, columns are predicted classes.
pd.crosstab(validy, preds, rownames=['true class'], colnames=['pred class'])

('accuracy:', 0.96268656716417911)


pred class,0,1
true class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43,5
1,0,86


1.3. アンサンブル(RandomForest)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Features: every column except the label and the train/valid flag.
trainx = train.drop(['class','is_train'],axis=1)
# Encode labels with the categorical's own codes so the class -> integer
# mapping is fixed by the category order. pd.factorize numbers values by
# order of first appearance, so calling it separately on the training and
# validation subsets could assign different integers to the same class.
trainy = train['class'].cat.codes.values
# Random forest with a fixed seed so the fitted ensemble is reproducible;
# n_jobs=2 runs the work on two parallel jobs.
clfrf = RandomForestClassifier(n_jobs=2, random_state=0)
clfrf.fit(trainx, trainy)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [21]:
from sklearn.metrics import accuracy_score
# Hold-out features: every column except the label and the split flag.
validx = valid.drop(['class','is_train'],axis=1)
# Use the categorical's own codes for the true labels. pd.factorize assigns
# integers by order of first appearance within this subset, which is not
# guaranteed to match the encoding used when the model was fitted.
validy = valid['class'].cat.codes.values
preds = clfrf.predict(validx)
print('accuracy:', accuracy_score(validy, preds))
# Confusion table: rows are true classes, columns are predicted classes.
pd.crosstab(validy, preds, rownames=['true class'], colnames=['pred class'])

accuracy: 0.9477611940298507


pred class,0,1
true class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43,5
1,2,84
