In [3]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

## データの準備

In [35]:
# Build the model feature matrix
def makefeature(x):
    """Standardize the numeric columns of ``x`` and one-hot encode the rest.

    Parameters
    ----------
    x : pandas.DataFrame
        Input frame. It must contain the numeric columns listed in
        ``cn_num`` below; a column named ``'y'`` is removed if present.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the ``cn_num`` columns are scaled to zero mean
        and unit (sample) standard deviation and the remaining columns have
        been passed through ``pd.get_dummies``. ``x`` itself is not modified.
    """
    # Work from the argument (not a module-level frame) so the function can
    # be applied to any frame; copy so the caller's data is left untouched.
    feats = x.drop('y', axis=1, errors='ignore').copy()

    # Columns that get z-score scaling
    cn_num = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
    x_num = feats[cn_num]
    feats[cn_num] = (x_num - x_num.mean()) / x_num.std()

    # Expand the non-numeric columns into dummy (indicator) variables
    return pd.get_dummies(feats)

In [36]:
# Read the semicolon-delimited bank dataset and preview its first rows
bank = pd.read_csv("dataset/bank/bank-full.csv", delimiter=";")
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [37]:
# Build the feature matrix and pull out the target column.
# `axis` is passed by keyword: newer pandas releases no longer accept it
# positionally in DataFrame.drop.
features, label = makefeature(bank.drop('y', axis=1)), bank['y']

## SVMによる予測

In [45]:
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report

In [86]:
# Hold out 30% of the rows as a test set; the RandomState is seeded so the
# shuffle is repeatable.
random_state = np.random.RandomState(123)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=random_state,
)

In [46]:
# Fit a support vector classifier on the training split.
# The RBF kernel is spelled out here; it is also what SVC() uses by default.
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
# Predict class labels for the held-out rows
pred = clf.predict(X_test)

# Per-class precision, recall, F1 score and support
print(classification_report(y_test, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.97      0.95     11998
        yes       0.67      0.39      0.49      1566

avg / total       0.89      0.91      0.90     13564



## Random Forestによる予測

In [56]:
import numpy as np
from sklearn import ensemble
from sklearn import metrics

In [90]:
# Recreate the seeded RandomState and redo the 70/30 split, so the generator
# handed to the forest afterwards starts from a known state.
random_state = np.random.RandomState(123)
X_train, X_test, y_train, y_test = train_test_split(
    features, label,
    random_state=random_state,
    test_size=0.3,
)

In [89]:
# Train a 500-tree random forest, drawing its randomness from the shared
# `random_state` generator.
clf = ensemble.RandomForestClassifier(
    random_state=random_state,
    n_estimators=500,
)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x11de24828>,
            verbose=0, warm_start=False)

In [58]:
# Predict class labels for the held-out rows with whatever `clf` currently is.
# NOTE(review): this cell's execution count (58) is lower than the forest
# fit's (89) and the saved report is identical to the SVM report, so the saved
# output was probably produced while `clf` was still the SVC. Re-run after
# fitting the forest to confirm.
pred = clf.predict(X_test)

# Per-class precision, recall, F1 score and support
print(classification_report(y_test, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.97      0.95     11998
        yes       0.67      0.39      0.49      1566

avg / total       0.89      0.91      0.90     13564



## クロスバリデーション

In [62]:
from sklearn import cross_validation as cv
from sklearn import preprocessing

In [76]:
# Stratified 10-fold splitter built from the training labels
skf = cv.StratifiedKFold(y_train, n_folds=10)

# Print the row positions on the training side and the test side of each fold
for train, test in skf:
    print("%s %s" % (train, test))

[ 3084  3098  3120 ..., 31644 31645 31646] [   0    1    2 ..., 3177 3178 3180]
[    0     1     2 ..., 31644 31645 31646] [3084 3098 3120 ..., 6367 6368 6369]
[    0     1     2 ..., 31644 31645 31646] [6071 6075 6076 ..., 9539 9540 9541]
[    0     1     2 ..., 31644 31645 31646] [ 9168  9173  9187 ..., 12759 12760 12761]
[    0     1     2 ..., 31644 31645 31646] [11997 11998 12004 ..., 15895 15896 15897]
[    0     1     2 ..., 31644 31645 31646] [15222 15231 15238 ..., 19059 19062 19063]
[    0     1     2 ..., 31644 31645 31646] [18552 18585 18587 ..., 22204 22205 22206]
[    0     1     2 ..., 31644 31645 31646] [21659 21662 21671 ..., 25345 25346 25347]
[    0     1     2 ..., 31644 31645 31646] [25106 25138 25143 ..., 28502 28503 28504]
[    0     1     2 ..., 28502 28503 28504] [28269 28282 28288 ..., 31644 31645 31646]


In [71]:
# Convert the string class labels to 1/0 and flatten the result to 1-D
lb = preprocessing.LabelBinarizer()
y_train_bin = lb.fit_transform(y_train).ravel()

# Fresh, unfitted support vector classifier (RBF is SVC's default kernel)
clf = svm.SVC(kernel='rbf')

In [80]:
# F1 score of the classifier on each of the stratified folds
cv.cross_val_score(
    clf,
    X_train,
    y_train_bin,
    scoring='f1',
    cv=skf,
)

array([ 0.36329588,  0.38420108,  0.36111111,  0.38185255,  0.40740741,
        0.38931298,  0.39344262,  0.41666667,  0.38888889,  0.38356164])

In [92]:
# Cross-validated predictions for the training rows, using the same folds,
# summarised per class.
pred = cv.cross_val_predict(
    clf,
    X_train,
    y=y_train,
    cv=skf,
)
print(classification_report(y_train, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.97      0.95     27924
        yes       0.65      0.40      0.50      3723

avg / total       0.89      0.90      0.89     31647



## Grid Search

In [94]:
from sklearn import grid_search as gs

In [96]:
# Hyperparameter values to explore: every (C, gamma) combination is tried
param_grid = [
    {
        'C': [0.5, 1],
        'gamma': [0.05, 0.1],
    },
]

# Run the grid search over an SVC with 10-fold cross-validation
svc = svm.SVC()
clf = gs.GridSearchCV(estimator=svc, param_grid=param_grid, cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.5, 1], 'gamma': [0.05, 0.1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [None]:
# Predict on the held-out test rows with the fitted grid-search object and
# print per-class precision, recall, F1 score and support.
pred = clf.predict(X_test)
print(classification_report(y_test, pred, target_names=['no', 'yes']))