# 総当たり戦

In [3]:
from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

boston = load_boston()
X=pd.DataFrame(boston.data[:,:], columns=boston.feature_names)
y=pd.DataFrame(boston.target[:])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

lin_1d = LinearRegression()
x_train = X_train.loc[:, ['LSTAT']].as_matrix()
lin_1d.fit(x_train, y_train)

n = np.linspace(np.min(x_train),np.max(x_train), 1000)
y_1d_fit=lin_1d.predict(n[:,np.newaxis])

# LSTAT
x_test = X_test['LSTAT'].values[:,np.newaxis]
score_1d = lin_1d.score(x_test, y_test)
print("一次式における'LSTAT'の住宅価格への決定係数は%f" % (score_1d))

一次式における'LSTAT'の住宅価格への決定係数は0.430957




In [None]:
from itertools import chain, combinations
dim=[3,4]
i = set(list(X.columns))

for di in dim:

    degree_=PolynomialFeatures(degree=di)

    for p in chain.from_iterable(combinations(i, r) for r in range(len(i)+1)):
        if len(p) == 0:
            continue
        else:
            x_train = X_train.loc[:, p].as_matrix()
            x_train_d = degree_.fit_transform(x_train)

            lin_ = LinearRegression(normalize=True)
            #normalize=Trueは入力データを正規化してトレーニングすることを意味します。    
            lin_.fit(x_train_d,y_train)

            x_test = X_test.loc[:, p].as_matrix()
            x_test_d = degree_.fit_transform(x_test)

            score_d = lin_.score(x_test_d, y_test)

            if score_d >= 0.75:
                print('d:{0} s:{1} p:{2}'.format(di,score_d, p))

d:3 s:0.7535202032485527 p:('LSTAT', 'RM', 'NOX', 'RAD', 'CHAS')
d:3 s:0.7685431811199619 p:('LSTAT', 'RM', 'RAD', 'DIS', 'TAX', 'B')


# Gridsearch

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

boston = load_boston()
X=pd.DataFrame(boston.data[:,:], columns=boston.feature_names)
y=pd.DataFrame(boston.target[:],columns=['MEDV'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [4]:
from sklearn.metrics import r2_score, make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

regressor = DecisionTreeRegressor()
params = {"max_depth":  list(range(1,11))}

scoring_fnc=make_scorer(r2_score)

grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc,cv=5)
grid.fit(X_train,y_train)
print(grid.best_params_)

{'max_depth': 5}


In [17]:
#for params, mean_score, scores in grid.grid_scores_:
#    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

In [16]:
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.368496 (0.064662) with: {'max_depth': 1}
0.626112 (0.111121) with: {'max_depth': 2}
0.698623 (0.128678) with: {'max_depth': 3}
0.742889 (0.129903) with: {'max_depth': 4}
0.782156 (0.090582) with: {'max_depth': 5}
0.752935 (0.100415) with: {'max_depth': 6}
0.736055 (0.086652) with: {'max_depth': 7}
0.746791 (0.092950) with: {'max_depth': 8}
0.740257 (0.111650) with: {'max_depth': 9}
0.766889 (0.103395) with: {'max_depth': 10}


In [10]:
grid.cv_results_ 

{'mean_fit_time': array([ 0.00198293,  0.00157156,  0.00190525,  0.00142169,  0.00166516,
         0.00173316,  0.00181875,  0.0019855 ,  0.00214195,  0.00211492]),
 'mean_score_time': array([ 0.00058565,  0.00050249,  0.00049834,  0.00027561,  0.00025125,
         0.000246  ,  0.00024881,  0.00026979,  0.0002737 ,  0.00025415]),
 'mean_test_score': array([ 0.36849619,  0.62611191,  0.69862293,  0.74288852,  0.78215579,
         0.75293457,  0.73605545,  0.74679111,  0.74025721,  0.76688932]),
 'mean_train_score': array([ 0.48476147,  0.73065992,  0.83904408,  0.90665078,  0.93863857,
         0.96125492,  0.97595741,  0.98612753,  0.99196729,  0.99524738]),
 'param_max_depth': masked_array(data = [1 2 3 4 5 6 7 8 9 10],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'params': [{'max_depth': 1},
  {'max_depth': 2},
  {'max_depth': 3},
  {'max_depth': 4},
  {'max_depth': 5},
  {'max_depth': 6},
  {'max_depth': 7},
  {'max_de

# Gridsearch関数作成
# [20171106追記]
[こちら](http://chrisstrelioff.ws/sandbox/2015/06/25/decision_trees_in_python_again_cross_validation.html)を参考にDecisionTreeRegressor()に書き換えて、再度グリッドサーチを実施し、パラメータを探索しました。


In [6]:
def report(grid_scores, n_top=3):
    """Report top n_top parameters settings, default n_top=3.

    Args
    ----
    grid_scores -- output from grid or random search
    n_top -- how many to report, of top models

    Returns
    -------
    top_params -- [dict] top parameter settings found in
                  search
    """
    top_scores = sorted(grid_scores,
                        key=itemgetter(1),
                        reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print(("Mean validation score: "
               "{0:.3f} (std: {1:.3f})").format(
               score.mean_validation_score,
               np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

    return top_scores[0].parameters

In [7]:
from time import time
from operator import itemgetter

def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a grid search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    cf -- scikit-learn Decision Tree
    param_grid -- [dict] parameter settings to test
    cv -- fold of cross-validation, default 5

    Returns
    -------
    top_params -- [dict] from report()
    """
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv)
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return  top_params

In [8]:
print("-- Grid Parameter Search via 10-fold CV")

# set of parameters to test
param_grid = {"criterion": ["mse", "friedman_mse", "mae"],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [2, 5, 10],
              "max_leaf_nodes": [None, 2, 5, 10, 20]
              }

dt = DecisionTreeRegressor()
ts_gs = run_gridsearch(X, y, dt, param_grid, cv=10)

-- Grid Parameter Search via 10-fold CV

GridSearchCV took 13.98 seconds for 180 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.366 (std: 0.417)
Parameters: {'criterion': 'mae', 'max_depth': 5, 'max_leaf_nodes': None, 'min_samples_leaf': 10}

Model with rank: 2
Mean validation score: 0.357 (std: 0.410)
Parameters: {'criterion': 'mae', 'max_depth': 10, 'max_leaf_nodes': None, 'min_samples_leaf': 10}

Model with rank: 3
Mean validation score: 0.354 (std: 0.416)
Parameters: {'criterion': 'mae', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 10}





In [18]:
print("\n-- Best Parameters:")
for k, v in ts_gs.items():
    print("parameter: {:<20s} setting: {}".format(k, v))


-- Best Parameters:
parameter: criterion            setting: mae
parameter: max_depth            setting: 5
parameter: max_leaf_nodes       setting: None
parameter: min_samples_leaf     setting: 10
