## 住宅価格の予測タスク

### データの準備

In [6]:
import numpy
from sklearn.datasets import load_boston
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Load the Boston house-price dataset that ships with scikit-learn and
# print its bundled description text.
boston = load_boston()
print(boston['DESCR'])

# Feature matrix, target vector and the column labels for the features.
X = boston.data
y = boston.target
feature_names = boston.feature_names

# Show the raw feature table first, before the target column is attached.
df = DataFrame(data=X, columns=feature_names)
display(df.head())

# Attach the target as MEDV, then drop the columns this analysis does not use.
# Rebinding `df` (rather than dropping in place) keeps the step easy to follow.
df['MEDV'] = y
df = df.drop(columns=['CHAS', 'NOX', 'B'])

# Preview the trimmed table and its summary statistics.
display(df.head())
display(df.describe())

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Unnamed: 0,CRIM,ZN,INDUS,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,6.575,65.2,4.09,1.0,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,6.421,78.9,4.9671,2.0,242.0,17.8,9.14,21.6
2,0.02729,0.0,7.07,7.185,61.1,4.9671,2.0,242.0,17.8,4.03,34.7
3,0.03237,0.0,2.18,6.998,45.8,6.0622,3.0,222.0,18.7,2.94,33.4
4,0.06905,0.0,2.18,7.147,54.2,6.0622,3.0,222.0,18.7,5.33,36.2


Unnamed: 0,CRIM,ZN,INDUS,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062,9.197104
min,0.00632,0.0,0.46,3.561,2.9,1.1296,1.0,187.0,12.6,1.73,5.0
25%,0.082045,0.0,5.19,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95,17.025
50%,0.25651,0.0,9.69,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36,21.2
75%,3.677083,12.5,18.1,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955,25.0
max,88.9762,100.0,27.74,8.78,100.0,12.1265,24.0,711.0,22.0,37.97,50.0


In [7]:
# Per the original author's note, RM and LSTAT looked linearly related to the
# target in an exploratory plot (that plot is not part of this section).
X = df[['RM', 'LSTAT']].values
y = df['MEDV'].values

# Hold out 30% of the rows as test data; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Keep each feature as its own 2-D, single-column array so it can be scaled
# separately. Slicing with 0:1 / 1:2 yields shape (n_samples, 1), i.e. one
# column per feature (not one row).
X_train0 = X_train[:, 0:1]  # training data: RM
X_train1 = X_train[:, 1:2]  # training data: LSTAT
X_test0 = X_test[:, 0:1]    # test data: RM
X_test1 = X_test[:, 1:2]    # test data: LSTAT

In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# RM is standardized. The scaler is fitted on the training column only and the
# same fitted scaler is then applied to both the training and test columns.
standard_scaler = StandardScaler().fit(X_train0)
X_train_scaled0 = standard_scaler.transform(X_train0)
X_test_scaled0 = standard_scaler.transform(X_test0)

# LSTAT is min-max normalized, again fitting on the training column only.
min_max_scaler = MinMaxScaler().fit(X_train1)
X_train_scaled1 = min_max_scaler.transform(X_train1)
X_test_scaled1 = min_max_scaler.transform(X_test1)

In [9]:
# Assemble the scaled training matrix.

# Put the two scaled single-column arrays side by side:
# column 0 is the scaled RM, column 1 is the scaled LSTAT.
X_train_scaled = numpy.hstack((X_train_scaled0, X_train_scaled1))
# Sanity check: show the first five rows.
print(X_train_scaled[:5])

[[ 0.14526384  0.09602649]
 [-0.20840082  0.28449227]
 [-0.89623682  0.23399558]
 [-0.54396454  0.13383002]
 [-0.55649596  0.45253863]]


In [10]:
# Assemble the scaled test matrix.

# Put the two scaled single-column arrays side by side:
# column 0 is the scaled RM, column 1 is the scaled LSTAT.
X_test_scaled = numpy.hstack((X_test_scaled0, X_test_scaled1))
# Sanity check: show the first five rows.
print(X_test_scaled[:5])

[[ 0.12577051  0.20171082]
 [ 0.60196466  0.04966887]
 [-0.47713027  0.450883  ]
 [-0.36295507  0.10458057]
 [-0.03992278  0.42880795]]


### 重回帰

In [12]:
# Import SGDRegressor from scikit-learn (for classification tasks, the counterpart is SGDClassifier).
from sklearn.linear_model import SGDRegressor

# Linear regression on both scaled features, fitted by stochastic gradient
# descent with a squared-error loss and no regularization penalty.
# NOTE(review): 'squared_loss' and penalty='none' are the spellings accepted by
# the scikit-learn release this notebook was run with; newer releases renamed
# them -- confirm against the installed version before upgrading.
reg = SGDRegressor(
    loss='squared_loss',
    penalty='none',
    max_iter=10000,
    tol=1e-3,
    random_state=42,
)

# Kept as the last expression so the fitted estimator is echoed as cell output.
reg.fit(X_train_scaled, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=10000,
             n_iter_no_change=5, penalty='none', power_t=0.25, random_state=42,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [14]:
# Score of the fitted regressor on the (scaled) training data.
train_score = reg.score(X_train_scaled, y_train)
print(train_score)

0.6465520140289235


In [15]:
# Score of the fitted regressor on the held-out (scaled) test data.
test_score = reg.score(X_test_scaled, y_test)
print(test_score)

0.6042122051466812
