<a href="https://colab.research.google.com/github/tomonari-masada/course2021-sml/blob/main/07_linear_regression_2_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 重回帰による住宅価格の予測

In [1]:
import numpy as np
from scipy import stats, special
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

%config InlineBackend.figure_format = 'retina'

In [2]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  os.makedirs(housing_path, exist_ok=True)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()

In [3]:
fetch_housing_data()

In [4]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

（ここより上の詳細はフォローしなくてもいいいです。）

In [5]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


## 1) `ocean_proximity`を0/1の数値データへ変換

* pandasの`get_dummies`を使って、カテゴリカル変数`ocean_proximity`の値を0/1の数値データに変換する。

In [7]:
housing_dummies = pd.get_dummies(housing['ocean_proximity'])

In [8]:
housing_dummies.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [9]:
housing_num = housing.drop('ocean_proximity', axis=1)

In [10]:
housing = pd.concat([housing_num, housing_dummies], axis=1)

In [11]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [12]:
X = housing_num.drop('median_house_value', axis=1)
y = housing_num["median_house_value"].copy()

## 2) テストデータの欠損値を訓練データの中央値で埋める

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [14]:
print(X_train.shape, X_valid.shape, X_test.shape)

(13209, 8) (3303, 8) (4128, 8)


In [15]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13209 entries, 16490 to 8472
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           13209 non-null  float64
 1   latitude            13209 non-null  float64
 2   housing_median_age  13209 non-null  float64
 3   total_rooms         13209 non-null  float64
 4   total_bedrooms      13209 non-null  float64
 5   population          13209 non-null  float64
 6   households          13209 non-null  float64
 7   median_income       13209 non-null  float64
dtypes: float64(8)
memory usage: 928.8 KB


In [16]:
X_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3303 entries, 2071 to 11239
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           3303 non-null   float64
 1   latitude            3303 non-null   float64
 2   housing_median_age  3303 non-null   float64
 3   total_rooms         3303 non-null   float64
 4   total_bedrooms      3303 non-null   float64
 5   population          3303 non-null   float64
 6   households          3303 non-null   float64
 7   median_income       3303 non-null   float64
dtypes: float64(8)
memory usage: 232.2 KB


In [17]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 20046 to 3665
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   float64
 2   housing_median_age  4128 non-null   float64
 3   total_rooms         4128 non-null   float64
 4   total_bedrooms      3921 non-null   float64
 5   population          4128 non-null   float64
 6   households          4128 non-null   float64
 7   median_income       4128 non-null   float64
dtypes: float64(8)
memory usage: 290.2 KB


* 欠測箇所を中央値で埋める
 * テストデータにだけ、total_bedroomsの値が欠けているエントリがある
 * ここでは訓練データの中央値で埋めることにする。
 * 訓練データだけから得られる情報を使って埋めているので、問題はない。

In [18]:
median_total_bedrooms = np.median(X_train.total_bedrooms[~ X_train.total_bedrooms.isna()])
X_test.total_bedrooms = X_test.total_bedrooms.replace(np.nan, median_total_bedrooms)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


* 欠測箇所がなくなっていることを確認する。

In [19]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 20046 to 3665
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   float64
 2   housing_median_age  4128 non-null   float64
 3   total_rooms         4128 non-null   float64
 4   total_bedrooms      4128 non-null   float64
 5   population          4128 non-null   float64
 6   households          4128 non-null   float64
 7   median_income       4128 non-null   float64
dtypes: float64(8)
memory usage: 290.2 KB


## 3) 交差検証を使いたいので訓練データと検証データをくっつけて一つにする

In [20]:
X_train = pd.concat([X_train, X_valid])

In [21]:
print(X_train.shape)

(16512, 8)


In [22]:
y_train = pd.concat([y_train, y_valid])

In [23]:
print(y_train.shape)

(16512,)


* 交差検証は10-foldで行う。

In [24]:
from sklearn.model_selection import KFold
skf = KFold(n_splits=10, shuffle=True, random_state=123)

## 4) `sklearn.preprocessing.PolynomialFeatures`を使う
 * これにより、実質的には非線形のモデルを使っていることになる。（なぜか？）

### 4-1) 元データのまま

In [25]:
rmses = []
for train_index, valid_index in skf.split(X_train.values, y_train.values):
  reg = LinearRegression()
  reg.fit(X_train.values[train_index], y_train.values[train_index])
  y_valid_pred = reg.predict(X_train.values[valid_index])
  y_valid_pred[y_valid_pred > y_train.values[train_index].max()] = y_train.values[train_index].max()
  rmse = mean_squared_error(y_train.values[valid_index], y_valid_pred, squared=False)
  rmses.append(rmse)
  print(f'RMSE: {rmse:.1f}')
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

RMSE: 63043.8
RMSE: 70870.8
RMSE: 66802.0
RMSE: 73471.2
RMSE: 69199.0
RMSE: 70673.1
RMSE: 66118.7
RMSE: 68620.3
RMSE: 70002.5
RMSE: 66755.1
mean RMSE: 68555.7 (2813.4)


### 4-2) 2次の項を追加する

In [26]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)
X_train_transformed = poly.fit_transform(X_train)
X_test_transformed = poly.transform(X_test)

In [27]:
print(X_train_transformed.shape, X_test_transformed.shape)

(16512, 45) (4128, 45)


In [28]:
rmses = []
for train_index, valid_index in skf.split(X_train_transformed, y_train.values):
  reg = LinearRegression()
  reg.fit(X_train_transformed[train_index], y_train.values[train_index])
  y_valid_pred = reg.predict(X_train_transformed[valid_index])
  y_valid_pred[y_valid_pred > y_train.values[train_index].max()] = y_train.values[train_index].max()
  rmse = mean_squared_error(y_train.values[valid_index], y_valid_pred, squared=False)
  rmses.append(rmse)
  print(f'RMSE: {rmse:.1f}')
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

RMSE: 57195.4
RMSE: 62011.6
RMSE: 61815.6
RMSE: 69333.6
RMSE: 63059.9
RMSE: 66829.3
RMSE: 60761.5
RMSE: 63113.4
RMSE: 64812.6
RMSE: 62345.7
mean RMSE: 63127.9 (3153.2)


### 4-3) 3次までの項を追加する

In [29]:
poly = PolynomialFeatures(3)
X_train_transformed = poly.fit_transform(X_train)
X_test_transformed = poly.transform(X_test)

In [30]:
print(X_train_transformed.shape, X_test_transformed.shape)

(16512, 165) (4128, 165)


In [31]:
rmses = []
for train_index, valid_index in skf.split(X_train_transformed, y_train.values):
  reg = LinearRegression()
  reg.fit(X_train_transformed[train_index], y_train.values[train_index])
  y_valid_pred = reg.predict(X_train_transformed[valid_index])
  y_valid_pred[y_valid_pred > y_train.values[train_index].max()] = y_train.values[train_index].max()
  rmse = mean_squared_error(y_train.values[valid_index], y_valid_pred, squared=False)
  rmses.append(rmse)
  print(f'RMSE: {rmse:.1f}')
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

RMSE: 55048.1
RMSE: 790185.7
RMSE: 58610.7
RMSE: 64564.5
RMSE: 58904.1
RMSE: 64229.7
RMSE: 58399.3
RMSE: 60765.2
RMSE: 60461.0
RMSE: 59777.1
mean RMSE: 133094.5 (219046.3)


* 極端に評価値が悪い分割があるので要注意！
 * これでは採用できない。

# 最後にテストデータで評価

* `PolynomialFeatures(2)`が良かったのでこれを採用。

In [32]:
poly = PolynomialFeatures(2)
X_train_transformed = poly.fit_transform(X_train)
X_test_transformed = poly.transform(X_test)

* そしてテストデータで評価。

In [33]:
reg = LinearRegression()
reg.fit(X_train_transformed, y_train)
y_test_pred = reg.predict(X_test_transformed)
y_test_pred[y_test_pred > y_train.max()] = y_train.max()
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print(f'test RMSE: {rmse:f}')

test RMSE: 67693.013974
