[辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』（講談社, 2021）](https://github.com/taroyabuki/fromzero)

In [None]:
# Environment setup for Google Colaboratory.
import os
# The presence of the COLAB_GPU environment variable is used as the signal
# that the notebook is running on Colab.
if 'COLAB_GPU' in os.environ:
  # IPython shell escape: install the listed packages with pip; piping
  # through `tail -n 1` keeps only the last line of the output.
  !python -m pip install h2o pandarallel pca pmdarima | tail -n 1

## 7.1 自動車の停止距離




## 7.2 データの確認

In [None]:
import statsmodels.api as sm
# Fetch R's `cars` dataset (package `datasets`) and keep its data table.
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data

In [None]:
# Size of the table as (number of rows, number of columns).
my_data.shape

In [None]:
# Show the first few rows of the table.
my_data.head()

In [None]:
# Summary statistics of the columns.
my_data.describe()

In [None]:
# Plot the data with speed on the x-axis, drawn as markers ('o') rather than
# connected lines.
my_data.plot(x='speed', style='o')

## 7.3 回帰分析

In [None]:
import seaborn as sns
import statsmodels.api as sm

# Scatter plot of stopping distance against speed with a fitted regression
# line drawn by seaborn.
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
ax = sns.regplot(data=my_data, x='speed', y='dist')

# Dotted guide lines meeting at the point (speed=21.5, dist=67).
ax.vlines(21.5, -5, 67, linestyles='dotted')
ax.hlines(67, 4, 21.5, linestyles='dotted')

# Fix the visible range of both axes.
ax.set_xlim(left=4, right=25)
ax.set_ylim(bottom=-5, top=125)

In [None]:
import statsmodels.api as sm
# Reload the data, then separate the explanatory variable (kept as a
# one-column DataFrame) from the response (a Series).
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

In [None]:
# Specify the model.
from sklearn.linear_model import LinearRegression
my_model = LinearRegression()

# Training (fit the model to the data).
my_model.fit(X, y)

# The two steps may also be done at once.
# my_model = LinearRegression().fit(X, y)

In [None]:
# Intercept and coefficient array of the fitted linear model, as a tuple.
my_model.intercept_, my_model.coef_

In [None]:
# One sample with a single feature value (speed = 21.5), given as a nested
# list so that it is two-dimensional.
tmp = [[21.5]]
# Predicted stopping distance for that sample.
my_model.predict(tmp)

In [None]:
import numpy as np
import pandas as pd

# 100 evenly spaced speeds spanning the observed range, together with the
# model's prediction at each of them.
tmp = pd.DataFrame({
    'speed': np.linspace(start=min(my_data['speed']),
                         stop=max(my_data['speed']),
                         num=100),
})
tmp['model'] = my_model.predict(tmp)

In [None]:
# Stack the observations and the prediction grid and plot both against
# speed: the first plotted column uses markers ('o'), the second a line ('-').
pd.concat([my_data, tmp]).plot(
    x='speed', style=['o', '-'])

## 7.4 当てはまりの良さの指標

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data and split it into explanatory variable and response.
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

# Fit a linear regression and store its in-sample predictions next to the
# observations in a new column 'y_'.
my_model = LinearRegression().fit(X, y)
y_ = my_model.predict(X)
my_data['y_'] = y_

In [None]:
# Display floats with two decimal places from here on (a pandas-wide option).
pd.set_option('display.float_format', '{:.2f}'.format)

# Residual = observed value minus predicted value.
my_data['residual'] = y - y_
my_data.head()

In [None]:
# Observed distances as markers.
ax = my_data.plot(x='speed', y='dist', style='o', legend=False)
# Predicted values as a line, drawn on the same axes.
my_data.plot(x='speed', y='y_', style='-', legend=False, ax=ax)
# Dotted vertical segment from each observed value to its predicted value,
# i.e. the residual of each observation.
ax.vlines(x=X, ymin=y, ymax=y_, linestyles='dotted')

In [None]:
# RMSE: square root of the mean squared error.
mean_squared_error(y, y_)**0.5
# Alternatively, computed directly from the residual column.
(my_data['residual']**2).mean()**0.5


In [None]:
# Score of the fitted model on (X, y).
my_model.score(X, y)
# Alternatively, the R^2 score computed from the predictions.
r2_score(y_true=y, y_pred=y_)

In [None]:
import numpy as np
# Squared correlation coefficient between observed and predicted values
# ([0, 1] picks the off-diagonal entry of the 2x2 correlation matrix).
np.corrcoef(y, y_)[0, 1]**2

In [None]:
# Evaluate the already-fitted model on only the first three rows.
# Note that this rebinds the notebook-level X, y and y_.
my_test = my_data.iloc[:3]
X, y = my_test[['speed']], my_test['dist']
y_ = my_model.predict(X)

# Score reported by the model on this subset.
my_model.score(X, y)
# Alternatively, the R^2 score computed from the predictions.
r2_score(y_true=y, y_pred=y_)

# Squared correlation coefficient between observed and predicted values.
np.corrcoef(y, y_)[0, 1]**2

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data

# Pick six rows by position and use only those as the training sample.
my_idx = [1, 10, 26, 33, 38, 43]
my_sample = my_data.iloc[my_idx]
X = my_sample[['speed']]
y = my_sample['dist']

In [None]:
# Polynomial degree.
d = 5
# Powers 1 through d of X (no constant column, since include_bias=False).
poly = PolynomialFeatures(degree=d, include_bias=False)
X5 = poly.fit_transform(X)
del poly  # keep the notebook namespace the same as before

# Linear regression on the polynomial features, then in-sample predictions.
my_model = LinearRegression().fit(X5, y)
y_ = my_model.predict(X5)

In [None]:
# RMSE on the sample the model was fitted to.
((y - y_)**2).mean()**0.5

# Score of the fitted model on the same sample.
my_model.score(X5, y)

# Squared correlation coefficient between observed and predicted values.
np.corrcoef(y, y_)[0, 1]**2

In [None]:
# Predictions of the polynomial model on 100 evenly spaced speeds covering
# the observed range.
tmp = pd.DataFrame({
    'speed': np.linspace(start=min(my_data['speed']),
                         stop=max(my_data['speed']),
                         num=100),
})
X5 = PolynomialFeatures(degree=d, include_bias=False).fit_transform(tmp)
tmp['model'] = my_model.predict(X5)

# Copy the sample's response into its own column 'sample' so the sample
# points are plotted as a separate series, then plot all data, the sample
# and the model curve together.
my_sample = my_sample.assign(sample=y)
my_df = pd.concat([my_data, my_sample, tmp])
my_df.plot(x='speed', ylim=(0, 130), style=['o', 'o', '-'])

## 7.5 K最近傍法

In [None]:
# Preparation
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor

my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

# Training (K-nearest-neighbours regressor with its default settings)
my_model = KNeighborsRegressor().fit(X, y)

# Preparation for visualisation: predictions on 100 evenly spaced speeds
# covering the observed range.
tmp = pd.DataFrame({
    'speed': np.linspace(start=min(my_data['speed']),
                         stop=max(my_data['speed']),
                         num=100),
})
tmp['model'] = my_model.predict(tmp)

In [None]:
# Stack the observations and the prediction grid and plot both against
# speed: the first plotted column uses markers ('o'), the second a line ('-').
pd.concat([my_data, tmp]).plot(
    x='speed', style=['o', '-'])

In [None]:
# In-sample predictions of the fitted model.
y_ = my_model.predict(X)

# RMSE on the training data.
((y - y_)**2).mean()**0.5

# Score of the fitted model on the training data.
my_model.score(X, y)

# Squared correlation coefficient between observed and predicted values.
np.corrcoef(y, y_)[0, 1]**2

## 7.6 検証

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Data preparation
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

# Model specification
my_model = LinearRegression()

# Validation (5-fold cross-validation)
my_scores = cross_val_score(estimator=my_model, X=X, y=y)

# This yields five values of the coefficient of determination (definition 1).
my_scores

# Their mean is taken as the coefficient of determination (definition 1,
# validation).
my_scores.mean()

In [None]:
# Cross-validated RMSE. The scorer returns negated RMSE values, so the sign
# of the mean is flipped to get a positive error.
my_scores = cross_val_score(my_model, X, y,
                            scoring='neg_root_mean_squared_error')
-my_scores.mean()

In [None]:
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, LeaveOneOut

# Load the data, fit a linear regression on all of it and keep the
# in-sample predictions.
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

my_model = LinearRegression()
my_model.fit(X, y)
y_ = my_model.predict(X)

In [None]:
# RMSE (training)
mean_squared_error(y, y_)**0.5

# Coefficient of determination, definition 1 (training)
my_model.score(X, y)
# Alternatively
r2_score(y_true=y, y_pred=y_)

# Coefficient of determination, definition 6 (training): the squared
# correlation coefficient between observed and predicted values.
np.corrcoef(y, y_)[0, 1]**2

In [None]:
# RMSE (validation): the scorer returns negated values, hence the sign flip.
my_scores = cross_val_score(my_model, X, y,
                            scoring='neg_root_mean_squared_error')
-my_scores.mean()

# Coefficient of determination (validation), averaged over the folds.
my_scores = cross_val_score(my_model, X, y, scoring='r2') # scoring='r2' may be omitted
my_scores.mean()

In [None]:
# Method 1: leave-one-out scores are negated squared errors; negate their
# mean and take the square root.
my_scores1 = cross_val_score(my_model, X, y, cv=LeaveOneOut(),
                             scoring='neg_mean_squared_error')
(-my_scores1.mean())**0.5

# Method 2: leave-one-out scores are negated per-fold RMSEs; square them,
# average, and take the square root.
my_scores2 = cross_val_score(my_model, X, y, cv=LeaveOneOut(),
                             scoring='neg_root_mean_squared_error')
(my_scores2**2).mean()**0.5

In [None]:
# Plain mean of the per-fold RMSE values (sign flipped back to positive).
# With leave-one-out every fold holds a single sample, so each per-fold RMSE
# is an absolute error and this mean is not the same quantity as the RMSE
# obtained by squaring first.
-my_scores2.mean()

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor

my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']

my_lm_scores = cross_val_score(
    LinearRegression(),
    X, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error')

my_knn_socres = cross_val_score(
    KNeighborsRegressor(n_neighbors=5),
    X, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error')

In [None]:
(-my_lm_scores.mean())**0.5

(-my_knn_socres.mean())**0.5

In [None]:
my_df = pd.DataFrame({
    'lm': -my_lm_scores,
    'knn': -my_knn_socres})
my_df.head()

In [None]:
# Box plot of each column of my_df.
# NOTE(review): the label r^2 presumably stands for the squared residual,
# not the coefficient of determination - confirm.
my_df.boxplot().set_ylabel("$r^2$")

In [None]:
from statsmodels.stats.weightstats import DescrStatsW
# Descriptive statistics of the paired differences between the 'lm' and
# 'knn' columns, used for a t-test on their mean.
d = DescrStatsW(my_df.lm - my_df.knn)
d.ttest_mean()[1] # p-value

d.tconfint_mean(alpha=0.05, alternative='two-sided') # confidence interval

## 7.7 パラメータチューニング

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor

my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

# Search range: integers from 1 (inclusive) to 16 (exclusive).
my_params = {'n_neighbors': range(1, 16)}

# Exhaustive search over n_neighbors, scored by negated MSE under
# leave-one-out cross-validation.
my_search = GridSearchCV(
    KNeighborsRegressor(),
    my_params,
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
)
my_search.fit(X, y)

In [None]:
# Details of the tuning run.
tmp = my_search.cv_results_
# Mean test scores are negated MSEs; convert them to RMSE.
my_scores = (-tmp['mean_test_score']) ** 0.5
# One row per candidate parameter setting, with its validation RMSE.
my_results = pd.DataFrame(tmp['params'])
my_results = my_results.assign(validation=my_scores)

In [None]:
# Show the first few rows of the tuning results.
my_results.head()

In [None]:
# Validation RMSE against the number of neighbours, as a line with markers.
my_results.plot(x='n_neighbors',
                style='o-',
                ylabel='RMSE')

In [None]:
# Parameter setting selected by the grid search.
my_search.best_params_

In [None]:
# Validation RMSE of the selected setting: best_score_ is a negated MSE
# because the search was scored with 'neg_mean_squared_error'.
(-my_search.best_score_)**0.5

In [None]:
# Model refitted by the grid search with the selected parameters.
my_model = my_search.best_estimator_
y_ = my_model.predict(X)
# RMSE (training). Arguments are passed by name as (y_true, y_pred),
# matching the function's signature; the value is the same either way
# because the squared error is symmetric.
mean_squared_error(y_true=y, y_pred=y_)**0.5

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor

# Load the data and split it into explanatory variable and response; both
# are read as globals by the function defined below.
my_data = sm.datasets.get_rdataset(dataname='cars', package='datasets').data
X = my_data[['speed']]
y = my_data['dist']

def my_loocv(k):
    """Return validation and training RMSE of K-nearest neighbours for one k.

    Reads the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsRegressor``.

    Returns
    -------
    pandas.Series
        Indexed by 'n_neighbors' (the given k), 'validation' (RMSE under
        leave-one-out cross-validation) and 'training' (RMSE of the model
        fitted to, and evaluated on, all of the data).
    """
    my_model = KNeighborsRegressor(n_neighbors=k)
    # Scores are negated squared errors, one per held-out sample.
    my_scores = cross_val_score(estimator=my_model, X=X, y=y,
                                cv=LeaveOneOut(),
                                scoring='neg_mean_squared_error')
    y_ = my_model.fit(X, y).predict(X)
    # mean_squared_error takes (y_true, y_pred); pass them by name in that
    # order (the value is unaffected because the squared error is symmetric).
    return pd.Series([k,
                      (-my_scores.mean())**0.5,                       # RMSE (validation)
                      mean_squared_error(y_true=y, y_pred=y_)**0.5],  # RMSE (training)
                     index=['n_neighbors', 'validation', 'training'])

# Evaluate k = 1, ..., 15. Because my_loocv returns a Series, apply collects
# the results into a table with one row per k.
my_results = pd.Series(range(1, 16)).apply(my_loocv)

In [None]:
# Validation and training RMSE against the number of neighbours, each as a
# line with markers.
my_results.plot(x='n_neighbors',
                style='o-',
                ylabel='RMSE')