## 8.1 ブドウの生育条件とワインの価格

In [None]:
import pandas as pd
# Source page; its data table is parsed as whitespace-delimited text.
my_url = 'http://www.liquidasset.com/winedata.html'
# Skip the first 62 lines, read the next 38 rows, and treat '.' as missing.
tmp = pd.read_table(
    my_url,
    sep='\\s+',
    na_values='.',
    skiprows=62,
    nrows=38,
)
tmp.describe()
# (remaining output omitted)

In [None]:
# Keep the columns from the third one onward ...
my_data = tmp.iloc[:, 2:]
# ... then discard every row that contains a missing value.
my_data = my_data.dropna()
my_data.head()

In [None]:
# (number of rows, number of columns) of the cleaned data
my_data.shape

In [None]:
# Write the cleaned table to wine.csv without the row index.
my_data.to_csv('wine.csv', index=False)

In [None]:
# To use the file created above instead of downloading:
#my_data = pd.read_csv('wine.csv')
my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)

## 8.2 重回帰分析

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut

my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)

# Output variable: LPRICE2. Input variables: every other column.
y = my_data['LPRICE2']
X = my_data.drop(columns=['LPRICE2'])

# Fit a linear regression on all rows.
my_model = LinearRegression().fit(X=X, y=y)

In [None]:
# Jupyter displays only the last expression of a cell, so a bare
# `my_model.intercept_` placed before the Series would never be shown.
# Put the intercept in the same Series as the coefficients instead.
pd.Series([*my_model.coef_, my_model.intercept_],
          index=[*X.columns, 'intercept'])

In [None]:
# The model was fitted on a DataFrame, so it remembers the column names.
# Passing a DataFrame with the same columns (instead of a bare nested list)
# keeps the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning.
my_test = pd.DataFrame([[500, 17, 120, 2]], columns=X.columns)
my_model.predict(my_test)

In [None]:
# Predictions on the training data.
y_ = my_model.predict(X)

# Jupyter displays only the last expression of a cell; collect the three
# metrics into one Series so that all of them are visible.
# mean_squared_error takes (y_true, y_pred); the value is the same either
# way, but the conventional order is used here.
pd.Series({
    'RMSE (training)': mean_squared_error(y, y_)**0.5,
    'R2 (score)': my_model.score(X, y),
    'R2 (corrcoef**2)': np.corrcoef(y, y_)[0, 1]**2})

In [None]:
# Leave-one-out cross-validation; each score is a negated squared error.
my_scores = cross_val_score(
    my_model, X, y,
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut())
# Flip the sign of the mean score and take the square root to get RMSE.
(-my_scores.mean())**0.5

In [None]:
import numpy as np
# Design matrix: the input variables plus a constant column b0 = 1 that
# plays the role of the intercept.
# Plain ndarrays are used instead of np.matrix (which NumPy no longer
# recommends) so that the product below is an ordinary 1-D vector.
M = X.assign(b0=1).to_numpy()
# Least-squares coefficients via the pseudoinverse.
b = np.linalg.pinv(M) @ y.to_numpy()
pd.Series(b,
    index=list(X.columns) + ['b0'])

## 8.3 標準化

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)
y = my_data['LPRICE2']
X = my_data.drop(columns=['LPRICE2'])

# Standardize with StandardScaler, put the result back into a DataFrame
# (restoring the column names), and draw box plots with the means marked.
pd.DataFrame(
    data=StandardScaler().fit_transform(X),
    columns=X.columns,
).boxplot(showmeans=True)

In [None]:
# Two-step pipeline: standardization ('sc') followed by linear regression ('lr').
my_pipeline = Pipeline(steps=[('sc', StandardScaler()),
                              ('lr', LinearRegression())])
my_pipeline.fit(X, y)

In [None]:
# Extract only the linear-regression step from the pipeline.
my_lr = my_pipeline.named_steps.lr

# Jupyter displays only the last expression of a cell, so a bare
# `my_lr.intercept_` placed before the Series would never be shown.
# Put the intercept in the same Series as the coefficients instead.
pd.Series([*my_lr.coef_, my_lr.intercept_],
          index=[*X.columns, 'intercept'])

In [None]:
# The pipeline was fitted on a DataFrame, so its steps remember the column
# names. Passing a DataFrame with the same columns (instead of a bare nested
# list) keeps the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning.
my_test = pd.DataFrame([[500, 17, 120, 2]], columns=X.columns)
my_pipeline.predict(my_test)

## 8.4 入力変数の数とモデルの良さ

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut

my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)

n = len(my_data)
# Append two extra columns: the row number modulo 2 and modulo 3.
my_data2 = my_data.assign(
    v1=[k % 2 for k in range(n)],
    v2=[k % 3 for k in range(n)])
my_data2.head()

In [None]:
X, y = my_data2.drop(columns=['LPRICE2']), my_data2['LPRICE2']
my_model2 = LinearRegression().fit(X, y)

# Predictions on the training data.
y_ = my_model2.predict(X)

# Leave-one-out cross-validation; each score is a negated squared error.
my_scores = cross_val_score(my_model2, X, y,
                            cv=LeaveOneOut(),
                            scoring='neg_mean_squared_error')

# Jupyter displays only the last expression of a cell; show the training
# RMSE and the validation RMSE side by side so that both are visible.
# mean_squared_error takes (y_true, y_pred).
pd.Series({
    'RMSE (training)': mean_squared_error(y, y_)**0.5,
    'RMSE (LOOCV)': (-my_scores.mean())**0.5})

## 8.5 変数選択

In [None]:
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline

my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)

n = len(my_data)
# Append two extra columns: the row number modulo 2 and modulo 3.
my_data2 = my_data.assign(
    v1=[k % 2 for k in range(n)],
    v2=[k % 3 for k in range(n)])
y = my_data2['LPRICE2']
X = my_data2.drop(columns=['LPRICE2'])

In [None]:
# Forward (step-up) variable selection, scored by leave-one-out negated MSE.
my_sfs = SequentialFeatureSelector(
    estimator=LinearRegression(),
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
    direction='forward')

# Retrain a regression on the selected variables after the selection step.
my_pipeline = Pipeline(steps=[
    ('sfs', my_sfs),              # variable selection
    ('lr', LinearRegression())])  # regression

# Candidate numbers of variables to select: 1 through 5.
my_params = {'sfs__n_features_to_select': range(1, 6)}
my_search = GridSearchCV(
    estimator=my_pipeline,
    param_grid=my_params,
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
    n_jobs=-1)
my_search.fit(X, y)

# Model retrained with the best parameter.
my_model = my_search.best_estimator_
# Boolean mask of the variables chosen by the selection step.
my_model.named_steps.sfs.get_support()

## 8.6 補足：正則化

In [None]:
import numpy as np
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNet, enet_path
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
# From here on, ConvergenceWarning is not shown.
warnings.simplefilter('ignore', ConvergenceWarning)

my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)
y = my_data['LPRICE2']
X = my_data.drop(columns=['LPRICE2'])

In [None]:
# A is passed as ElasticNet's alpha, B as its l1_ratio.
A, B = 2, 0.1

# Standardization followed by an elastic net.
my_pipeline = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('enet', ElasticNet(alpha=A, l1_ratio=B))])
my_pipeline.fit(X, y)

In [None]:
# Extract only the elastic-net step from the pipeline.
my_enet = my_pipeline.named_steps.enet

# Jupyter displays only the last expression of a cell, so a bare
# `my_enet.intercept_` placed before the Series would never be shown.
# Put the intercept in the same Series as the coefficients instead.
pd.Series([*my_enet.coef_, my_enet.intercept_],
          index=[*X.columns, 'intercept'])

In [None]:
# Give the query the same column names as the training data. A DataFrame
# with default integer column labels does not match the names the pipeline
# was fitted with and triggers scikit-learn's feature-name warning.
my_test = pd.DataFrame(
    [[500, 17, 120, 2]],
    columns=X.columns)
my_pipeline.predict(my_test)

In [None]:
# Candidate alphas: e**2 down towards e**-5.5 in steps of 0.1 in the exponent.
B = 0.1
As = np.e**np.arange(2, -5.5, -0.1)

# Compute the path on z-scored inputs and output; only the middle return
# value is kept (one row per input variable after the transpose below).
_, my_path, _ = enet_path(zscore(X), zscore(y), l1_ratio=B, alphas=As)

# One line per input variable, plotted against log(alpha).
pd.DataFrame(my_path.T, index=np.log(As), columns=X.columns).plot(
    ylabel='Coefficients',
    xlabel='log A ( = log alpha)')

In [None]:
# Grid of candidates: 21 alphas and 6 l1_ratios, both evenly spaced on [0, 0.1].
As = np.linspace(0, 0.1, 21)
Bs = np.linspace(0, 0.1,  6)

my_pipeline = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('enet', ElasticNet())])
my_search = GridSearchCV(
    estimator=my_pipeline,
    param_grid={'enet__alpha': As, 'enet__l1_ratio': Bs},
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
    n_jobs=-1)
my_search.fit(X, y)

# Best model
my_model = my_search.best_estimator_

# Best parameters
my_search.best_params_

In [None]:
# Details of the tuning run.
tmp = my_search.cv_results_
# Negated mean squared error -> RMSE
my_scores = (-tmp['mean_test_score'])**0.5

# Table of RMSE with one row per alpha and one column per l1_ratio.
my_results = (
    pd.DataFrame(tmp['params'])
    .assign(RMSE=my_scores)
    .pivot(index='enet__alpha',
           columns='enet__l1_ratio',
           values='RMSE'))

# One curve per l1_ratio, RMSE against alpha.
my_results.plot(
    style='o-',
    xlabel='A ( = alpha)',
    ylabel='RMSE',
).legend(title='B ( = l1_ratio)')

In [None]:
# RMSE of the best candidate: the search score is a negated MSE, so flip the
# sign and take the square root.
(-my_search.best_score_)**0.5

## 8.7 ニューラルネットワーク

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Plot the logistic (sigmoid) function 1 / (1 + exp(-x)).
x = np.linspace(-6, 6, 100)  # 100 evenly spaced points on [-6, 6]
y = 1 / (1 + np.exp(-x))
plt.plot(x, y)

In [None]:
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_url = ('https://raw.githubusercontent.com/taroyabuki/fromzero'
          '/master/data/wine.csv')
my_data = pd.read_csv(my_url)

# Output variable: LPRICE2. Input variables: every other column.
y = my_data['LPRICE2']
X = my_data.drop(columns=['LPRICE2'])

In [None]:
warnings.simplefilter("ignore", ConvergenceWarning)  # From here on, hide this warning.
# MLPRegressor draws random initial weights; without a fixed random_state the
# RMSE computed from this cell changes on every run. Fixing the seed makes
# Restart & Run All reproducible (the value 0 itself is arbitrary).
my_pipeline = Pipeline([('sc', StandardScaler()),                  # standardization
                        ('mlp', MLPRegressor(random_state=0))])    # neural network
my_pipeline.fit(X, y)                                              # training

# Leave-one-out cross-validation; each score is a negated squared error.
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(),
                            scoring='neg_mean_squared_error')
warnings.simplefilter("default", ConvergenceWarning) # From here on, show the warning again.

In [None]:
# RMSE: flip the sign of the mean negated-MSE score and take the square root.
(-my_scores.mean())**0.5

In [None]:
# MLPRegressor draws random initial weights; without a fixed random_state the
# best hidden-layer layout and its score change on every run. Fixing the seed
# makes the search reproducible (the value 0 itself is arbitrary).
my_pipeline = Pipeline([
    ('sc', StandardScaler()),
    ('mlp', MLPRegressor(tol=1e-5,          # threshold for counting a step as an improvement
                         max_iter=5000,     # upper limit on training iterations
                         random_state=0))])
my_layers = (1, 3, 5,                                         # one hidden layer
             (1, 1), (3, 1), (5, 1), (1, 2), (3, 2), (5, 2))  # two hidden layers
my_params = {'mlp__hidden_layer_sizes': my_layers}
my_search = GridSearchCV(estimator=my_pipeline,
                         param_grid=my_params,
                         cv=LeaveOneOut(),
                         scoring='neg_mean_squared_error',
                         n_jobs=-1).fit(X, y)
my_model = my_search.best_estimator_ # best model

my_search.best_params_               # best parameters

In [None]:
# RMSE of the best candidate: the search score is a negated MSE, so flip the
# sign and take the square root.
(-my_search.best_score_)**0.5