# 10章 回帰の手法を学ぼう #1

In [None]:
pip install japanize-matplotlib

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import japanize_matplotlib

## 3. データセットの準備

In [None]:
# Load the cleansed California housing CSV (path under /content/drive) and preview the first rows.
df = pd.read_csv('/content/drive/Othercomputers/DATACANVAS/github/learning/learning/samurai/ds_cource/13_machine_learning/california_housing_cleansing.csv')
df.head()

In [None]:
# Be careful when re-running a cell that assigns the result of drop back to df.
# Passing errors='ignore' as below avoids an error on re-execution.
df = df.drop(columns = ['Unnamed: 0'], errors='ignore')
display(df.head())
display(df.shape)

In [None]:
# Features: every column except the target '住宅価格'. Target: the '住宅価格' column. Both as NumPy arrays.
X = df.drop(columns=['住宅価格']).to_numpy()
y = df['住宅価格'].to_numpy()


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as test data; random_state=0 fixes the seed of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Show the shapes of the four split arrays on a single line.
print(*(arr.shape for arr in (X_train, X_test, y_train, y_test)))

## 4. 予測モデルの学習

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Create a linear regression model.
model = LinearRegression()


In [None]:
# Fit the model on the training split.
model.fit(X_train, y_train)

## 5. 予測モデルの評価

In [None]:
# Score of the model on the training data.
model.score(X_train, y_train)


In [None]:
# Score of the model on the held-out test data.
model.score(X_test, y_test)



## 6. 予測

In [None]:
# Three new samples to predict, one row each with seven feature values.
# NOTE(review): the values presumably follow the training column order — confirm against df's columns.
X_new = np.array([[8, 41, 500, 37, -120, 1, 0.2],
                  [2, 10, 2000, 38, -122, 1.5, 0.5],
                  [1, 25, 1000, 38, -121, 2, 1]])
display(X_new)

In [None]:
# Predict the target for the new samples.
model.predict(X_new)


## 7. 設定したゴールに対する考察

In [None]:
# Print the fitted coefficients, then the intercept, each on its own line.
print(model.coef_, model.intercept_, sep='\n')


In [None]:
# Take the labels from the same columns X was built from, so each bar is paired
# with the coefficient of its own feature even if the CSV column order changes.
feature_names = df.drop(columns=['住宅価格']).columns.tolist()
sns.barplot(x=feature_names, y=model.coef_)


In [None]:
# Show summary statistics of df.
df.describe()

# 11章 回帰の手法を学ぼう #2

## 11.4 回帰手法の改善（標準化の活用）

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the scaler used for standardization.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training data only.
scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to the training data and show the result.
X_train_scaled = scaler.transform(X_train)
display(X_train_scaled)

In [None]:
# X_train before the transformation.
# Column names come from df (the same columns X was built from) instead of a
# hand-typed list, so they match the real column names such as '部屋数/人'.
feature_names = df.drop(columns=['住宅価格']).columns
df_X_train = pd.DataFrame(X_train, columns=feature_names)
df_X_train.head()


In [None]:
# X_train after the transformation.
# Column names come from df (the same columns X was built from) instead of a
# hand-typed list, so they match the real column names such as '部屋数/人'.
feature_names = df.drop(columns=['住宅価格']).columns
df_X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names)
df_X_train_scaled.head()


In [None]:
# Check whether the data has been standardized.
df_X_train_scaled.describe()

In [None]:
# Transform the test data with the scaler that was fitted on the training data.
X_test_scaled = scaler.transform(X_test)

In [None]:
# X_test values before the transformation.
# Column names come from df (the same columns X was built from) instead of a
# hand-typed list, so they match the real column names such as '部屋数/人'.
feature_names = df.drop(columns=['住宅価格']).columns
df_X_test = pd.DataFrame(X_test, columns=feature_names)
df_X_test.head()


In [None]:
# X_test values after the transformation.
# Column names come from df (the same columns X was built from) instead of a
# hand-typed list, so they match the real column names such as '部屋数/人'.
feature_names = df.drop(columns=['住宅価格']).columns
df_X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names)
df_X_test_scaled.head()

In [None]:
# Summary statistics of the transformed test data.
df_X_test_scaled.describe()

In [None]:
from sklearn.linear_model import LinearRegression
# Start from a fresh model so it can be trained on the standardized features.
model = LinearRegression()

In [None]:
# Fit on the standardized training data.
model.fit(X_train_scaled, y_train)

In [None]:
# Score on the standardized training data.
model.score(X_train_scaled, y_train)

In [None]:
# Score on the standardized test data.
model.score(X_test_scaled, y_test)

In [None]:
# Data to predict on (three samples, seven feature values each).
X_new = np.array([[8, 41, 500, 37, -120, 1, 0.2],
                  [2, 10, 2000, 38, -122, 1.5, 0.5],
                  [1, 25, 1000, 38, -121, 2, 1]])


In [None]:
# The new samples must go through the same fitted scaler before prediction.
X_new_scaled = scaler.transform(X_new)
display(X_new_scaled)

In [None]:
# Predict from the scaled samples.
model.predict(X_new_scaled)

In [None]:
# Print the coefficients and the intercept of the model, one per line.
print(str(model.coef_) + "\n" + str(model.intercept_))

In [None]:
# Take the labels from the same columns X was built from, so each bar is paired
# with the coefficient of its own feature even if the CSV column order changes.
feature_names = df.drop(columns=['住宅価格']).columns.tolist()
sns.barplot(x=feature_names, y=model.coef_)

## 11.5 多重共線性の対処

In [None]:
# Multicollinearity is the situation where highly correlated explanatory variables coexist in a multiple regression.
# Multicollinearity adversely affects the weights of the multiple regression equation.

In [None]:
# Correlation matrix of the explanatory variables (the target column is excluded).
df.drop(columns=['住宅価格'], errors='ignore').corr()
# Note: drawing a heatmap would make this easier to read.

In [None]:
# Examine longitude against latitude.
sns.scatterplot(x="経度", y="緯度", data=df)

In [None]:
# Check rooms per person against bedrooms per person.
sns.scatterplot(x='部屋数/人', y='寝室数/人', data=df)

In [None]:
# Keep one column from each of the pairs examined above: latitude (longitude is
# dropped) and rooms per person (bedrooms per person is dropped).
# The position column is '緯度' so that it matches the latitude-like values
# (38, 40, 39) in this section's prediction samples and the '緯度' label on
# this section's coefficient plot.
X = df[["所得", "築年数", "地域人口", "緯度", "部屋数/人"]].to_numpy()
y = df["住宅価格"].to_numpy()

In [None]:
# Re-split with the reduced feature set (same 30% test size and seed as before).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [None]:
# A new scaler is needed because the number of feature columns changed.
scaler = StandardScaler()


In [None]:
# Fit the scaler on the training data only.
scaler.fit(X_train)


In [None]:
# Apply the same fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)


In [None]:
# Fresh model for the reduced feature set.
model = LinearRegression()


In [None]:
# Fit on the standardized training data.
model.fit(X_train_scaled, y_train)


In [None]:
# Print the training score, then the test score, each on its own line.
print(model.score(X_train_scaled, y_train), model.score(X_test_scaled, y_test), sep='\n')


In [None]:
# Three new samples with five feature values each, in the column order selected for X.
# NOTE(review): the fourth values (38, 40, 39) look like latitudes — confirm that the
# fourth column selected for X is the latitude column.
X_new = np.array([[8, 41, 500, 38, 2],
                  [10, 10, 1000, 40, 1],
                  [7.5, 25, 3500, 39, 3]])


In [None]:
# Scale the new samples with the scaler fitted on the training data.
X_new_scaled = scaler.transform(X_new)


In [None]:
# Predict from the scaled samples.
model.predict(X_new_scaled)


In [None]:
# Print the coefficients followed by the intercept, one per line.
print("\n".join(str(param) for param in (model.coef_, model.intercept_)))


In [None]:
# Bar chart of the coefficients of the reduced model.
# NOTE(review): the labels are hard-coded; they must list the same columns, in the
# same order, as the columns selected for X in this section — verify.
sns.barplot(x = ['所得', '築年数', '地域人口', '緯度', '部屋数/人'], y=model.coef_)


# 実践1:
* 以下のデータを読み込んでください。
```
from sklearn.datasets import load_wine
data = load_wine(as_frame=True)
X = data.data
y = data.target
```
* 各変数の特徴（含む欠損・外れ値）を可視化等で確認してください。
* 標準化をしてください。
* 回帰モデルを作成してください。
* 多重共線性を確認し、必要に応じて変数を削除してください。その結果でモデルを変更してください。
* 推論結果を確認してください。その際、以下の確認をしてください。
    * 決定係数 (R^2)
    * RMSE




In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

# Load scikit-learn's wine dataset (load_wine) with features and target as pandas objects.
data = load_wine(as_frame=True)
X = data.data
y = data.target

# Split into training data (80%) and test data (20%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize as the exercise requires: fit the scaler on the training data only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear regression on the standardized features, evaluated with R^2 and RMSE on the test data.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
lr_r2 = r2_score(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print(f"標準回帰: R^2={lr_r2:.3f}, RMSE={lr_rmse:.3f}")

# 実践2 (応用)
* 標準化以外の変換について調べ、StandardScalerとの違いを記載してください。
* 可能なら上記実践1と同じ問題で別の手法を用いてください。

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.datasets import load_wine

# Load scikit-learn's wine dataset (load_wine) with features and target as pandas objects.
data = load_wine(as_frame=True)
X = data.data
y = data.target

# Split first so the test data stays unseen while the preprocessing is fitted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Min-max scaling: fit the scaler on the training data only, then apply the same
# transformation to the test data. Fitting on the full dataset before the split
# would leak test-set information into the preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Linear regression on the scaled features, evaluated with R^2 and RMSE on the test data.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_r2 = r2_score(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print(f"標準回帰: R^2={lr_r2:.3f}, RMSE={lr_rmse:.3f}")