<a href="https://colab.research.google.com/github/takunton/ds/blob/main/work/ds_ml/web/lesson13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [2]:
# Load the diamonds sample dataset through seaborn and preview the first rows
df = sns.load_dataset(name='diamonds')
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Generate dummy (indicator) variables for the `cut` column.
# drop_first=True is used to avoid the dummy variable trap.
df[['cut']].pipe(pd.get_dummies, drop_first=True)

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good
0,0,1,0,0
1,0,0,1,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0
...,...,...,...,...
53935,0,1,0,0
53936,1,0,0,0
53937,0,0,0,1
53938,0,0,1,0


In [6]:
# Passing the whole DataFrame lets get_dummies generate dummy variables for
# every qualitative (categorical) column automatically.
# The result is bound to a new name instead of overwriting `df`, so the raw
# frame stays available and cells that read the original columns
# (e.g. df[['cut']]) can still be re-run after this one.
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,327,4.05,4.07,2.31,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [22]:
# Case: qualitative (categorical) variables included
# Reload the raw data and one-hot encode it in one step
df = pd.get_dummies(sns.load_dataset('diamonds'), drop_first=True)

# Actual (target) values
y = df['price']

# Features: every column except the target
X = df.drop(columns='price')

model = LinearRegression()

# 5-fold CV with shuffling; the fixed seed keeps the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# The scorer yields negated MAE values, so take the absolute value of the mean
print('MAE:{}'.format(np.abs(scores.mean())))

MAE:740.5483989635794


In [23]:
# Case: qualitative (categorical) variables excluded
df = sns.load_dataset('diamonds')

# Actual (target) values
y = df['price']

# Features: drop the target and the three qualitative columns
X = df.drop(columns=['price', 'cut', 'clarity', 'color'])

model = LinearRegression()

# Same 5-fold shuffled split settings (seed 0) as the run with categorical features
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# The scorer yields negated MAE values, so take the absolute value of the mean
print('MAE:{}'.format(np.abs(scores.mean())))

MAE:890.1499487422395
