# ダミー変数

In [1]:
import numpy as np
import pandas as pd

import statsmodels.api as sma
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the sample rental data set from disk and show the resulting frame.
df = pd.read_csv(filepath_or_buffer='data/sample_data3.csv')
df

Unnamed: 0,space,rent,age,facing_direction
0,20,13.958824,3,South
1,25,11.533805,13,South
2,28,14.269547,18,North
3,30,18.72268,10,West
4,36,19.602674,6,West
5,51,16.068166,1,South
6,55,23.183599,5,East
7,62,22.212595,2,North
8,83,29.35701,5,North
9,103,37.565129,23,West


In [3]:
# One-hot encode the categorical column(s) of df. drop_first=True drops the
# first level of each encoded column, which avoids the dummy variable trap.
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,space,rent,age,facing_direction_North,facing_direction_South,facing_direction_West
0,20,13.958824,3,0,1,0
1,25,11.533805,13,0,1,0
2,28,14.269547,18,1,0,0
3,30,18.72268,10,0,0,1
4,36,19.602674,6,0,0,1
5,51,16.068166,1,0,1,0
6,55,23.183599,5,0,0,0
7,62,22.212595,2,1,0,0
8,83,29.35701,5,1,0,0
9,103,37.565129,23,0,0,1


## ダミー変数による線形回帰

In [4]:
# Separate the regression target (rent) from the predictors. The categorical
# facing_direction column is also removed from X here; it is attached again
# as dummy variables in a later cell.
y = df['rent']
X = df.drop(['rent', 'facing_direction'], axis=1)

In [5]:
# Standardize the numeric predictors (StandardScaler removes the mean and
# scales each column to unit variance).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled values back into a DataFrame. The labels and the index are
# taken from X itself instead of a hand-typed list, so the column names cannot
# drift from the data and the later index-aligned assignment of
# df['facing_direction'] still lines up row by row.
df_X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [6]:
# Display the standardized predictors as the cell output.
df_X_scaled

Unnamed: 0,space,age
0,-1.134662,-0.806277
1,-0.941034,0.633504
2,-0.824857,1.353394
3,-0.747405,0.201569
4,-0.515051,-0.374343
5,0.065834,-1.094233
6,0.220736,-0.518321
7,0.491816,-0.950255
8,1.305055,-0.518321
9,2.079569,2.073284


In [7]:
# Bias (intercept) term: sma.OLS does not add one by itself, so a constant
# column is placed in front of the predictors.
df_X_scaled = sma.add_constant(df_X_scaled, prepend=True)

In [8]:
# Dummy variables: attach the categorical facing_direction column (aligned on
# the index) and one-hot encode it.
#   - assign() builds a new frame instead of mutating df_X_scaled in place.
#   - columns=[...] encodes only the intended column.
#   - drop_first=True drops the first level to avoid the dummy variable trap.
#   - dtype=float keeps the design matrix purely numeric; without it, newer
#     pandas versions emit bool dummy columns, and the mixed bool/float frame
#     is rejected by sma.OLS.
df_X_scaled = pd.get_dummies(
    df_X_scaled.assign(facing_direction=df['facing_direction']),
    columns=['facing_direction'],
    drop_first=True,
    dtype=float,
)

In [9]:
# Linear regression: ordinary least squares of rent (y) on the design matrix
# (constant, standardized predictors and direction dummies), followed by the
# text summary of the fitted model.
est = sma.OLS(endog=y, exog=df_X_scaled)
est_trained = est.fit()
print(est_trained.summary())

                            OLS Regression Results                            
Dep. Variable:                   rent   R-squared:                       0.967
Model:                            OLS   Adj. R-squared:                  0.925
Method:                 Least Squares   F-statistic:                     23.26
Date:                Wed, 21 Sep 2022   Prob (F-statistic):            0.00468
Time:                        01:50:22   Log-Likelihood:                -17.344
No. Observations:                  10   AIC:                             46.69
Df Residuals:                       4   BIC:                             48.50
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     21

  "anyway, n=%i" % int(n))
