# 操作変数法
内生性のある説明変数（X）について、交絡因子の影響を取り除いて因果効果を推定する手法。「Xには影響を与えるが、目的変数（Y）にはXを通じてのみ影響し、交絡因子（誤差項）とは相関しない」操作変数（Z）を用いる。まずZでXを回帰して予測値を求め（第1段階）、次にその予測値にYを回帰する（第2段階）ことで、Xの因果効果を推定する（二段階最小二乗法, 2SLS）。

https://chatgpt.com/c/67b574d3-df54-8006-ab46-b9d8b19c5d93

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

In [10]:
# Load the California housing dataset with pandas-backed containers.
california = fetch_california_housing(as_frame=True)
# The combined frame holds the eight feature columns plus the MedHouseVal
# target, as shown in the cell output.
df = california["frame"]

# Display the frame to inspect its columns and a sample of rows.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [4]:
# Naive OLS baseline (average rooms -> median house value).
# This ignores any endogeneity in AveRooms; it is the benchmark that the
# instrumental-variable estimate is compared against later.
ols_model = smf.ols(
    formula="MedHouseVal ~ AveRooms",
    data=df,
).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     487.8
Date:                Wed, 19 Feb 2025   Prob (F-statistic):          7.57e-107
Time:                        19:54:32   Log-Likelihood:                -32001.
No. Observations:               20640   AIC:                         6.401e+04
Df Residuals:                   20638   BIC:                         6.402e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.6838      0.019     87.952      0.0

In [5]:
# First stage of the manual 2SLS: regress the endogenous regressor (AveRooms)
# on the instrument (Population).
first_stage = smf.ols("AveRooms ~ Population", data=df).fit()

# Store the first-stage predictions as a new column for the second stage.
# Pass the DataFrame rather than a bare Series: for formula-based models
# statsmodels expects an object with key access to the variable names, and
# with a Series it has to guess whether it was given a column or a single row.
df["AveRooms_hat"] = first_stage.predict(df)

In [6]:
# Second stage of the manual 2SLS: regress the outcome on the first-stage
# prediction of AveRooms instead of on AveRooms itself.
# NOTE: the standard errors printed by this plain OLS fit do not account for
# AveRooms_hat being an estimated regressor, so read the summary for the point
# estimate only; a dedicated IV estimator is needed for valid inference.
second_stage = smf.ols(
    formula="MedHouseVal ~ AveRooms_hat",
    data=df,
).fit()
print(second_stage.summary())

                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     12.55
Date:                Wed, 19 Feb 2025   Prob (F-statistic):           0.000398
Time:                        19:54:45   Log-Likelihood:                -32236.
No. Observations:               20640   AIC:                         6.448e+04
Df Residuals:                   20638   BIC:                         6.449e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        1.2042      0.244      4.933   

In [8]:
# Fit the IV (2SLS) model in this cell so the comparison does not rely on an
# `iv_model` left over from a cell that is no longer in the notebook (a fresh
# "Restart & Run All" would otherwise fail with NameError).
# In the linearmodels formula syntax, the bracketed term means
# [endogenous regressor ~ instrument]; "1" adds the intercept explicitly.
iv_model = IV2SLS.from_formula(
    "MedHouseVal ~ 1 + [AveRooms ~ Population]", data=df
).fit()

# Compare the naive OLS slope with the IV slope for the same regressor.
print(f"OLSの回帰係数 (AveRooms -> MedHouseVal): {ols_model.params['AveRooms']}")
print(f"IVの回帰係数 (AveRooms -> MedHouseVal): {iv_model.params['AveRooms']}")

OLSの回帰係数 (AveRooms -> MedHouseVal): 0.07086879328040531
IVの回帰係数 (AveRooms -> MedHouseVal): 0.15920474604229184
