In [1]:
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from econml.inference import BootstrapInference
from lightgbm import LGBMRegressor, LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.linear_model import RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import clone

In [2]:
# Output directory: one serialized estimator is written here per bootstrap draw.
models = Path("models") / "wild_bootstrap"
models.mkdir(parents=True, exist_ok=True)

# Input panel (CSV) location.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

# Load, normalise column names to snake_case, and order rows by firm and date.
df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    .rename(columns=lambda col: col.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
)

# Derived variables used further down in the notebook.
df = df.assign(
    log_tobin_q_winsor=lambda d: np.log(d['tobin_q_winsor']),
    year=lambda d: d['start_date'].dt.year,
    roa=lambda d: d['net_income'] / d['total_assets'],
    # The 1e-3 offset keeps the log finite when the ratio is zero.
    log_female_director_ratio=lambda d: np.log(d['female_director_ratio'] + 1e-3),
)

In [3]:
# Year dummies; drop_first removes the first year level so it acts as the baseline.
year = pd.get_dummies(df['year'], dtype='int', drop_first=True)

# Industry dummies with the "その他製品" column removed so it acts as the baseline.
industry = pd.get_dummies(df['industry_name'], dtype='int').drop(columns="その他製品")

In [4]:
# Outcome column.
Y_cols = ['log_tobin_q_winsor']

# Treatment column.
T_cols = ['female_director_ratio']

# Effect modifiers (passed as X to the estimator). These two ownership
# variables were previously listed among the controls.
X_cols = ['foreign_ownership', 'managerial_ownership']

# Control columns (part of W).
W_cols = [
    'board_size', 'log_firm_age', 'log_sales',
    'sales_growth', 'tangible_assets', 'leverage',
]

# Grouping (cluster) key.
G_cols = ['ticker']

In [5]:
# Per-ticker means of the controls and effect modifiers, broadcast back to
# every row of that ticker and labelled with a "_mean" suffix.
mundlak_W = (
    df.groupby(by='ticker')[W_cols + X_cols]
    .transform("mean")
    .add_suffix("_mean")
)

# Same construction for the treatment variable.
mundlak_T = (
    df.groupby(by='ticker')[T_cols]
    .transform("mean")
    .add_suffix("_mean")
)

In [6]:
# Controls: raw covariates, per-ticker means, year dummies and industry dummies.
W = (
    df[W_cols]
    .join(mundlak_T)
    .join(mundlak_W)
    .join(year)
    .join(industry)
)

# Explanatory variables (effect modifiers); industry dummies are not included here.
X = df[X_cols]

# Outcome.
Y = df[Y_cols]

# Treatment (intervention).
T = df[T_cols]

# Groups (one cluster per ticker).
G = df['ticker']

# Keep only rows that are complete across every block, then restrict each
# block to that common set of index labels.
tmp = pd.concat((W, X, Y, T, G), axis=1).dropna(how='any', axis=0)
W, X, Y, T, G = (block.loc[tmp.index] for block in (W, X, Y, T, G))

In [7]:
# Both first-stage learners share the same LightGBM settings.
lgbm_kwargs = dict(force_row_wise=True, verbose=-1)
model_y = LGBMRegressor(**lgbm_kwargs)  # outcome model
model_t = LGBMRegressor(**lgbm_kwargs)  # treatment model; the original note mentions LinearRegression() here

In [8]:
# Baseline double-ML fit. cache_values=True is required so that the
# first-stage residuals can be read back through `residuals_` afterwards.
dml0 = LinearDML(model_y=model_y, model_t=model_t,
                 cv=GroupKFold(n_splits=3))
dml0.fit(Y.values.ravel(),
         T.values.ravel(),
         X=X.values,
         W=W.values,
         groups=G.values, cache_values=True)

y_res, t_res, X_cached, _ = dml0.residuals_

# EconML documents that the rows returned by `residuals_` are not guaranteed
# to be in input order. The subtraction below is positional, so fail loudly
# if the cached X (returned in the same order as the residuals) does not line
# up with the X that was passed in. This cannot detect a reordering among rows
# with identical X values, but it catches any general permutation.
if not np.array_equal(np.asarray(X_cached), X.values):
    raise RuntimeError(
        "dml0.residuals_ is not aligned with the input row order; "
        "y_hat / t_hat cannot be reconstructed positionally."
    )

# Flatten so that an (n, 1) residual can never broadcast against the
# (n,) outcome / treatment vectors into an (n, n) matrix.
y_res = np.asarray(y_res).ravel()
t_res = np.asarray(t_res).ravel()

# Out-of-fold first-stage predictions: observed value minus residual.
y_hat = Y.values.ravel() - y_res
t_hat = T.values.ravel() - t_res

In [9]:
# Cluster wild bootstrap: for every draw, each ticker receives a single +/-1
# weight that multiplies all of its residuals, the pseudo-outcome/treatment
# are rebuilt from the first-stage predictions, and the estimator is refit.
groups_unique = G.unique()
rng   = np.random.RandomState(42)

# Row -> cluster position, computed once. factorize numbers the clusters in
# order of first appearance, which is exactly the order of G.unique(); unlike
# np.searchsorted this does not require the cluster labels to be sorted.
group_idx = pd.factorize(G)[0]

# Loop-invariant arrays, converted once instead of on every iteration.
X_arr = X.values
W_arr = W.values
G_arr = G.values

n_boot = 100
# Zero-padding wide enough for the largest draw index, so file names sort in
# draw order for any n_boot (still two digits for n_boot <= 100).
pad = max(2, len(str(n_boot - 1)))

ate = []
for b in tqdm.tqdm(range(n_boot)):
    v_g = rng.choice([-1, 1], size=len(groups_unique))
    v = v_g[group_idx]
    y_star = y_hat + y_res * v
    t_star = t_hat + t_res * v  # the treatment is perturbed with the same weights (design choice noted by the author)

    dml_b = LinearDML(
        model_y=model_y, model_t=model_t, cv=GroupKFold(n_splits=3)
    )
    dml_b.fit(
        y_star, t_star, X=X_arr, W=W_arr, groups=G_arr
    )

    ate.append(dml_b.ate(X=X_arr))
    # Use a dedicated name so the notebook-level `file_name` / `file_path`
    # (which point at the input CSV) are not overwritten by the loop.
    boot_path = models / f"{b:0{pad}d}.joblib"
    joblib.dump(dml_b, boot_path, compress=3)

  0%|                                                                                                                                    | 0/100 [00:03<?, ?it/s]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x120270c50>>
Traceback (most recent call last):
  File "/Users/tsuyos-u/Library/Caches/pypoetry/virtualenvs/dml-r0_m-wiA-py3.11/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 

In [10]:
# Average treatment effect of the baseline fit, evaluated at the observed X.
# Author's note: within transformation and
# between transformation
dml0.ate(X=X)
# Previously noted value: 0.0025
# NOTE(review): the saved output of this cell is 0.2376 — confirm which value is current.

0.23759334319869158

In [None]:
# pd.Series(data=dml0.coef_, index=W.columns)
# pd.Series(ate).median()
# (pd.Series(ate).value_counts().sort_index().cumsum() / 100).plot()
# sns.scatterplot(x=t_hat, y=np.sqrt(t_res / t_res.std()))

In [None]:
# dml0.intercept_
# dml0.ate(X=X)

In [None]:
# pd.Series(data=dml0.coef_, index=X.columns)

In [None]:
# managerial ownershipが高い

In [None]:
# pd.Series(ate).median() * 0.1

In [None]:
# dml_b

In [None]:
# nuisance 

In [None]:
# pd.Series(ate).quantile(0.025)
# pd.Series(ate).quantile(0.975)

In [None]:
# np.exp(pd.Series(ate).median() * 0.1) - 1

In [None]:
fe