In [1]:
import json
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from lightgbm import LGBMRegressor, LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.linear_model import RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import clone

In [2]:
# Directory that receives the fitted bootstrap estimators.
models = Path("models", "simple", "female_director_ratio")
models.mkdir(parents=True, exist_ok=True)

# Location of the input panel.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

# Load the panel, normalise column names to lower snake_case, order the rows
# by firm and period, then append the derived columns.
df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    .rename(columns=lambda name: name.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
    .assign(
        log_tobin_q_winsor=lambda d: np.log(d['tobin_q_winsor']),
        year=lambda d: d['start_date'].dt.year,
        roa=lambda d: d['net_income'].div(d['total_assets']),
        log_total_assets=lambda d: np.log(d['total_assets']),
    )
)

In [3]:
# Year indicators; the 2015 column is dropped so it acts as the reference year.
year = pd.get_dummies(df['year'], dtype='int').drop(columns=2015)

# Industry indicators; the "その他製品" column is dropped as the reference category.
industry = pd.get_dummies(df['industry_name'], dtype='int').drop(columns="その他製品")

In [4]:
# Outcome column.
Y_cols = ['log_tobin_q_winsor']

# Control columns ('log_total_assets' is currently excluded).
W_cols = [
    'board_size',
    'log_firm_age',
    'log_sales',
    'sales_growth',
    'foreign_ownership',
    'managerial_ownership',
    'tangible_assets',
    'leverage',
    'roa',
]

# Effect-modifier columns (none).
X_cols = list()

# Treatment column; 'female_director' is the alternative treatment.
T_cols = ['female_director_ratio']

# Grouping (cluster) column.
G_cols = ['ticker']

In [5]:
# Per-ticker means of the controls and of the treatment, broadcast back to
# every row of df as it stands in this cell; columns get a "_mean" suffix.
firm_groups = df.groupby(by='ticker')

mundlak_W = firm_groups[W_cols].transform("mean").add_suffix("_mean")
mundlak_T = firm_groups[T_cols].transform("mean").add_suffix("_mean")

In [6]:
# Controls: the raw covariates, followed by the per-ticker means of the
# treatment and of the covariates, followed by the year and industry dummies.
# (Standardising W is left disabled.)
W = (
    df[W_cols]
    .join(mundlak_T)
    .join(mundlak_W)
    .join(year)
    .join(industry)
)

# Effect modifiers, outcome, treatment and group labels.
X = df[X_cols]
Y = df[Y_cols]
T = df[T_cols]
G = df['ticker']

# Keep only the rows that have no missing value in any of the pieces above.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna(axis=0, how='any')
complete_rows = tmp.index

W, X, Y, T, G = [piece.loc[complete_rows] for piece in (W, X, Y, T, G)]

In [7]:
# Outcome nuisance model, configured from the parameters stored in
# best_params_y.json.
best_params = json.loads(Path("best_params_y.json").read_text())
model_y = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params)

# Treatment nuisance model, configured from the parameters stored in
# best_params_t.json (LinearRegression() was the noted alternative).
best_params = json.loads(Path("best_params_t.json").read_text())
model_t = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params)

In [8]:
# Baseline fit on the observed data. cache_values=True is needed so that the
# cross-fitted first-stage residuals can be read back through `residuals_`.
dml0 = LinearDML(model_y=model_y, model_t=model_t,
                 cv=GroupKFold(n_splits=3))
dml0.fit(Y.values.ravel(),
         T.values.ravel(),
         W=W.values,
         groups=G.values, cache_values=True)

y_res, t_res, _, W_cached = dml0.residuals_

# econml documents that the rows returned by `residuals_` are not guaranteed
# to be in input order. The subtraction below is only valid when they are, so
# verify the alignment through the W matrix that is returned alongside them.
if not np.array_equal(np.asarray(W_cached), W.values):
    raise ValueError(
        "Rows returned by dml0.residuals_ are not aligned with the input "
        "rows; y_hat / t_hat cannot be reconstructed by subtraction."
    )

# Cross-fitted nuisance predictions = observed value - residual.
y_hat = Y.values.ravel() - y_res
t_hat = T.values.ravel() - t_res

In [9]:
def _one_bootstrap(
        b: int,
        groups_unique,
        y_hat, y_res, t_hat, t_res,
        model_y, model_t,
        X, W, G,
        models_dir,
        seed_base: int = 42,
        n_splits: int = 3,
    ):
    """Run the b-th wild cluster bootstrap replicate.

    One random sign (+1 or -1) is drawn per group, the outcome and treatment
    residuals of every row are multiplied by the sign of that row's group and
    added back to the fitted values, and a fresh LinearDML is fitted on the
    perturbed data. The fitted estimator is saved with joblib and its ATE is
    returned.

    Parameters
    ----------
    b : int
        Replicate index; used to offset the seed and to name the output file.
    groups_unique : array-like
        The distinct group labels. They are sorted and de-duplicated here, so
        the caller does not have to pass them in any particular order.
    y_hat, y_res : array-like
        Fitted values and residuals of the outcome, one entry per row.
    t_hat, t_res : array-like
        Fitted values and residuals of the treatment, one entry per row.
    model_y, model_t : estimator
        Nuisance models handed to LinearDML.
    X : any
        Not used; kept so existing positional calls keep working.
    W : array-like
        Control matrix, one row per observation.
    G : array-like
        Group label of every row.
    models_dir : pathlib.Path
        Directory in which ``{b:04d}.joblib`` is written.
    seed_base : int, default 42
        Base seed; replicate ``b`` uses ``seed_base + b``.
    n_splits : int, default 3
        Number of GroupKFold folds used for cross-fitting.

    Returns
    -------
    The value of ``ate()`` of the estimator fitted on the perturbed data.

    Raises
    ------
    ValueError
        If ``groups_unique`` is empty or ``G`` holds a label that is not in it.
    """
    # A distinct, reproducible random stream for every replicate.
    rng = np.random.RandomState(seed_base + b)

    # np.searchsorted is only correct on a sorted array, so sort the labels
    # here instead of relying on the caller's ordering.
    group_labels = np.unique(np.asarray(groups_unique))
    if group_labels.size == 0:
        raise ValueError("groups_unique is empty")

    row_groups = np.asarray(G)
    position = np.searchsorted(group_labels, row_groups)
    # Clip so an out-of-range lookup is reported below rather than raising
    # an IndexError.
    position = np.minimum(position, group_labels.size - 1)
    if not np.array_equal(group_labels[position], row_groups):
        raise ValueError("G contains labels that are not present in groups_unique")

    # One sign per group, broadcast to the rows of that group.
    v_g = rng.choice([-1, 1], size=group_labels.size)
    v = v_g[position]

    y_star = y_hat + y_res * v
    t_star = t_hat + t_res * v        # the treatment is perturbed as well

    dml_b = LinearDML(
        model_y=model_y, model_t=model_t,
        cv=GroupKFold(n_splits=n_splits),
    )
    dml_b.fit(
        y_star, t_star,
        W=np.asarray(W), groups=row_groups
    )

    # Persist the fitted estimator. joblib is referenced through the module
    # imported at the top of the notebook, so this function does not depend
    # on an import made in a later cell.
    file_path = models_dir / f"{b:04d}.joblib"
    joblib.dump(dml_b, file_path, compress=3)

    return dml_b.ate()

In [10]:
from joblib import Parallel, delayed, dump
from joblib import parallel_backend
# from tqdm import tqdm, tqdm_joblib

# Bootstrap settings.
n_boot = 500        # number of bootstrap replicates
n_jobs = 5          # number of parallel workers; lower it if memory is tight
backend = "loky"    # joblib backend that runs each worker in its own process

ate_boot = None     # filled in below

groups_unique = G.unique()

# One delayed call per replicate; the generator is consumed by Parallel.
bootstrap_jobs = (
    delayed(_one_bootstrap)(
        b,
        groups_unique, y_hat, y_res, t_hat, t_res,
        model_y, model_t,
        X, W, G,
        models,
    )
    for b in range(n_boot)
)

with parallel_backend(backend, n_jobs=n_jobs):
    ate_boot = Parallel()(bootstrap_jobs)

# Collect the replicate ATEs into an array.
ate_boot = np.asarray(ate_boot)