In [1]:
# Wild Bootstrap

In [2]:
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from econml.inference import BootstrapInference
from lightgbm import LGBMRegressor, LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.linear_model import RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import clone

In [3]:
def sigmoid(x, c, gamma):
    """Logistic (soft-threshold) transform of ``x``.

    Computes ``1 / (1 + exp(-gamma * (x - c)))``.

    Parameters
    ----------
    x : scalar or array-like supporting NumPy arithmetic
        Value(s) to transform.
    c : float
        Midpoint of the curve; the result is 0.5 where ``x == c``.
    gamma : float
        Steepness; larger values bring the curve closer to a hard step at ``c``.

    Returns
    -------
    Same shape as ``x``, with values between 0 and 1.
    """
    return 1 / (1 + np.exp(-gamma * (x - c)))

In [4]:
class GroupStratifiedKFold:
    """
    Group-aware stratified K-fold splitter for continuous targets.

    Wraps ``StratifiedGroupKFold`` (always with ``shuffle=True``): the
    continuous ``y`` handed to ``split`` is binarised to 0/1 with
    ``y > threshold`` and the resulting labels are used for stratification,
    while ``groups`` is forwarded unchanged.

    Parameters
    ----------
    n_splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed for the shuffling of the wrapped splitter. With ``None`` the
        folds differ from call to call.
    threshold : float, default 0.5
        Values strictly greater than this are assigned to stratum 1,
        everything else (including NaN) to stratum 0.
    """

    def __init__(self, n_splits: int = 3, random_state: int = None,
                 threshold: float = 0.5):
        self.threshold = threshold
        self.sgk = StratifiedGroupKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=random_state
        )

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds, delegating to the wrapped splitter."""
        return self.sgk.get_n_splits(X, y, groups)

    def split(self, X, y, groups):
        """
        Generate train/test indices.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            May be continuous; it is converted to 0/1 internally and only
            used for stratification.
        groups : array-like of shape (n_samples,)
            Group IDs (e.g. a firm ID).

        Returns
        -------
        Generator of (train_idx, test_idx) tuples.
        """
        # np.asarray lets plain lists / tuples be compared element-wise;
        # a bare ``list > float`` comparison raises TypeError.
        y_bin = (np.asarray(y) > self.threshold).astype(int)
        return self.sgk.split(X, y_bin, groups)

In [5]:
# Root directory under which fitted models are stored.
models = Path("models") / "soft_threshold"
models.mkdir(parents=True, exist_ok=True)

# Input CSV, resolved relative to the notebook's working directory.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

# Load, normalise column names to lower snake_case, order rows by
# (ticker, start_date), then append the derived columns.
df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    .rename(columns=lambda col: col.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
    .assign(
        log_tobin_q_winsor=lambda d: np.log(d['tobin_q_winsor']),
        year=lambda d: d.start_date.dt.year,
        roa=lambda d: d["net_income"].div(d["total_assets"]),
    )
)

In [6]:
# Soft-threshold parameters: `c` is the midpoint of the logistic curve and
# `gamma` its steepness (both are passed straight to `sigmoid`).
c = 0.25
gamma = 60

# Build the output directory from its root instead of appending to the current
# value of `models`; appending made this cell non-idempotent (every re-run
# nested one more "<c>/" level). The root must match the one used when the
# data are loaded.
models = Path("models") / "soft_threshold" / f"{c:1.2f}"
models.mkdir(parents=True, exist_ok=True)

# `sigmoid` is built from NumPy arithmetic, so it can be applied to the whole
# column at once rather than row by row.
df['female_director_ratio_threshold'] = sigmoid(df['female_director_ratio'], c, gamma)

In [7]:
# Year dummies; `drop_first=True` omits the first year level.
year = pd.get_dummies(df['year'], dtype='int', drop_first=True)

# Industry dummies with one named industry column removed.
industry = pd.get_dummies(df['industry_name'], dtype='int').drop(columns="その他製品")

In [8]:
# Outcome column.
Y_cols = ["log_tobin_q_winsor"]

# Control columns.
W_cols = [
    "board_size",
    "log_firm_age",
    "log_sales",
    "sales_growth",
    "tangible_assets",
    "leverage",
    "foreign_ownership",
    "managerial_ownership",
]

# Treatment column (the soft-thresholded female director ratio).
T_cols = ["female_director_ratio_threshold"]

# Grouping column.
G_cols = ["ticker"]

# Extra feature columns; currently none.
# Candidates: ['foreign_ownership', 'managerial_ownership']
X_cols = []

In [9]:
# Per-ticker means of the controls and of the treatment, broadcast back to
# every row of the same ticker and suffixed with "_mean".
by_ticker = df.groupby(by='ticker')

mundlak_W = by_ticker[W_cols].transform("mean").add_suffix("_mean")
mundlak_T = by_ticker[T_cols].transform("mean").add_suffix("_mean")

In [10]:
# Control variables: raw controls, per-ticker means, year and industry dummies.
W = df[W_cols]
for extra in (mundlak_T, mundlak_W, year, industry):
    W = W.join(extra)

# Explanatory variables
X = df[X_cols].join(industry)

# Outcome
Y = df[Y_cols]

# Treatment
T = df[T_cols]

# Groups
G = df['ticker']

# Keep only the rows that are complete across every block.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna()
W, X, Y, T, G = (block.loc[tmp.index] for block in (W, X, Y, T, G))

In [11]:
# Both nuisance models use the same LightGBM settings.
lgbm_kwargs = dict(force_row_wise=True, verbose=-1)

model_y = LGBMRegressor(**lgbm_kwargs)
model_t = LGBMRegressor(**lgbm_kwargs)  # alternative: LinearRegression()

In [12]:
# Baseline DML fit. The CV splitter shuffles, so it is seeded here; without a
# seed the cached residuals (and everything derived from them) change on
# every run.
y_obs = Y.values.ravel()
t_obs = T.values.ravel()

dml0 = LinearDML(model_y=model_y, model_t=model_t,
                 cv=GroupStratifiedKFold(n_splits=3, random_state=42))
dml0.fit(y_obs, t_obs, W=W.values, groups=G.values, cache_values=True)

# First-stage residuals; only the first two entries of `residuals_` are used.
# They are flattened so the subtraction below can never broadcast to 2-D.
y_res, t_res = (np.ravel(res) for res in dml0.residuals_[:2])

# NOTE(review): the subtraction assumes the residual rows are in input order.
# The length check catches dropped rows but cannot detect a reordering;
# confirm against the EconML documentation of `residuals_`.
assert y_res.shape == y_obs.shape, "outcome residuals do not match the input length"
assert t_res.shape == t_obs.shape, "treatment residuals do not match the input length"

# Out-of-fold nuisance predictions, recovered as observed minus residual.
y_hat = y_obs - y_res
t_hat = t_obs - t_res

In [13]:
# sns.scatterplot(x=t_res, y=y_res / y_res.std())

In [14]:
seed = 42
rng = np.random.RandomState(seed)

n_boot = 100

# Integer code of each row's group plus the distinct groups in order of first
# appearance. Indexing the per-group draws with these codes is correct for any
# row order, whereas np.searchsorted silently returns wrong positions when the
# lookup array is not sorted. It is also loop-invariant, so compute it once.
group_codes, groups_unique = pd.factorize(G.values)

ate = []
for b in tqdm.tqdm(range(n_boot)):
    # One +/-1 draw per group, shared by all rows of that group.
    v_g = rng.choice([-1, 1], size=len(groups_unique))
    v = v_g[group_codes]

    # Perturb the residuals around the fitted values; the same sign flip is
    # applied to the treatment residual.
    y_star = y_hat + y_res * v
    t_star = t_hat + t_res * v

    # Seed the fold shuffling per replicate so the whole bootstrap is
    # reproducible, not just the sign draws.
    dml_b = LinearDML(
        model_y=model_y,
        model_t=model_t,
        cv=GroupStratifiedKFold(n_splits=3, random_state=seed + b),
    )
    dml_b.fit(y_star, t_star, W=W.values, groups=G.values)

    ate.append(dml_b.ate())

    # Persist the replicate; the path is built inline so the `file_name` /
    # `file_path` variables holding the input CSV location are not overwritten.
    joblib.dump(dml_b, models / f"{b:02d}.joblib", compress=3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [05:27<00:00,  3.28s/it]


In [15]:
# pd.Series(ate).sort_values().quantile(0.02)

In [16]:
# pd.Series(ate)

In [17]:
# Y, T, X, W = dml0.residuals_