In [1]:
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from econml.inference import BootstrapInference
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
import tqdm

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import GroupKFold
from sklearn.base import clone

In [2]:
# Root directory under which fitted models are written.
models = Path("models")
models.mkdir(parents=True, exist_ok=True)

# Input CSV, resolved relative to the notebook's working directory.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

# Load the data, normalise headers to lower snake_case ("start date" ->
# "start_date"), and order rows by ticker and then by start date.
df = (
    pd.read_csv(file_path, parse_dates=["start date"])
    .rename(columns=lambda col: col.replace(" ", "_").lower())
    .sort_values(by=["ticker", "start_date"])
)

# Derived columns: log of the winsorized Tobin's q, calendar year of the
# start date, and return on assets (net income / total assets).
df["log_tobin_q_winsor"] = np.log(df["tobin_q_winsor"])
df["year"] = df["start_date"].dt.year
df["roa"] = df["net_income"] / df["total_assets"]

In [3]:
# Cut-off on the female director ratio: observations at or above it are treated.
c = 0.20

# Binary treatment indicator: 1.0 when the ratio is >= c, 0.0 when it is below.
# Observations with a missing ratio stay NaN instead of being silently coded as
# 0 (untreated), so the complete-case filter applied before fitting removes them.
female_ratio = df['female_director_ratio']
df['female_director_ratio_threshold'] = (
    female_ratio.ge(c).astype(float).where(female_ratio.notna())
)

# Build the output directory from the fixed base path rather than from the
# current value of `models`, so re-running this cell (or changing `c`) never
# nests one threshold directory inside another.
models = Path("models") / f"threshold_{c:1.2f}"
models.mkdir(parents=True, exist_ok=True)

In [4]:
# Year indicators as 0/1 integer columns; the first year level is dropped.
year = pd.get_dummies(df["year"], drop_first=True, dtype="int")

# Industry indicators as 0/1 integer columns, with the "その他製品" column
# removed (presumably to act as the reference category).
industry = pd.get_dummies(df["industry_name"], dtype="int").drop(columns="その他製品")

In [5]:
# Column roles using the raw female director ratio as the treatment.
# NOTE: the following cell reassigns all four names (T_cols becomes the
# thresholded indicator), so under Run All these values are overwritten.
G_cols = ["ticker"]
T_cols = ["female_director_ratio"]
Y_cols = ["log_tobin_q_winsor"]
W_cols = [
    "board_size",
    "log_firm_age",
    "log_sales",
    "sales_growth",
    "foreign_ownership",
    "managerial_ownership",
    "tangible_assets",
    "leverage",
]

In [6]:
# Column roles used by the analysis below.
# Outcome: log of the winsorized Tobin's q.
Y_cols = ["log_tobin_q_winsor"]

# Treatment: the binary "ratio at or above the threshold" indicator.
T_cols = ["female_director_ratio_threshold"]

# Grouping key.
G_cols = ["ticker"]

# Control variables.
W_cols = [
    "board_size", "log_firm_age", "log_sales", "sales_growth",
    "foreign_ownership", "managerial_ownership", "tangible_assets", "leverage",
]

In [7]:
# Mundlak terms: per-ticker means of the controls and of the treatment,
# broadcast back to every row of that ticker and suffixed with "_mean".
by_ticker = df.groupby(by="ticker")

mundlak_W = by_ticker[W_cols].transform("mean").add_suffix("_mean")
mundlak_T = by_ticker[T_cols].transform("mean").add_suffix("_mean")

In [8]:
# Controls: raw controls, Mundlak means of treatment and controls, year dummies.
W = df[W_cols].join(mundlak_T).join(mundlak_W).join(year)

# Features for effect heterogeneity: industry dummies.
X = industry

# Outcome.
Y = df[Y_cols]

# Treatment.
T = df[T_cols]

# Groups (one per ticker).
G = df["ticker"]

# Keep only rows that are complete across every input, then restrict each
# piece to that common set of index labels.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna(axis=0, how="any")
complete_rows = tmp.index

W, X, Y, T, G = [part.loc[complete_rows] for part in (W, X, Y, T, G)]

In [9]:
# Nuisance model for the outcome (a LinearRegression was the noted alternative).
model_y = LGBMRegressor(force_row_wise=True, verbose=-1)

# Nuisance model for the treatment. The treatment is the 0/1 threshold
# indicator, so a classifier is used (an LGBMRegressor was tried previously).
model_t = LogisticRegression(class_weight="balanced", max_iter=2000)

dml = LinearDML(
    model_y=model_y,
    model_t=model_t,
    # A classifier first stage is only valid when the estimator is told the
    # treatment is discrete; this also matches the DML built in the bootstrap.
    discrete_treatment=True,
    cv=GroupKFold(n_splits=3),  # keep each firm inside a single fold
)

In [10]:
seed = 42
n_boot = 1000
unique_groups = np.unique(G)
# Seed the generator from `seed` so the constant above is the single source of truth.
rng = np.random.RandomState(seed)

# Row positions of every firm, computed once instead of rescanning G for each
# sampled firm in each replicate (the lookup does not depend on the replicate).
group_values = G.values
group_rows = {g: np.flatnonzero(group_values == g) for g in unique_groups}

# Full-sample features on which every replicate's ATE is evaluated.
X_all = X.values

# Cluster bootstrap: resample whole firms with replacement, refit the DML
# estimator on the resampled rows, and record the ATE over the full sample.
ate = list()
for b in tqdm.tqdm(range(n_boot)):
    sampled_groups = rng.choice(unique_groups, size=len(unique_groups), replace=True)
    idx = np.concatenate([group_rows[g] for g in sampled_groups])

    dml_b = DML(
        model_y=model_y,
        model_t=model_t,
        model_final=Ridge(alpha=1e-4, fit_intercept=False),
        discrete_treatment=True,
        cv=GroupKFold(n_splits=3),  # keep each firm inside a single fold
    )
    dml_b.fit(Y.values.ravel()[idx],
              T.values.ravel()[idx],
              X=X_all[idx],
              W=W.values[idx],
              groups=group_values[idx])

    ate.append(dml_b.ate(X=X_all))

    # Persist the fitted replicate. Dedicated names are used so the notebook-level
    # `file_name` / `file_path` (the input CSV location) are not overwritten.
    # NOTE: "{b:02d}" pads to two digits only, so with n_boot > 100 the files do
    # not sort lexicographically; the pattern is kept so existing names stay stable.
    model_file = f"{b:02d}.joblib"
    model_path = models / model_file
    joblib.dump(dml_b, model_path, compress=3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [1:55:43<00:00,  6.94s/it]


In [16]:
# pd.Series(ate).quantile(0.025)
# pd.Series(ate).plot(kind='hist', bins=25)

In [14]:
# pd.Series(ate).mean()

In [13]:
# np.exp(0.047) - 1

In [15]:
# TODO: treatment-effect heterogeneity (e.g. by industry) could also be examined