In [1]:
import json
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from catboost import CatBoostClassifier, CatBoostRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

In [2]:
class GroupStratifiedKFold:
    """Cross-validation splitter that delegates to a shuffled ``StratifiedGroupKFold``.

    The wrapped splitter is always built with ``shuffle=True``; only the
    number of folds and the random state are configurable.
    """

    def __init__(self, n_splits: int = 3, random_state: int = None):
        """Build the underlying splitter.

        Args:
            n_splits: Number of folds handed to ``StratifiedGroupKFold``.
            random_state: Seed forwarded to ``StratifiedGroupKFold``
                (``None`` leaves the shuffle unseeded).
        """
        splitter_options = {
            "n_splits": n_splits,
            "shuffle": True,
            "random_state": random_state,
        }
        self.sgk = StratifiedGroupKFold(**splitter_options)

    def split(self, X, y, groups):
        """Return whatever the wrapped splitter's ``split(X, y, groups)`` returns."""
        fold_iterator = self.sgk.split(X, y, groups)
        return fold_iterator

In [3]:
# Directory where fitted models are stored (created on first run).
models = Path("models", "simple", "threshold_effect")
models.mkdir(parents=True, exist_ok=True)

# Location of the input panel.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

# Load the CSV, normalise column names to lower snake_case, order rows by
# firm and date, then append the derived columns in a single chain.
df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    .rename(columns=lambda col: col.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
    .assign(
        log_tobin_q_winsor=lambda d: np.log(d['tobin_q_winsor']),
        year=lambda d: d['start_date'].dt.year,
        roa=lambda d: d["net_income"].div(d["total_assets"]),
        log_total_assets=lambda d: np.log(d['total_assets']),
        log_leverage=lambda d: np.log(d['leverage']),
        log_tangible_assets=lambda d: np.log(d['tangible_assets']),
        # NOTE(review): with rows sorted ascending by start_date, shift(-1)
        # takes the NEXT row's value within each ticker (a lead), despite the
        # `lag_` prefix — confirm this is the intended direction.
        lag_log_tobin_q_winsor=lambda d: (
            d.groupby(by=['ticker'])['log_tobin_q_winsor'].shift(-1)
        ),
    )
)

In [4]:
# Bucket the female-director ratio into ordinal treatment levels:
#   [0, 0.1] -> 0, (0.1, 0.2] -> 1, (0.2, 0.3] -> 2, (0.3, inf) -> 3.
# Values matching no band (NaN, negatives) are left as they are.
ratio = df['female_director_ratio']
threshold_bands = [
    ((ratio >= 0) & (ratio <= 0.1), 0),
    ((ratio > 0.1) & (ratio <= 0.2), 1),
    ((ratio > 0.2) & (ratio <= 0.3), 2),
    (ratio > 0.3, 3),
]

female_threshold = ratio
for in_band, level in threshold_bands:
    female_threshold = female_threshold.mask(in_band, level)

df['female_threshold'] = female_threshold

In [5]:
# Year fixed effects: one indicator column per year, with the 2015 column dropped.
year = pd.get_dummies(df['year'], dtype='int').drop(columns=2015)

# Industry fixed effects: one indicator column per industry name, with the
# "その他製品" column dropped.
industry = pd.get_dummies(df['industry_name'], dtype='int').drop(columns="その他製品")

In [6]:
# Outcome column. The original cell assigned this same value twice (once under
# a "Lag Version" heading); a `lag_log_tobin_q_winsor` column is also created
# on df and can be substituted here for that variant.
Y_cols = ['log_tobin_q_winsor']

# Control variables (including ROA improves predictive performance).
W_cols = [
    'board_size',
    'log_firm_age',
    'log_sales',
    'sales_growth',
    'foreign_ownership',
    'managerial_ownership',
    'log_tangible_assets',
    'log_leverage',
    'roa',
    'net_income',
]

# No explanatory (X) columns are used.
X_cols = []
# Treatment column.
T_cols = ['female_threshold']
# Grouping column.
G_cols = ['ticker']

In [7]:
# Per-ticker means of the control (and X) columns, broadcast back to every row
# and labelled with a `_mean` suffix.
mundlak_W = (
    df.groupby(by='ticker')[[*W_cols, *X_cols]]
    .transform("mean")
    .add_suffix("_mean")
)

# Same per-ticker mean for the treatment column.
mundlak_T = (
    df.groupby(by='ticker')[T_cols]
    .transform("mean")
    .add_suffix("_mean")
)

In [8]:
# Control variables: raw controls, then the per-ticker means, then the
# year and industry dummies, all joined on the row index.
W = (
    df[W_cols]
    .join(mundlak_T)
    .join(mundlak_W)
    .join(year)
    .join(industry)
)

# Explanatory variables
X = df[X_cols]

# Outcome
Y = df[Y_cols]

# Treatment
T = df[T_cols]

# Groups
G = df['ticker']

# Keep only the rows that have no missing value in any of the five pieces.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna()
W, X, Y, T, G = [part.loc[tmp.index] for part in (W, X, Y, T, G)]

In [9]:
# Catboost を使用
model_y = CatBoostRegressor(loss_function="RMSE", verbose=0)
model_t = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",  # 学習時の指標は log-loss
    verbose=0
)

In [10]:
seed = 42
n_boot = 500
# Seed the sampler from `seed` (previously `seed` was defined but unused and
# the value 42 was hard-coded here).
rng = np.random.RandomState(seed)

# Loop-invariant work: positional row indices of every group, computed once
# instead of scanning G for each sampled group in each replicate.
group_labels = G.values
unique_groups = np.unique(group_labels)
rows_by_group = {g: np.flatnonzero(group_labels == g) for g in unique_groups}

# Plain arrays reused by every replicate.
Y_arr = Y.values.ravel()
T_arr = T.values.ravel()
W_arr = W.values

# Cluster Bootstrap: resample whole groups with replacement, refit the DML
# model on the resampled rows, and record its ATE.
ate = list()
for b in tqdm.tqdm(range(n_boot)):
    sampled_groups = rng.choice(unique_groups, size=len(unique_groups), replace=True)
    idx = np.concatenate([rows_by_group[g] for g in sampled_groups])

    dml_b = LinearDML(
        model_y=model_y,
        model_t=model_t,
        discrete_treatment=True,
        # The splitter shuffles, so give it a per-replicate seed; with the
        # default random_state=None the folds differ on every run.
        cv=GroupStratifiedKFold(n_splits=3, random_state=seed + b),
    )
    dml_b.fit(
        Y_arr[idx],
        T_arr[idx],
        W=W_arr[idx],
        groups=group_labels[idx],
    )

    ate.append(dml_b.ate())

    # Use a dedicated name so the input CSV's `file_name` / `file_path`
    # variables are not overwritten by the loop.
    # NOTE(review): the 2-digit zero padding is kept for compatibility with
    # files already written, but indices >= 100 get 3 digits, so a plain
    # lexicographic sort of the file names is not in replicate order.
    model_file = models / f"{b:02d}.joblib"
    joblib.dump(dml_b, model_file, compress=3)

  0%|▋                                                                                                                                                                             | 2/500 [00:27<1:55:57, 13.97s/it]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x111c7d590>>
Traceback (most recent call last):
  File "/Users/tsuyos-u/Library/Caches/pypoetry/virtualenvs/dml-r0_m-wiA-py3.11/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 