In [3]:
import json
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from lightgbm import LGBMRegressor, LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.linear_model import RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import clone

In [5]:
models = Path("models") / "simple" / "threshold_effect"
models.mkdir(parents=True, exist_ok=True)

data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

df = pd.read_csv(file_path, parse_dates=['start date'])
df.columns = [col.replace(" ", "_").lower() for col in df.columns]
df = df.sort_values(by=['ticker', 'start_date'])

df["log_tobin_q_winsor"] = np.log(df['tobin_q_winsor'])
df["year"] = df.start_date.dt.year
df["roa"] = df["net_income"].div(df["total_assets"])
df["log_total_assets"] = np.log(df['total_assets'])

In [6]:
df = df.assign(lag_log_tobin_q_winsor=df.groupby(by=['ticker']).log_tobin_q_winsor.shift(-1))

In [7]:
df['female_threshold'] = df['female_director_ratio'].mask((df['female_director_ratio']<= 0.1) & (df['female_director_ratio']> 0), 0)
df['female_threshold'] = df['female_threshold'].mask((df['female_director_ratio']<= 0.2) & (df['female_director_ratio']> 0.1), 1)
df['female_threshold'] = df['female_threshold'].mask((df['female_director_ratio']<= 0.3) & (df['female_director_ratio']> 0.2), 2)
df['female_threshold'] = df['female_threshold'].mask((df['female_director_ratio']> 0.3), 3)

In [8]:
year = pd.get_dummies(df['year'], dtype='int')
year = year.drop(2015, axis=1)
industry = pd.get_dummies(df['industry_name'], dtype='int')
industry = industry.drop("その他製品", axis=1)

In [20]:
Y_cols = ['lag_log_tobin_q_winsor']
W_cols = [
    'board_size', 
    'log_firm_age', 
    'log_sales',
    'sales_growth', 
    'foreign_ownership', 
    'managerial_ownership',
    'tangible_assets', 
    'leverage',
    'roa',
#     'net_income',
#     'log_total_assets'
]
X_cols = []
# T_cols = ['female_director_ratio']
T_cols = ['female_director']
# T_cols = ['female_director_01', 'female_director_02', 'female_director_03', 'female_director_04']
T_cols = ['female_threshold']
G_cols = ['ticker']

In [21]:
mundlak_W = df.groupby(by='ticker')[W_cols].transform("mean")
mundlak_W.columns = [f"{col}_mean" for col in mundlak_W.columns]

mundlak_T = df.groupby(by='ticker')[T_cols].transform("mean")
mundlak_T.columns = [f"{col}_mean" for col in mundlak_T.columns]

In [22]:
# コントロール変数
W = df[W_cols]
# W = (W - W.mean()) / W.std()
W = W.join(mundlak_T).join(mundlak_W)
W = W.join(year).join(industry)

# 説明変数
X = df[X_cols]

# 出力
Y = df[Y_cols]

# 介入
T = df[T_cols]

# Groups
G = df['ticker']

tmp = pd.concat((W, X, Y, T, G), axis=1).dropna(how='any', axis=0)

W = W.loc[tmp.index]
X = X.loc[tmp.index]
Y = Y.loc[tmp.index]
T = T.loc[tmp.index]
G = G.loc[tmp.index]

In [23]:
T

Unnamed: 0,female_threshold
0,0.0
1,0.0
2,0.0
3,0.0
4,1.0
...,...
6474,1.0
6475,0.0
6476,0.0
6477,2.0


In [24]:
with open("best_params_y.json", mode='r') as f:
    best_params = json.load(f)
model_y = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params) 

with open("best_params_t.json", mode='r') as f:
    best_params = json.load(f)

model_t = LGBMClassifier(objective='multiclass', class_weight='balanced')

In [None]:
seed = 42
n_boot = 100
unique_groups = np.unique(G)
rng = np.random.RandomState(42)

# Cluster Bootstrap
ate = list()
for b in tqdm.tqdm(range(n_boot)):
    sampled_groups = rng.choice(unique_groups, size=len(unique_groups), replace=True)
    idx = np.concatenate([np.where(G == g)[0] for g in sampled_groups])

    dml_b = LinearDML(
        model_y = model_y,
        model_t = model_t, 
#        model_final=Ridge(alpha=1e-4, fit_intercept=False),
        discrete_treatment=True,
        cv = GroupKFold(n_splits=3),   # 同じ企業を foldに跨らせない
    )    
    dml_b.fit(Y.values.ravel()[idx],
              T.values.ravel()[idx],
#               X=X.values[idx],
              W=W.values[idx],
              groups=G.values[idx])

    ate.append(dml_b.ate())
    file_name = f"{b:02d}.joblib"
    file_path = models / file_name
    joblib.dump(dml_b, file_path, compress=3)

  0%|                                                                                                                                                                                        | 0/100 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4047
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 52
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


  7%|████████████▎                                                                                                                                                                   | 7/100 [01:17<17:36, 11.36s/it]

In [19]:
# T.values

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [2.],
       [2.]])