In [1]:
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from econml.inference import BootstrapInference
from lightgbm import LGBMRegressor, LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.linear_model import RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import clone

In [2]:
from cdt.independence.stats import NormalizedHSIC
# from cdt.independence.kernnls import KCI

No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.


In [3]:
# Output directory for the wild-bootstrap model artifacts; created on first run.
models = Path("models").joinpath("wild_bootstrap")
models.mkdir(parents=True, exist_ok=True)

# Input CSV location, relative to the notebook.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path.joinpath(file_name)

# Load the data, normalise the headers ("Start Date" -> "start_date"),
# and order the rows by ticker and then by start date.
df = (
    pd.read_csv(file_path, parse_dates=["start date"])
    .rename(columns=lambda name: name.replace(" ", "_").lower())
    .sort_values(by=["ticker", "start_date"])
)

# Derived columns, appended in this order:
#   log_tobin_q_winsor        - natural log of tobin_q_winsor
#   year                      - calendar year of start_date
#   roa                       - net_income / total_assets
#   log_female_director_ratio - log of the ratio after adding 1e-3
#                               (presumably so that a ratio of 0 stays finite)
df = df.assign(
    log_tobin_q_winsor=lambda frame: np.log(frame["tobin_q_winsor"]),
    year=lambda frame: frame["start_date"].dt.year,
    roa=lambda frame: frame["net_income"].div(frame["total_assets"]),
    log_female_director_ratio=lambda frame: np.log(frame["female_director_ratio"] + 1e-3),
)

In [4]:
# 0/1 indicator columns for each year; drop_first removes the first level so
# the remaining columns are not collinear with an intercept.
year = pd.get_dummies(df["year"], drop_first=True, dtype="int")

# 0/1 indicator columns for each industry, with the "その他製品" column removed
# (presumably it serves as the reference category).
industry = pd.get_dummies(df["industry_name"], dtype="int").drop(columns="その他製品")

In [5]:
# Column-name lists that select the model inputs from `df`.

# Outcome.
Y_cols = ["log_tobin_q_winsor"]

# Controls. The two ownership variables are deliberately not listed here;
# they are placed in X_cols instead.
W_cols = [
    "board_size",
    "log_firm_age",
    "log_sales",
    "sales_growth",
    "tangible_assets",
    "leverage",
]

# Treatment.
T_cols = ["female_director_ratio"]

# Grouping key.
G_cols = ["ticker"]

# Explanatory variables passed as X.
X_cols = ["foreign_ownership", "managerial_ownership"]

In [6]:
# Per-ticker means of the control and X columns, broadcast back onto every row
# of `df` (transform keeps the original index); columns get a "_mean" suffix.
mundlak_W = (
    df.groupby(by="ticker")[W_cols + X_cols]
    .transform("mean")
    .add_suffix("_mean")
)

# The same per-ticker mean for the treatment column(s).
mundlak_T = (
    df.groupby(by="ticker")[T_cols]
    .transform("mean")
    .add_suffix("_mean")
)

In [7]:
# Control variables: raw controls, per-ticker means of the treatment and of the
# controls/X columns, then the year and industry dummies (joined on the index).
W = (
    df[W_cols]
    .join(mundlak_T)
    .join(mundlak_W)
    .join(year)
    .join(industry)
)

# Explanatory variables (industry dummies are intentionally not joined here).
X = df[X_cols]

# Outcome
Y = df[Y_cols]

# Treatment
T = df[T_cols]

# Groups
G = df["ticker"]

# Keep only the rows that are complete across every block, so that all five
# objects end up sharing exactly the same index.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna(axis=0, how="any")

W, X, Y, T, G = (block.loc[tmp.index] for block in (W, X, Y, T, G))

In [20]:
# data = np.column_stack((Y, T, X, W))
# cond_set = list(range(2, data.shape[1])) 

In [19]:
# kci = CIT(data, "kci")                   # (1) instantiate
# p_val = kci(0, 1, cond_set)                   # <- call with positional arguments
# print(f"KCI p-value = {p_val:.4g}")
# p_val = kci(X_id=0, Y_id=1, S_id=[2])    # (2) get the p-value
# print(f'p = {p_val:.4g}')

In [18]:
# Conditional independence is not satisfied
# np.column_stack(X, W)