In [1]:
import json
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from lightgbm import LGBMRegressor, LGBMClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.linear_model import RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import clone

In [2]:
# Output directory for fitted artefacts; created on demand.
models = Path("models", "simple", "female_director_ratio_lag_01")
models.mkdir(parents=True, exist_ok=True)

# Location of the input panel.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

# Load the panel, normalise the column names (spaces -> underscores, lower
# case), order the rows by ticker and start date, and append derived columns.
df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    .rename(columns=lambda name: name.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
    .assign(
        log_tobin_q_winsor=lambda d: np.log(d['tobin_q_winsor']),
        year=lambda d: d['start_date'].dt.year,
        # Return on assets: net income divided by total assets.
        roa=lambda d: d['net_income'].div(d['total_assets']),
        log_total_assets=lambda d: np.log(d['total_assets']),
    )
)

In [3]:
# NOTE: despite the `lag_` prefix, shift(-1) pulls the value from the NEXT row
# of the same ticker into the current row (the last row of each ticker gets NaN).
next_row_log_q = df.groupby('ticker')['log_tobin_q_winsor'].shift(-1)
df = df.assign(lag_log_tobin_q_winsor=next_row_log_q)

In [42]:
# Bucket the female-director ratio into ordinal classes:
#   (0, 0.1] -> 0,  (0.1, 0.2] -> 1,  (0.2, 0.3] -> 2,  > 0.3 -> 3.
# Rows that match no bucket (e.g. a ratio of exactly 0, or NaN) keep their
# original ratio value, so the resulting column is float.
ratio = df['female_director_ratio']
df['female_threshold'] = np.select(
    [
        (ratio > 0) & (ratio <= 0.1),
        (ratio > 0.1) & (ratio <= 0.2),
        (ratio > 0.2) & (ratio <= 0.3),
        ratio > 0.3,
    ],
    [0, 1, 2, 3],
    default=ratio,
)
# Unused alternative encodings:
# df['female_threshold'] = df['female_threshold'].mask(df['female_director_ratio']< 0.1, 0)
# df['female_class'] = df['female_director'].mask(df['female_director']>=4, 4)

In [44]:
# Display the derived treatment-class column as a quick sanity check.
df['female_threshold']

0       0.0
1       0.0
2       0.0
3       0.0
4       1.0
       ... 
6475    0.0
6476    0.0
6477    2.0
6478    2.0
6479    2.0
Name: female_threshold, Length: 6480, dtype: float64

In [45]:
"""
df['female_director_01'] = df['female_director_ratio'].where(df['female_director']<=0.1, 0)
df['female_director_01'] = df['female_director_01'].mask(df['female_director_01']==1, 1)

df['female_director_02'] = df['female_director'].where(df['female_director']==2, 0)
df['female_director_02'] = df['female_director_02'].mask(df['female_director_02']==2, 1)

df['female_director_03'] = df['female_director'].where(df['female_director']>=3, 0)
df['female_director_03'] = df['female_director_03'].mask(df['female_director_03']>=3, 1)
"""

"\ndf['female_director_01'] = df['female_director_ratio'].where(df['female_director']<=0.1, 0)\ndf['female_director_01'] = df['female_director_01'].mask(df['female_director_01']==1, 1)\n\ndf['female_director_02'] = df['female_director'].where(df['female_director']==2, 0)\ndf['female_director_02'] = df['female_director_02'].mask(df['female_director_02']==2, 1)\n\ndf['female_director_03'] = df['female_director'].where(df['female_director']>=3, 0)\ndf['female_director_03'] = df['female_director_03'].mask(df['female_director_03']>=3, 1)\n"

In [46]:
# df['female_director'][df['female_director_03']==1].sort_values()

In [47]:
# Integer dummy columns for year and industry. One level of each (year 2015
# and the industry named in the string literal below) is dropped, leaving it
# as the omitted baseline.
year = pd.get_dummies(df['year'], dtype='int').drop(columns=2015)
industry = pd.get_dummies(df['industry_name'], dtype='int').drop(columns="その他製品")

In [58]:
# Column roles for the estimation below.

# Outcome column (alternative outcome kept for reference: ['roa']).
Y_cols = ['lag_log_tobin_q_winsor']

# Control columns.
W_cols = [
    'board_size',
    'log_firm_age',
    'log_sales',
    'sales_growth',
    'foreign_ownership',
    'managerial_ownership',
    'tangible_assets',
    'leverage',
    'roa',
    # Currently excluded:
    # 'net_income',
    # 'log_total_assets',
]

# Explanatory columns (none at the moment).
X_cols = []

# Treatment column. Exactly one assignment is kept so the active treatment is
# unambiguous; the previous dead store `T_cols = ['female_director']` was
# overwritten immediately and has been removed.
# Alternatives tried earlier:
#   ['female_director_ratio']
#   ['female_director']
#   ['female_director_01', 'female_director_02', 'female_director_03', 'female_director_04']
T_cols = ['female_threshold']

# Grouping column.
G_cols = ['ticker']

In [59]:
# Per-ticker means of the control and treatment columns, broadcast back to
# every row of the ticker and labelled with a "_mean" suffix.
mundlak_W = df.groupby('ticker')[W_cols].transform("mean").add_suffix("_mean")
mundlak_T = df.groupby('ticker')[T_cols].transform("mean").add_suffix("_mean")

In [60]:
# Control variables: raw controls, then the per-ticker means of the treatment
# and of the controls, then the year and industry dummies.
# Standardising the raw controls is currently disabled:
# W = (W - W.mean()) / W.std()
W = df[W_cols].join(mundlak_T).join(mundlak_W).join(year).join(industry)

# Explanatory variables
X = df[X_cols]

# Outcome
Y = df[Y_cols]

# Treatment (intervention)
T = df[T_cols]

# Groups
G = df['ticker']

# Keep only the rows that are complete across every block, so that all five
# objects share exactly the same index.
tmp = pd.concat((W, X, Y, T, G), axis=1).dropna()
W, X, Y, T, G = (block.loc[tmp.index] for block in (W, X, Y, T, G))

In [61]:
# Outcome model: LightGBM regressor configured from the stored parameters.
best_params = json.loads(Path("best_params_y.json").read_text())
model_y = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params)

# NOTE(review): the stored treatment-model parameters are loaded here but are
# not passed to the classifier below; only the commented-out regressor
# alternative consumes them.
best_params = json.loads(Path("best_params_t.json").read_text())

# Treatment model: multiclass LightGBM classifier with balanced class weights.
model_t = LGBMClassifier(class_weight='balanced', objective='multiclass')
# model_t = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params) # LinearRegression()

In [62]:
# LinearDML with a discrete treatment, using the outcome/treatment models
# defined above and a 3-split GroupKFold.
dml0 = LinearDML(
    model_y=model_y,
    model_t=model_t,
    discrete_treatment=True,
    cv=GroupKFold(n_splits=3),
)

# Fit on the complete-case arrays. X is not passed (see the disabled argument);
# the ticker values are supplied as `groups`, and cache_values=True is set.
dml0.fit(
    Y.to_numpy().ravel(),
    T=T.to_numpy(),
    # X=X.to_numpy(),
    W=W.to_numpy(),
    groups=G.to_numpy(),
    cache_values=True,
)

# Disabled: recover fitted values from the cached residuals.
# y_res, t_res, _, _ = dml0.residuals_
# y_hat = Y.values.ravel() - y_res
# t_hat = T.values.ravel() - t_res

<econml.dml.dml.LinearDML at 0x333c2b2d0>