In [1]:
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from econml.inference import BootstrapInference
from lightgbm import LGBMRegressor, LGBMClassifier
import numpy as np
import pandas as pd
import tqdm

from sklearn.linear_model import Ridge, LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold
from sklearn.base import clone

In [2]:
def sigmoid(x, c, gamma):
    """Logistic curve centred at ``c`` with steepness ``gamma``.

    Evaluates ``1 / (1 + exp(-gamma * (x - c)))``.

    Args:
        x: Value(s) to transform; anything NumPy arithmetic accepts
            (scalar, array, ...).
        c: Midpoint of the curve; the result is exactly 0.5 when ``x == c``.
        gamma: Slope factor applied to ``x - c`` inside the exponential.

    Returns:
        The logistic transform of ``x``, in the same scalar/array form that
        the NumPy arithmetic produces for the given input.
    """
    return 1 / (1 + np.exp(-gamma * (x - c)))

In [3]:
# Output directory for model artefacts (created if it does not exist yet).
models = Path("models")
models.mkdir(parents=True, exist_ok=True)

# Location of the input CSV.
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path / file_name

df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    # Normalise headers: spaces become underscores, everything lower-case.
    .rename(columns=lambda name: name.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
    # Derived columns: log of the winsorised Tobin's Q column, the calendar
    # year of start_date, and net income divided by total assets.
    .assign(
        log_tobin_q_winsor=lambda d: np.log(d['tobin_q_winsor']),
        year=lambda d: d['start_date'].dt.year,
        roa=lambda d: d["net_income"] / d["total_assets"],
    )
)

In [28]:
# Parameters of the smooth threshold passed to sigmoid(): `c` is the ratio at
# which the transformed value equals 0.5, `gamma` scales the distance from `c`
# inside the exponential (larger values give a sharper transition).
c = 0.25
gamma = 60

# Transform the whole column in one vectorised call: sigmoid() consists only of
# NumPy arithmetic, so it accepts the Series directly and a per-row Python-level
# .apply(lambda ...) is unnecessary.
df['female_director_ratio_threshold'] = sigmoid(df['female_director_ratio'], c, gamma)
# Hard (0/1) threshold alternative, currently disabled:
# df['female_director_ratio_threshold'] = df['female_director_ratio'].apply(lambda x: 1 if x >= c else 0)

# Optional per-threshold output directory, currently disabled:
# models = models / f"threshold_{c:1.2f}"
# models.mkdir(parents=True, exist_ok=True)

In [29]:
# One-hot encode the year; drop_first removes the first level, which therefore
# acts as the reference year.
year = pd.get_dummies(df['year'], dtype='int', drop_first=True)

# One-hot encode the industry and drop one named category (the label below,
# roughly "other products") so that it serves as the reference industry.
industry = pd.get_dummies(df['industry_name'], dtype='int').drop(columns="その他製品")

In [41]:
# Outcome column.
Y_cols = ["log_tobin_q_winsor"]

# Treatment column (the transformed female-director ratio).
T_cols = ["female_director_ratio_threshold"]

# Grouping key (one group per ticker).
G_cols = ["ticker"]

# Columns supplied as X to the DML fit. Both of them also appear in W_cols.
X_cols = ["foreign_ownership", "managerial_ownership"]

# Control columns. 'roa' and 'ROE' are intentionally left out for now.
W_cols = [
    "board_size",
    "log_firm_age",
    "log_sales",
    "sales_growth",
    "foreign_ownership",
    "managerial_ownership",
    "tangible_assets",
    "leverage",
]

In [42]:
# Per-ticker means of the controls and of the treatment, broadcast back onto
# every row of that ticker (transform keeps the original row index). Each
# resulting column is named "<original>_mean".
by_ticker = df.groupby(by='ticker')

mundlak_W = by_ticker[W_cols].transform("mean").add_suffix("_mean")
mundlak_T = by_ticker[T_cols].transform("mean").add_suffix("_mean")

In [53]:
# Control variables: raw controls, per-ticker means of treatment and controls,
# plus the year and industry dummies.
W = (
    df[W_cols]
    .join(mundlak_T)
    .join(mundlak_W)
    .join(year)
    .join(industry)
)

# Explanatory variables passed as X (industry dummies are not joined here).
X = df[X_cols]

# Outcome
Y = df[Y_cols]

# Treatment
T = df[T_cols]

# Groups
G = df['ticker']

# Keep only the rows that are complete across every block, so that all five
# objects stay aligned on the same index.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna()
W, X, Y, T, G = (part.loc[tmp.index] for part in (W, X, Y, T, G))

In [54]:
# Both nuisance models (outcome and treatment) are LightGBM regressors built
# with the same settings.
# Alternatives left disabled in earlier iterations: LinearRegression() for
# either model; LogisticRegression(max_iter=1000, C=10, penalty='l2') or
# LGBMClassifier(class_weight='balanced') for model_t; and the
# discrete_treatment=True option of LinearDML.
lgbm_kwargs = {"force_row_wise": True, "verbose": -1}
model_y = LGBMRegressor(**lgbm_kwargs)
model_t = LGBMRegressor(**lgbm_kwargs)

# Group-aware 3-fold cross-fitting: the same firm must not span several folds.
dml = LinearDML(
    model_y=model_y,
    model_t=model_t,
    cv=GroupKFold(n_splits=3),
)

In [55]:
# Fit the estimator on plain NumPy arrays: the single-column outcome and
# treatment frames are flattened to 1-D, and the ticker labels are supplied as
# groups for the group-aware cross-fitting splitter.
dml.fit(
    Y.to_numpy().ravel(),
    T.to_numpy().ravel(),
    X=X.to_numpy(),
    W=W.to_numpy(),
    groups=G.to_numpy(),
)

<econml.dml.dml.LinearDML at 0x323f98b50>

In [56]:
# Inference summary for the constant marginal effect averaged over the rows of X;
# the cell output reports the point estimate, standard error, z-statistic,
# p-value and confidence interval.
# NOTE(review): the model was fitted on X.values (ndarray) while a DataFrame is
# passed here — presumably converted internally; confirm.
dml.const_marginal_ate_inference(X=X)

mean_point,stderr_mean,zstat,pvalue,ci_mean_lower,ci_mean_upper
0.122,0.053,2.286,0.022,0.017,0.226

std_point,pct_point_lower,pct_point_upper
0.045,-0.023,0.178

stderr_point,ci_point_lower,ci_point_upper
0.069,-0.105,0.305


In [61]:
# dml.coef_
# 0.5
# dml.const_marginal_ate(X=X)

In [58]:
# (np.exp(0.15 * 0.9) - 1) * 0.1
# sigmoid(0.3, 0.25, 50)

In [50]:
# pd.Series(dml.coef_)

In [60]:
# y = sigmoid(np.linspace(0, 0.5, 100), 0.25, 70)

In [59]:
# import matplotlib.pyplot as plt
# plt.plot(np.linspace(0, 0.5, 100), y)