In [1]:
import json
import joblib
from pathlib import Path

from econml.dml import LinearDML, DML
from econml.inference import BootstrapInference
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
import tqdm

from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GroupKFold

import seaborn as sns

In [2]:
# Directory for fitted-model artifacts; created if it does not exist yet.
models = Path("models", "foreign_ownership")
models.mkdir(parents=True, exist_ok=True)

# Input CSV (winsorized 2015-2023 panel, per the file name).
data_path = Path("../data")
file_name = "ToAnalysis_Winsorized_2015_2023_With_Profit_Asset.csv"
file_path = data_path.joinpath(file_name)

# Load, normalise headers (spaces -> underscores, lower-case), order rows by
# firm and date, then append the derived columns used downstream.
df = (
    pd.read_csv(file_path, parse_dates=['start date'])
    .rename(columns=lambda col: col.replace(" ", "_").lower())
    .sort_values(by=['ticker', 'start_date'])
    .assign(
        # natural log of the winsorized Tobin's q
        log_tobin_q_winsor=lambda d: np.log(d["tobin_q_winsor"]),
        # calendar year of the period start date
        year=lambda d: d["start_date"].dt.year,
        # return on assets = net income / total assets
        roa=lambda d: d["net_income"] / d["total_assets"],
    )
)

In [3]:
# sns.pairplot(data=df.select_dtypes(float))

In [4]:
# df

In [5]:
# One-hot encode the calendar year; the 2015 column is dropped so that it
# serves as the reference category.
year = pd.get_dummies(df["year"], dtype=int).drop(columns=2015)

# One-hot encode the industry name; the "その他製品" column is dropped so that
# it serves as the reference category.
industry = pd.get_dummies(df["industry_name"], dtype=int).drop(columns="その他製品")

In [6]:
# Outcome column.
Y_cols = ["log_tobin_q_winsor"]

# Control columns (their firm-level means are added in a later cell).
W_cols = [
    "board_size",
    "log_firm_age",
    "log_sales",
    "sales_growth",
    "foreign_ownership",
    "managerial_ownership",
    "tangible_assets",
    "leverage",
    "roa",
]

# No X columns are selected.
X_cols = list()

# Treatment column. Alternative kept for reference: 'female_director_ratio'.
T_cols = ["female_director"]

# Grouping column (one group per firm ticker).
G_cols = ["ticker"]

In [7]:
# Per-ticker means of the controls and of the treatment, broadcast back onto
# every row of that ticker and labelled with a "_mean" suffix.
by_firm = df.groupby("ticker")
mundlak_W = by_firm[W_cols].transform("mean").add_suffix("_mean")
mundlak_T = by_firm[T_cols].transform("mean").add_suffix("_mean")

In [8]:
# Control variables: the raw controls (left unstandardized), followed by the
# per-ticker means of the treatment and of the controls, then the year and
# industry dummies.
W = df[W_cols].join(mundlak_T).join(mundlak_W).join(year).join(industry)

# Explanatory variables (X_cols is currently empty, so this frame has no columns).
X = df[X_cols]

# Outcome
Y = df[Y_cols]

# Treatment
T = df[T_cols]

# Groups
G = df["ticker"]

# Complete-case filter: a row survives only if it has no missing value in any
# of W, X, Y, T or G; every piece is then restricted to the surviving index.
tmp = pd.concat([W, X, Y, T, G], axis=1).dropna()
W, X, Y, T, G = (piece.loc[tmp.index] for piece in (W, X, Y, T, G))

In [9]:
# LinearModels
# with open("best_params.json", mode='r') as f:
#     best_params = json.load(f)

In [10]:
# with open("best_params_y.json", mode='w') as f:
#     json.dump(best_params, f) 

In [11]:
# Model passed as `model_y`: LightGBM regressor configured from best_params_y.json.
best_params = json.loads(Path("best_params_y.json").read_text())
model_y = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params)

# Model passed as `model_t`: LightGBM regressor configured from best_params_t.json.
# `best_params` is rebound here, so after this cell it holds the model_t settings.
best_params = json.loads(Path("best_params_t.json").read_text())
model_t = LGBMRegressor(force_row_wise=True, verbose=-1, **best_params)

In [12]:
# LinearDML with the two LightGBM models and a 3-split GroupKFold; the ticker
# array is passed as `groups` to fit.
dml0 = LinearDML(
    model_y=model_y,
    model_t=model_t,
    cv=GroupKFold(n_splits=3),
)

# Flatten the single-column outcome / treatment frames to 1-D arrays.
y_obs = Y.to_numpy().ravel()
t_obs = T.to_numpy().ravel()

# cache_values=True is required so that `residuals_` is available afterwards.
dml0.fit(y_obs, t_obs, W=W.to_numpy(), groups=G.to_numpy(), cache_values=True)

# The first two entries of `residuals_` are the outcome and treatment residuals.
y_res, t_res = dml0.residuals_[:2]

# Observed minus residual (presumably the first-stage predictions).
y_hat = y_obs - y_res
t_hat = t_obs - t_res

In [22]:
# year
# panel OLS

In [20]:
# (dml0.ate() * 3)

In [14]:
# cv = GroupKFold(n_splits=5)

In [24]:
# from sklearn.metrics import root_mean_squared_error

In [16]:
# Y = T

In [23]:
"""
mse = []
for train, test in cv.split(Y, W, G):
    train_Y = Y.loc[train]
    train_W = W.loc[train]
    train_G = G.loc[train]

    model = LGBMRegressor(force_row_wise=True, verbose=-1)
    model.fit(train_W.values, train_Y.values)
    pred_Y = model.predict(W.loc[test].values)
    mse.append(root_mean_squared_error(Y.loc[test], pred_Y))

mse = pd.Series(mse)
"""

'\nmse = []\nfor train, test in cv.split(Y, W, G):\n    train_Y = Y.loc[train]\n    train_W = W.loc[train]\n    train_G = G.loc[train]\n\n    model = LGBMRegressor(force_row_wise=True, verbose=-1)\n    model.fit(train_W.values, train_Y.values)\n    pred_Y = model.predict(W.loc[test].values)\n    mse.append(root_mean_squared_error(Y.loc[test], pred_Y))\n\nmse = pd.Series(mse)\n'

In [None]:
# mse.mean()
# def params(X, Y, G, **params):
#     cv = GroupKFold(n_splits=5)    

In [None]:
# def eval(X, Y, G, **params):  

In [None]:
# industry

In [None]:
# Display the average treatment effect reported by the fitted `dml0` estimator.
dml0.ate()


In [None]:
def save_params(params, path):
    """Write ``params`` to ``path`` as JSON without risking a truncated file.

    The payload is serialized *before* the file is opened for writing, so if
    serialization fails (or the caller could not produce ``params`` at all)
    an existing file at ``path`` keeps its previous content.

    Parameters
    ----------
    params : JSON-serializable object (here: a dict of hyperparameters).
    path : str or path-like destination file.

    Raises
    ------
    TypeError
        If ``params`` is not JSON-serializable; ``path`` is left untouched.
    """
    payload = json.dumps(params)
    with open(path, mode='w') as f:
        f.write(payload)


# `study` is not created anywhere in this notebook (presumably it comes from a
# separate tuning session - TODO confirm). Previously this cell opened
# best_params_t.json in 'w' mode first and only then touched `study`, so on a
# fresh kernel it raised NameError *and* left the file empty, breaking the cell
# that loads it. Only write when a study is actually available.
if "study" in globals():
    save_params(study.best_params, "best_params_t.json")
else:
    print("No `study` in this session; best_params_t.json left unchanged.")

In [None]:
# Robustness tests