In [197]:
# %% Imports
import ast
import itertools
from pathlib import Path

import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

from P08_feature_importances.T00_lib.classes_ml import (
    DataHandler,
    MyEval,
    MyUtil,
)
from P08_feature_importances.T00_lib.optuna_ml import OptunaUtil
from P08_feature_importances.T00_lib.utils import check_jupyter


In [198]:
BASE_DIR = Path.cwd()  # Current directory of the running file
DATA_DIR = BASE_DIR.parent / "T02_combine_features"
OPTUNA_DIR = BASE_DIR.parent / "T03_optuna"
CURRENT_DIR = BASE_DIR

In [199]:
study_info_filename = "S02_combine_study.xlsx"
study_info = pd.read_excel(OPTUNA_DIR / study_info_filename)
study_info["model_params"] = study_info["model_params"].apply(ast.literal_eval)
df = pd.read_excel(DATA_DIR / "S02_data_combined_loc.xlsx")
print(f"df.shape: {df.shape}")

df.shape: (378, 87)


In [200]:
_dfX = df.iloc[:, :-3]
_dfY = df.iloc[:, -3:]

# Extract features and targets
_X = _dfX.values
_Y = _dfY.values
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

# Use only the third target variable
_Y = _Y[:, 2:3]

_X.shape: (378, 84)
_Y.shape: (378, 3)


In [201]:
# Create DataHandler instance
data_handler = DataHandler(
    _X=_X, _Y=_Y, scalerX=StandardScaler(), scalerY=StandardScaler()
)

In [202]:
idx = 1
random_state = 1
test_size = 0.0
data_handler.split_and_scale(random_state=random_state, test_size=test_size)
X_train, Y_train = data_handler.get_train()

df_X_train = pd.DataFrame(X_train, columns=_dfX.columns)
df_Y_train = pd.DataFrame(Y_train, columns=["target"])

No test set, using all data for training.


In [203]:
print(X_train.shape)
print(Y_train.shape)

(378, 84)
(378, 1)


In [204]:
import numpy as np
# test_len = len(list(itertools.combinations(np.arange(84), 3)))
# print(test_len)

In [205]:
# Exhaustive feature selection using OLS
# This is computationally expensive for large feature sets
cols = _dfX.columns.tolist()
X_train_small = X_train[:, :10]
predictors = [i for i in range(X_train_small.shape[1])]
results = []
for k in range(1, len(predictors) + 1):
    for combo in itertools.combinations(predictors, k):
        X = sm.add_constant(X_train_small[:, list(combo)])
        model = sm.OLS(Y_train, X).fit()
        results.append(
            {
                "predictors": [cols[i] for i in combo],
                "n_vars": k,
                "r2": model.rsquared,
                "adj_r2": model.rsquared_adj,
                "f_pvalue": model.f_pvalue,
                "AIC": model.aic,
                "BIC": model.bic,
            }
        )

df_results = pd.DataFrame(results).sort_values(by="adj_r2", ascending=False)
df_results

Unnamed: 0,predictors,n_vars,r2,adj_r2,f_pvalue,AIC,BIC
932,"[R, W, D, position, location, Fx_location, Fz_...",7,0.319317,0.308309,1.871779e-28,941.316370,968.860629
768,"[R, W, D, position, Fx_location, Fz_location]",6,0.319317,0.308309,1.871779e-28,941.316370,968.860629
774,"[R, W, D, location, Fx_location, Fz_location]",6,0.319317,0.308309,1.871779e-28,941.316370,968.860629
852,"[sample_no, R, W, D, position, Fx_location, Fz...",7,0.320801,0.307951,7.144867e-28,942.491556,973.970709
858,"[sample_no, R, W, D, location, Fx_location, Fz...",7,0.320801,0.307951,7.144867e-28,942.491556,973.970709
...,...,...,...,...,...,...,...
9,[Mz_location],1,0.000015,-0.002645,9.402627e-01,1076.711878,1084.581666
149,"[D, Fx_location, Fy_location]",3,0.004888,-0.003094,6.073509e-01,1078.865390,1094.604966
151,"[D, Fx_location, Mz_location]",3,0.004277,-0.003710,6.582438e-01,1079.097471,1094.837047
49,"[Fx_location, Fy_location]",2,0.001265,-0.004062,7.887492e-01,1078.239120,1090.043803


In [206]:
# Analyze the best model
cols = df_results.iloc[0, :]["predictors"]
print(cols)
X = sm.add_constant(df_X_train[cols].values)
model = sm.OLS(Y_train, X).fit()
results = []
results.append(
    {
        "predictors": cols,
        "n_vars": len(cols),
        "r2": model.rsquared,
        "adj_r2": model.rsquared_adj,
        "f_pvalue": model.f_pvalue,
        "AIC": model.aic,
        "BIC": model.bic,
    }
)
results_df = pd.DataFrame(results)
results_df

['R', 'W', 'D', 'position', 'location', 'Fx_location', 'Fz_location']


Unnamed: 0,predictors,n_vars,r2,adj_r2,f_pvalue,AIC,BIC
0,"[R, W, D, position, location, Fx_location, Fz_...",7,0.319317,0.308309,1.8717790000000002e-28,941.31637,968.860629


In [207]:
# Analyze model summary (all predictors)
X = sm.add_constant(df_X_train)
model = sm.OLS(Y_train, X).fit()
results = []
results.append(
    {
        "predictors": cols,
        "n_vars": len(cols),
        "r2": model.rsquared,
        "adj_r2": model.rsquared_adj,
        "f_pvalue": model.f_pvalue,
        "AIC": model.aic,
        "BIC": model.bic,
    }
)
results_df = pd.DataFrame(results)
display(results_df)

# Analyze model summary (all predictors)
cols = df_X_train.columns.tolist()
df_table = model.summary2().tables[1]
df_table = df_table.sort_values(by="P>|t|", ascending=True)
display(df_table)


Unnamed: 0,predictors,n_vars,r2,adj_r2,f_pvalue,AIC,BIC
0,"[R, W, D, position, location, Fx_location, Fz_...",7,0.526878,0.437328,2.938041e-26,911.821532,1151.850078


Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Fz_location,7.212975e-01,0.192027,3.756221e+00,0.000205,0.343488,1.099107
location,-6.001011e-01,0.172779,-3.473233e+00,0.000586,-0.940039,-0.260163
position,-6.001011e-01,0.172779,-3.473233e+00,0.000586,-0.940039,-0.260163
Fz__weld__ratio_beyond_r_sigma__r_1,-4.547671e-01,0.131516,-3.457886e+00,0.000619,-0.713522,-0.196013
Mz_location,2.599427e+00,0.778713,3.338108e+00,0.000944,1.067329,4.131525
...,...,...,...,...,...,...
Fz__weld__ratio_beyond_r_sigma__r_2,-1.939831e-02,0.154110,-1.258728e-01,0.899912,-0.322607,0.283810
Fx__dwell__quantile__q_0.7,-1.211629e-02,0.106989,-1.132479e-01,0.909906,-0.222615,0.198382
"Mz__dwell__augmented_dickey_fuller__attr_""teststat""__autolag_""AIC""",-4.067971e-03,0.124930,-3.256189e-02,0.974044,-0.249866,0.241730
"Fy__weld__fft_coefficient__attr_""abs""__coeff_61",3.629789e-04,0.094707,3.832659e-03,0.996944,-0.185970,0.186696
