In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler

In [2]:
# Path.cwd() is the working directory of the kernel process, which is not
# necessarily the folder that contains this notebook. Every path below is
# therefore relative to wherever the kernel was started.
BASE_DIR = Path.cwd()
# Three levels above the working directory.
# NOTE(review): presumably the repository root (the parent of the `run1` package) - confirm.
ROOT_DIR = BASE_DIR.parent.parent.parent
# Folder the input workbook is read from.
DATA_DIR = ROOT_DIR / "run1" / "data"
# Alias of BASE_DIR.
CURRENT_DIR = BASE_DIR

In [3]:
# Read the input workbook from the data directory and report its dimensions.
source_file = DATA_DIR / "S02_data_exp.xlsx"
_df = pd.read_excel(source_file)
print(f"df.shape: {_df.shape}")

df.shape: (378, 9)


In [4]:
# Names of the regression target (response) and of the candidate predictors.
target_names = ["stress_value_center"]
feature_names = ["R", "W", "D", "position"]

# Fail loudly if the workbook lacks an expected column; otherwise the analysis
# would silently run on fewer predictors (or on an empty target frame).
missing = sorted((set(target_names) | set(feature_names)) - set(_df.columns))
if missing:
    raise KeyError(f"Expected columns not found in the input data: {missing}")

# Keep the columns in the order in which they appear in the workbook.
colsY = [c for c in _df.columns if c in target_names]
colsX = [c for c in _df.columns if c in feature_names]

_dfY = _df[colsY]
_dfX = _df[colsX]
print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['position', 'R', 'W', 'D']
Selected target columns: ['stress_value_center']
dfX.shape: (378, 4)
dfY.shape: (378, 1)


In [5]:
# Turn the selected feature/target frames into plain NumPy arrays.
_X, _Y = _dfX.to_numpy(), _dfY.to_numpy()
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 4)
_Y.shape: (378, 1)


In [6]:
# One independent standardizer for the features and one for the target.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Bundle the raw arrays, their column names and the scalers in the project's
# DataHandler.
# NOTE(review): DataHandler is defined in run1.lib.classes_ml; its exact
# contract is not visible from this notebook.
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    colsX=colsX,
    colsY=colsY,
    scalerX=feature_scaler,
    scalerY=target_scaler,
)

In [7]:
# Split/scale settings. With test_size = 0.0 nothing is held out; the handler
# prints "No test set, using all data for training." in that case.
idx = 1  # NOTE(review): not referenced in the visible cells - confirm before removing
random_state, test_size = 1, 0.0
data_handler.split_and_scale(test_size=test_size, random_state=random_state)

# Training portion as DataFrames, previewed below.
df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
display(df_X_train.head(), df_Y_train.head())

No test set, using all data for training.


Unnamed: 0,position,R,W,D
0,-0.5,-1.224745,0.0,-1.224745
1,-4.810966e-16,1.224745,1.224745,0.0
2,-1.0,0.0,-1.224745,-1.224745
3,-0.5,1.224745,1.224745,1.224745
4,-1.0,1.224745,0.0,-1.224745


Unnamed: 0,stress_value_center
0,-1.088213
1,0.678466
2,-0.49932
3,-0.05765
4,-0.646543


In [8]:
# Ordinary least squares on all predictors plus an explicit intercept column.
X = sm.add_constant(df_X_train)
model = sm.OLS(endog=df_Y_train, exog=X).fit()

# Plain-text report of the fit.
model_summary = model.summary()
print(model_summary)

# Coefficient table ordered by p-value, smallest (most significant) first.
df_table = model.summary2().tables[1].sort_values("P>|t|")
display(df_table)


                             OLS Regression Results                            
Dep. Variable:     stress_value_center   R-squared:                       0.290
Model:                             OLS   Adj. R-squared:                  0.282
Method:                  Least Squares   F-statistic:                     38.01
Date:                 Fri, 30 Jan 2026   Prob (F-statistic):           1.11e-26
Time:                         05:18:35   Log-Likelihood:                -471.74
No. Observations:                  378   AIC:                             953.5
Df Residuals:                      373   BIC:                             973.1
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -7.546e-17      0.044  -1.73e-1

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
position,-0.4963548,0.043641,-11.37346,6.197749999999999e-26,-0.582169,-0.410541
R,-0.1736327,0.043641,-3.978615,8.329439e-05,-0.259447,-0.087818
W,0.1011177,0.043641,2.317009,0.02104416,0.015304,0.186932
D,0.05338944,0.043641,1.223364,0.2219644,-0.032425,0.139204
const,-7.546047000000001e-17,0.043641,-1.729099e-15,1.0,-0.085814,0.085814


In [9]:
# Build the ranking table (feature, value, measure, rank) from the OLS p-values.
# `df_table` is expected to be sorted by p-value already (ascending).
#
# The intercept row is dropped BEFORE ranks are assigned, so ranks always run
# 1..n over the real features. Ranking first and filtering afterwards leaves a
# gap whenever the intercept is more significant than at least one feature.
ranking = (
    df_table.drop(index="const", errors="ignore")  # no-op if there is no intercept row
    .rename_axis("feature")  # name the index explicitly instead of relying on "index"
    .reset_index()[["feature", "P>|t|"]]
    .rename(columns={"P>|t|": "value"})
)
ranking["measure"] = "OLS_p_value"
ranking["rank"] = np.arange(1, len(ranking) + 1)
display(ranking)

Unnamed: 0,feature,value,measure,rank
0,position,6.197749999999999e-26,OLS_p_value,1
1,R,8.329439e-05,OLS_p_value,2
2,W,0.02104416,OLS_p_value,3
3,D,0.2219644,OLS_p_value,4


In [10]:
# Save the ranking table as an Excel file without the DataFrame index.
# The path is relative, so the file is written to the kernel's working directory.
ranking.to_excel("S01.xlsx", index=False)