In [10]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler

In [11]:
# Path configuration.
# NOTE: Path.cwd() is the working directory of the kernel process, which is
# only the notebook's own folder when the kernel was started from there.
BASE_DIR = Path.cwd()
CURRENT_DIR = BASE_DIR

# Three levels above the working directory (presumably the repository root);
# the input data lives under <root>/run1/data.
ROOT_DIR = BASE_DIR.parent.parent.parent
DATA_DIR = ROOT_DIR / "run1" / "data"

In [12]:
# Load the experimental dataset from the shared data directory and report
# its (rows, columns) shape as a quick sanity check.
data_file = DATA_DIR / "S02_data_exp.xlsx"
_df = pd.read_excel(data_file)
print(f"df.shape: {_df.shape}")

df.shape: (378, 9)


In [13]:
# Names of the columns used as model target and model inputs.
wanted_targets = ["stress_value_center"]
wanted_features = ["R", "W", "D", "position"]
target_pattern = re.compile(r"stress_value")

# Targets: columns whose name contains "stress_value", narrowed down to the
# explicitly requested ones.
colsY = [
    name
    for name in _df.columns
    if target_pattern.search(name) and name in wanted_targets
]

# Features: requested columns, kept in the order they appear in the sheet
# (not in the order of the list above).
colsX = [name for name in _df.columns if name in wanted_features]

_dfX = _df[colsX]
_dfY = _df[colsY]

print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['position', 'R', 'W', 'D']
Selected target columns: ['stress_value_center']
dfX.shape: (378, 4)
dfY.shape: (378, 1)


In [14]:
# %% Extract features and targets
# Pull the underlying 2-D arrays out of the feature / target frames and
# print their shapes to confirm the row counts match.
_X, _Y = _dfX.values, _dfY.values
for label, array in (("_X", _X), ("_Y", _Y)):
    print(f"{label}.shape: {array.shape}")

_X.shape: (378, 4)
_Y.shape: (378, 1)


In [15]:
# Wrap the raw arrays, their column names, and one independent
# StandardScaler per side (features / targets) in the project's DataHandler.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    colsX=colsX,
    colsY=colsY,
    scalerX=feature_scaler,
    scalerY=target_scaler,
)

In [16]:
# Split / scale settings.
idx = 1  # NOTE(review): not referenced in the cells visible here - confirm it is still needed
random_state = 1
test_size = 0.0  # no hold-out set is requested; the handler reports using all data for training

# Scale the data, then fetch the training portion as DataFrames and preview it.
data_handler.split_and_scale(test_size=test_size, random_state=random_state)
df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
display(df_X_train.head(), df_Y_train.head())

No test set, using all data for training.


Unnamed: 0,position,R,W,D
0,-0.5,-1.224745,0.0,-1.224745
1,-4.810966e-16,1.224745,1.224745,0.0
2,-1.0,0.0,-1.224745,-1.224745
3,-0.5,1.224745,1.224745,1.224745
4,-1.0,1.224745,0.0,-1.224745


Unnamed: 0,stress_value_center
0,-1.088213
1,0.678466
2,-0.49932
3,-0.05765
4,-0.646543


In [17]:
# Fit a cross-validated Lasso (cv=5) on the scaled training data.
# The estimator expects a 1-D target, so the single-column target frame is
# flattened first.
_y_train_1d = df_Y_train.values.ravel()
lasso_cv = LassoCV(cv=5, random_state=0, max_iter=10000)
model = lasso_cv.fit(df_X_train, _y_train_1d)

In [18]:
# Rank features by the magnitude of their Lasso coefficient.
# Features whose |coefficient| falls below the threshold are left out of
# the ranking entirely.
selector = SelectFromModel(model, prefit=True, threshold=1e-5)
feature_idx = selector.get_support()  # boolean mask over the input columns

feature_names = df_X_train.columns[feature_idx]
feature_importance = abs(model.coef_)[feature_idx]

feature_ranking = (
    pd.DataFrame({"Feature": feature_names, "Importance": feature_importance})
    .sort_values(by="Importance", ascending=False)
    .reset_index(drop=True)
)
display(feature_ranking)

Unnamed: 0,Feature,Importance
0,position,0.495858
1,R,0.173136
2,W,0.100621
3,D,0.052893


In [19]:
# Feature selection from the fitted Lasso coefficients.
# The tiny threshold keeps every feature with a non-negligible coefficient
# and discards those the Lasso penalty shrank to (numerically) zero.
sel = SelectFromModel(model, threshold=1e-5, prefit=True)

# Map the boolean support mask back to the training frame's column names.
support_mask = sel.get_support()
selected_features = df_X_train.columns[support_mask]

print(f"Selected features: {list(selected_features)}")

Selected features: ['position', 'R', 'W', 'D']


In [20]:
# Reshape the Lasso ranking into the export layout:
#   feature | value | measure | rank
# `feature_ranking` is already sorted by descending importance, so the
# 1-based rank is simply the row position. Built as a single chain so the
# cell yields the same frame every time it is re-run. numpy is imported in
# the notebook's top import cell rather than here.
ranking = feature_ranking.rename(
    columns={"Feature": "feature", "Importance": "value"}
).assign(
    measure="Lasso_coefficient",
    rank=lambda frame: np.arange(1, len(frame) + 1),
)
ranking

Unnamed: 0,feature,value,measure,rank
0,position,0.495858,Lasso_coefficient,1
1,R,0.173136,Lasso_coefficient,2
2,W,0.100621,Lasso_coefficient,3
3,D,0.052893,Lasso_coefficient,4


In [21]:
# Persist the ranking table next to the notebook. The destination is built
# from CURRENT_DIR (set in the path-configuration cell) instead of a bare
# relative filename, so the output location is explicit and does not depend
# on the working directory at the moment this cell runs.
ranking.to_excel(CURRENT_DIR / "S01.xlsx", index=False)