In [41]:
from pathlib import Path
import pandas as pd

In [42]:
# Working directory of the kernel process. Note that Path.cwd() is the
# process's current directory, not necessarily the folder holding this notebook.
CURRENT_DIR = Path.cwd()

# Two levels above the working directory; used as the base for all input paths.
ROOT_DIR = CURRENT_DIR.parent.parent

print(
    f"Current Directory: {CURRENT_DIR}",
    f"Root Directory: {ROOT_DIR}",
    sep="\n",
)

Current Directory: c:\Users\admin\Coding\research\weld-ml\run1\Q01_agg
Root Directory: c:\Users\admin\Coding\research\weld-ml


In [43]:
# One entry per result workbook: (task folder, workbook file, subtype label).
# Every workbook listed here belongs to the "P01_no_af" experiment of run1.
_result_sources = [
    ("T21_ols", "S01.xlsx", "ols"),
    ("T22_lasso", "S01.xlsx", "lasso"),
    ("T23_shap_ml", "S02.xlsx", "shap_ml"),
    ("T24_shap_tabPFN", "S02.xlsx", "shap_tabPFN"),
]

# Expanded into the list of dicts (path + labels) consumed by the loading step.
load_infos = [
    {
        "path": ROOT_DIR / "run1" / "P01_no_af" / task_dir / workbook,
        "experiment": "P01_no_af",
        "experiment_subtype": subtype,
    }
    for task_dir, workbook, subtype in _result_sources
]

In [44]:
# Read each workbook, tag its rows with the experiment labels from load_infos,
# and collect the tagged frames for a single concatenation at the end.
df_arr = []
for info in load_infos:
    tagged = pd.read_excel(info["path"]).assign(
        experiment=info["experiment"],
        experiment_subtype=info["experiment_subtype"],
    )
    df_arr.append(tagged)
    print(f"Loaded data for experiment: {info['experiment']}")

# Stack all sources into one long table with a fresh 0..n-1 index.
df = pd.concat(df_arr, ignore_index=True)

Loaded data for experiment: P01_no_af
Loaded data for experiment: P01_no_af
Loaded data for experiment: P01_no_af
Loaded data for experiment: P01_no_af


In [45]:
# Display the concatenated table (all sources stacked, with the two label columns added).
df

Unnamed: 0,feature,value,measure,rank,experiment,experiment_subtype
0,position,6.197749999999999e-26,OLS_p_value,1,P01_no_af,ols
1,R,8.329439e-05,OLS_p_value,2,P01_no_af,ols
2,W,0.02104416,OLS_p_value,3,P01_no_af,ols
3,D,0.2219644,OLS_p_value,4,P01_no_af,ols
4,position,0.4958584,Lasso_coefficient,1,P01_no_af,lasso
5,R,0.1731363,Lasso_coefficient,2,P01_no_af,lasso
6,W,0.1006214,Lasso_coefficient,3,P01_no_af,lasso
7,D,0.05289309,Lasso_coefficient,4,P01_no_af,lasso
8,position,0.4829459,SHAP_importance,1,P01_no_af,shap_ml
9,R,0.1478022,SHAP_importance,2,P01_no_af,shap_ml


In [46]:
def select_rank(df_in, p_threshold=0.05, top_n=20):
    """Return the best-ranked rows of one (experiment, experiment_subtype) group.

    Intended as the callable for ``groupby([...]).apply``: the group key is
    read from ``df_in.name`` (index 0 = experiment, index 1 = subtype).

    Parameters
    ----------
    df_in : pandas.DataFrame
        Rows of a single group. Must have a ``rank`` column, and a ``value``
        column when the subtype is ``"ols"`` (where it holds the p-value).
    p_threshold : float, default 0.05
        For the ``"ols"`` subtype only, rows whose ``value`` exceeds this
        threshold are discarded before ranking. Ignored for other subtypes.
    top_n : int, default 20
        Maximum number of rows to keep after sorting by ``rank``.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of the group, ordered by ascending ``rank``.
    """
    experiment = df_in.name[0]
    experiment_subtype = df_in.name[1]
    print(f"Selecting top ranks for Experiment: {experiment}, Measure: {experiment_subtype}")

    # Only OLS rows carry p-values, so the significance cut applies to them alone.
    if experiment_subtype == "ols":
        df_in = df_in[df_in["value"] <= p_threshold]

    return df_in.sort_values(by=["rank"], ascending=[True]).head(top_n)


# Run select_rank on every (experiment, experiment_subtype) group and flatten
# the result back into a plain table with the group keys as ordinary columns.
df_top = (
    df.groupby(["experiment", "experiment_subtype"])
    .apply(select_rank, include_groups=False)
    # Discard the innermost index level (the original row labels) by position,
    # rather than by the auto-generated "level_2" name, which only exists
    # when the source index happens to be unnamed.
    .droplevel(-1)
    .reset_index()
)
df_top

Selecting top ranks for Experiment: P01_no_af, Measure: lasso
Selecting top ranks for Experiment: P01_no_af, Measure: ols
Selecting top ranks for Experiment: P01_no_af, Measure: shap_ml
Selecting top ranks for Experiment: P01_no_af, Measure: shap_tabPFN


Unnamed: 0,experiment,experiment_subtype,feature,value,measure,rank
0,P01_no_af,lasso,position,0.4958584,Lasso_coefficient,1
1,P01_no_af,lasso,R,0.1731363,Lasso_coefficient,2
2,P01_no_af,lasso,W,0.1006214,Lasso_coefficient,3
3,P01_no_af,lasso,D,0.05289309,Lasso_coefficient,4
4,P01_no_af,ols,position,6.197749999999999e-26,OLS_p_value,1
5,P01_no_af,ols,R,8.329439e-05,OLS_p_value,2
6,P01_no_af,ols,W,0.02104416,OLS_p_value,3
7,P01_no_af,shap_ml,position,0.4829459,SHAP_importance,1
8,P01_no_af,shap_ml,R,0.1478022,SHAP_importance,2
9,P01_no_af,shap_ml,W,0.06939057,SHAP_importance,3


In [47]:
# Reshape to one row per feature and one rank column per
# (experiment, experiment_subtype) pair. pivot_table aggregates with its
# default (mean), so ranks come back as floats and missing pairs are NaN.
pivot_spec = {
    "index": ["feature"],
    "columns": ["experiment", "experiment_subtype"],
    "values": ["rank"],
}
df_pivot = pd.pivot_table(df_top, **pivot_spec)
df_pivot

Unnamed: 0_level_0,rank,rank,rank,rank
experiment,P01_no_af,P01_no_af,P01_no_af,P01_no_af
experiment_subtype,lasso,ols,shap_ml,shap_tabPFN
feature,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
D,4.0,,4.0,4.0
R,2.0,2.0,2.0,2.0
W,3.0,3.0,3.0,3.0
position,1.0,1.0,1.0,1.0


In [48]:
# Keep a reference to the pivot's column MultiIndex and display it.
colsX = df_pivot.columns
colsX

MultiIndex([('rank', 'P01_no_af',       'lasso'),
            ('rank', 'P01_no_af',         'ols'),
            ('rank', 'P01_no_af',     'shap_ml'),
            ('rank', 'P01_no_af', 'shap_tabPFN')],
           names=[None, 'experiment', 'experiment_subtype'])

In [49]:
# Select the pivoted rank columns by their top-level label rather than via a
# previously captured column index. This keeps the cell idempotent: on a
# re-run, the derived "count" / "average_rank" columns already present in
# df_pivot are never folded back into the statistics.
is_rank_col = df_pivot.columns.get_level_values(0) == "rank"
rank_values = df_pivot.loc[:, is_rank_col]

# count: number of methods that kept the feature (non-NaN ranks).
df_pivot["count"] = rank_values.count(axis=1)
# average_rank: mean over the available ranks only (NaN entries are skipped).
df_pivot["average_rank"] = rank_values.mean(axis=1).round(2)

# Features selected by the most methods come first; ties go to the better mean rank.
df_pivot = df_pivot.sort_values(by=["count", "average_rank"], ascending=[False, True])
df_pivot

Unnamed: 0_level_0,rank,rank,rank,rank,count,average_rank
experiment,P01_no_af,P01_no_af,P01_no_af,P01_no_af,Unnamed: 5_level_1,Unnamed: 6_level_1
experiment_subtype,lasso,ols,shap_ml,shap_tabPFN,Unnamed: 5_level_2,Unnamed: 6_level_2
feature,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
position,1.0,1.0,1.0,1.0,4,1.0
R,2.0,2.0,2.0,2.0,4,2.0
W,3.0,3.0,3.0,3.0,4,3.0
D,4.0,,4.0,4.0,3,4.0


In [50]:
# Save the pivot to an Excel workbook; the relative file name resolves
# against the process's current working directory.
df_pivot.to_excel("feature_ranks_pivot_no_af.xlsx")