In [11]:
import itertools
import os
import pickle
import re
from pathlib import Path

import pandas as pd


In [12]:
# Load the feature relevance table (one row per feature, with its p-value and
# relevance flag) from the workbook in the current working directory.
RELEVANCE_XLSX = "S03_af_feature_relevance.xlsx"
relevances = pd.read_excel(RELEVANCE_XLSX)


In [13]:
# Show the loaded table as the cell output to inspect its columns and values.
relevances

Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052
2,Fx__variation_coefficient,real,1.952901e-07,True,dwell,Fx,stress_value_5052
3,"Fx__fft_coefficient__attr_""abs""__coeff_51",real,5.246075e-07,True,dwell,Fx,stress_value_5052
4,"Fx__fft_coefficient__attr_""angle""__coeff_75",real,1.000460e-06,True,dwell,Fx,stress_value_5052
...,...,...,...,...,...,...,...
18763,Mz__value_count__value_1,constant,,False,weld,Mz,stress_value_center
18764,Mz__value_count__value_-1,constant,,False,weld,Mz,stress_value_center
18765,Mz__ratio_beyond_r_sigma__r_6,constant,,False,weld,Mz,stress_value_center
18766,Mz__ratio_beyond_r_sigma__r_7,constant,,False,weld,Mz,stress_value_center


In [14]:
# Keep only the rows whose `relevant` flag is set, print how many there are,
# and count them per (stress_type, measure_type, phase_type) combination.
is_relevant = relevances["relevant"]
rel = relevances.loc[is_relevant]
print(rel.shape)
rel.groupby(["stress_type", "measure_type", "phase_type"]).count()


(717, 7)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,feature,type,p_value,relevant
stress_type,measure_type,phase_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
stress_value_5052,Fx,dwell,155,155,155,155
stress_value_5052,Fx,weld,71,71,71,71
stress_value_5052,Fy,dwell,159,159,159,159
stress_value_5052,Fy,weld,86,86,86,86
stress_value_5052,Fz,dwell,50,50,50,50
stress_value_5052,Fz,weld,6,6,6,6
stress_value_5052,Mz,dwell,93,93,93,93
stress_value_5052,Mz,weld,90,90,90,90
stress_value_center,Fz,weld,7,7,7,7


In [15]:
def rename_feature(cur_feature_name: str, phase_type: str) -> str:
    """Insert ``phase_type`` as the second ``__``-separated part of a feature name.

    The text before the first ``"__"`` stays in front, ``phase_type`` follows
    it, and everything after the first ``"__"`` is kept unchanged behind it.
    A name that contains no ``"__"`` simply gets ``"__" + phase_type`` appended.

    Args:
        cur_feature_name: Feature name whose parts are joined by ``"__"``.
        phase_type: Label to insert after the first part.

    Returns:
        The feature name with ``phase_type`` inserted.
    """
    # Splitting once is enough: only the position after the first part matters.
    prefix, *remainder = cur_feature_name.split("__", 1)
    return "__".join([prefix, phase_type, *remainder])


# Example:
#   rename_feature('Fx__fft_coefficient__attr_"abs"__coeff_11', "weld")
#   returns 'Fx__weld__fft_coefficient__attr_"abs"__coeff_11'

In [16]:
# Maximum number of features to keep per (phase, measure, stress) group.
MAX_FEATURES = 3

phase_types = ["dwell", "weld"]
measure_types = ["Fx", "Fy", "Fz", "Mz"]
stress_types = ["stress_value_5052", "stress_value_6061", "stress_value_center"]

# Visit the groups in a fixed order (phase, then measure, then stress target)
# so the concatenated table keeps that grouping.
iter_list = list(itertools.product(phase_types, measure_types, stress_types))
df_arr = []
for phase_type, measure_type, stress_type in iter_list:
    in_group = (
        (relevances["phase_type"] == phase_type)
        & (relevances["measure_type"] == measure_type)
        & (relevances["stress_type"] == stress_type)
    )

    # Smallest p-values first; sort_values puts NaN p-values last by default.
    # NOTE(review): the `relevant` flag is not consulted here, so rows with
    # relevant == False can end up in the selection - confirm this is intended.
    group_top = (
        relevances[in_group]
        .sort_values(by="p_value", ascending=True)
        .head(MAX_FEATURES)
    )
    df_arr.append(group_top)

relevances_selected = pd.concat(df_arr, axis=0).reset_index(drop=True)

# Build the phase-qualified names once, on the selected rows only, instead of
# renaming every row of every group before truncating. Each row's own
# `phase_type` equals the phase of the group it was selected from.
relevances_selected["feature_rename"] = [
    rename_feature(feature, phase)
    for feature, phase in zip(
        relevances_selected["feature"], relevances_selected["phase_type"]
    )
]
relevances_selected

Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11"
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052,Fx__dwell__ar_coefficient__coeff_0__k_10
2,Fx__variation_coefficient,real,1.952901e-07,True,dwell,Fx,stress_value_5052,Fx__dwell__variation_coefficient
3,Fx__quantile__q_0.7,real,5.089553e-04,False,dwell,Fx,stress_value_6061,Fx__dwell__quantile__q_0.7
4,"Fx__fft_coefficient__attr_""real""__coeff_62",real,7.296259e-04,False,dwell,Fx,stress_value_6061,"Fx__dwell__fft_coefficient__attr_""real""__coeff_62"
...,...,...,...,...,...,...,...,...
67,Mz__last_location_of_maximum,real,7.449105e-04,False,weld,Mz,stress_value_6061,Mz__weld__last_location_of_maximum
68,Mz__first_location_of_maximum,real,7.449105e-04,False,weld,Mz,stress_value_6061,Mz__weld__first_location_of_maximum
69,"Mz__fft_coefficient__attr_""abs""__coeff_58",real,5.592435e-05,False,weld,Mz,stress_value_center,"Mz__weld__fft_coefficient__attr_""abs""__coeff_58"
70,"Mz__fft_coefficient__attr_""abs""__coeff_83",real,1.120336e-04,False,weld,Mz,stress_value_center,"Mz__weld__fft_coefficient__attr_""abs""__coeff_83"


In [17]:
# Flag every row that repeats an earlier (feature, phase_type) pair and show
# the flagged rows; an empty table means the selection has no duplicates.
duplicate_keys = ["feature", "phase_type"]
filt = relevances_selected.duplicated(subset=duplicate_keys)
relevances_selected.loc[filt]


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename


In [18]:
# Keep the first occurrence of every (feature, phase_type) pair.
# drop_duplicates derives the mask from the frame itself, so this cell does not
# rely on the `filt` variable left behind by another cell and can be re-run
# without a stale or misaligned mask.
relevances_selected = relevances_selected.drop_duplicates(
    subset=["feature", "phase_type"]
)

In [19]:
# Count how many of the selected rows are flagged relevant (True) versus not (False).
relevances_selected["relevant"].value_counts()

relevant
False    45
True     27
Name: count, dtype: int64

In [20]:
# Write the selected features to a workbook in the current working directory;
# index=False leaves the DataFrame index out of the sheet.
SELECTION_XLSX = "S04_af_feature_selection.xlsx"
relevances_selected.to_excel(SELECTION_XLSX, index=False)