In [249]:
import itertools
import os
import re
from pathlib import Path

import pandas as pd

In [250]:
# Maximum number of features kept per (phase_type, measure_type, stress_type)
# group; the selection loop further down passes it to `.head()` after sorting
# each group by ascending p_value.
MAX_FEATURES = 2

In [251]:
BASE_DIR = Path.cwd()  # Current working directory of the kernel (not necessarily this file's folder)
ROOT_DIR = (
    BASE_DIR.parent.parent.parent
)  # Three levels above BASE_DIR; adjust as necessary to reach the project root
DATA_DIR = ROOT_DIR / "run1" / "data"  # Folder the Excel inputs are read from

In [252]:
# Load the feature-relevance table. Columns seen in the output below: feature,
# type, p_value, relevant, phase_type, measure_type, stress_type.
rels = pd.read_excel(DATA_DIR / "S03_af_feature_relevances.xlsx")
rels


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052
2,Fx__variation_coefficient,real,1.952901e-07,True,dwell,Fx,stress_value_5052
3,"Fx__fft_coefficient__attr_""abs""__coeff_51",real,5.246075e-07,True,dwell,Fx,stress_value_5052
4,"Fx__fft_coefficient__attr_""angle""__coeff_75",real,1.000460e-06,True,dwell,Fx,stress_value_5052
...,...,...,...,...,...,...,...
18763,Mz__value_count__value_1,constant,,False,weld,Mz,stress_value_center
18764,Mz__value_count__value_-1,constant,,False,weld,Mz,stress_value_center
18765,Mz__ratio_beyond_r_sigma__r_6,constant,,False,weld,Mz,stress_value_center
18766,Mz__ratio_beyond_r_sigma__r_7,constant,,False,weld,Mz,stress_value_center


In [253]:
# Keep only the rows whose `relevant` flag is True, then count how many
# relevant features each (stress, measure, phase) combination has.
relevant_mask = rels["relevant"]
rel = rels.loc[relevant_mask]
print(f"Shape of relevant features: {rel.shape}")

group_keys = ["stress_type", "measure_type", "phase_type"]
rel.groupby(by=group_keys).size()


Shape of relevant features: (717, 7)


stress_type          measure_type  phase_type
stress_value_5052    Fx            dwell         155
                                   weld           71
                     Fy            dwell         159
                                   weld           86
                     Fz            dwell          50
                                   weld            6
                     Mz            dwell          93
                                   weld           90
stress_value_center  Fz            weld            7
dtype: int64

In [254]:
def rename_feature(cur_feature_name: str, phase_type: str) -> str:
    """Insert ``phase_type`` as the second ``__``-separated token of a feature name.

    Args:
        cur_feature_name: Feature name whose tokens are joined by ``__``,
            e.g. ``Fx__variation_coefficient``.
        phase_type: Token to place right after the first token, e.g. ``dwell``.

    Returns:
        The new name, e.g. ``Fx__dwell__variation_coefficient``. A name that
        contains no ``__`` simply gets ``__<phase_type>`` appended.
    """
    # split() always yields at least one token, so `prefix` is always bound.
    prefix, *remainder = cur_feature_name.split("__")
    return "__".join([prefix, phase_type, *remainder])


# cur_feature_name = 'Fx__fft_coefficient__attr_"abs"__coeff_11'
# sp = cur_feature_name.split("__")
# sp.insert(1, "weld")
# new_feature_name = "__".join(sp)
# print(new_feature_name)

In [255]:
# For every (phase, measure, stress) combination, keep the MAX_FEATURES rows
# of `rels` with the smallest p_value and tag each with a phase-qualified name.
phase_types = ["dwell", "weld"]
measure_types = ["Fx", "Fy", "Fz", "Mz"]
stress_types = ["stress_value_5052", "stress_value_6061", "stress_value_center"]
iter_list = list(itertools.product(phase_types, measure_types, stress_types))
df_arr = []
for phase_type, measure_type, stress_type in iter_list:
    group_mask = (
        (rels["phase_type"] == phase_type)
        & (rels["measure_type"] == measure_type)
        & (rels["stress_type"] == stress_type)
    )
    # A dedicated name is used here so the relevant-only frame `rel` built in
    # an earlier cell is not overwritten by this loop.
    group_ranked = rels[group_mask].sort_values(by="p_value", ascending=True)

    # Select top N features. The explicit copy makes the column assignment
    # below act on an independent frame rather than on a slice.
    rel_top = group_ranked.head(MAX_FEATURES).copy()

    # Rename only the rows that are kept instead of every row of the group.
    rel_top["feature_rename"] = rel_top["feature"].apply(
        lambda x: rename_feature(x, phase_type)
    )

    df_arr.append(rel_top)

rels_selected = pd.concat(df_arr, axis=0).reset_index(drop=True)
rels_selected

Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11"
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052,Fx__dwell__ar_coefficient__coeff_0__k_10
2,Fx__quantile__q_0.7,real,0.0005089553,False,dwell,Fx,stress_value_6061,Fx__dwell__quantile__q_0.7
3,"Fx__fft_coefficient__attr_""real""__coeff_62",real,0.0007296259,False,dwell,Fx,stress_value_6061,"Fx__dwell__fft_coefficient__attr_""real""__coeff_62"
4,Fx__partial_autocorrelation__lag_6,real,8.023118e-05,False,dwell,Fx,stress_value_center,Fx__dwell__partial_autocorrelation__lag_6
5,"Fx__change_quantiles__f_agg_""mean""__isabs_True...",real,0.0003438416,False,dwell,Fx,stress_value_center,"Fx__dwell__change_quantiles__f_agg_""mean""__isa..."
6,"Fy__fft_coefficient__attr_""real""__coeff_71",real,2.016897e-08,True,dwell,Fy,stress_value_5052,"Fy__dwell__fft_coefficient__attr_""real""__coeff_71"
7,"Fy__fft_coefficient__attr_""abs""__coeff_59",real,5.597517e-08,True,dwell,Fy,stress_value_5052,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59"
8,Fy__last_location_of_minimum,real,2.455827e-05,False,dwell,Fy,stress_value_6061,Fy__dwell__last_location_of_minimum
9,Fy__first_location_of_minimum,real,2.455827e-05,False,dwell,Fy,stress_value_6061,Fy__dwell__first_location_of_minimum


In [256]:
# A (feature, phase_type) pair can be picked for more than one stress type;
# show any such repeats, then keep only the first occurrence of each pair.
dedup_keys = ["feature", "phase_type"]
filt = rels_selected.duplicated(subset=dedup_keys, keep="first")
display(rels_selected.loc[filt])

rels_selected = rels_selected.drop_duplicates(subset=dedup_keys, keep="first")


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename


In [None]:
# Summary of relevant features selected: number of selected rows whose
# `relevant` flag is True vs. False
display(rels_selected["relevant"].value_counts())

relevant
False    30
True     18
Name: count, dtype: int64

In [None]:
# Only use relevant features: drop the selected rows whose `relevant` flag is False.
# NOTE: this rebinds rels_selected, so outputs displayed by earlier cells show
# the pre-filter state.
rels_selected = rels_selected[rels_selected["relevant"]]

In [258]:
# Discover the per-measure feature workbooks (S01_af_features_<measure>.xlsx)
# in DATA_DIR and record each filename with the measure parsed from it.
# os.listdir returns entries in arbitrary order, so sort for a stable listing.
_filts = sorted(os.listdir(DATA_DIR))
_feature_file_re = re.compile(r"S01_af_features_(\w+)\.xlsx")
files = []
_load = []
for file in _filts:
    # fullmatch is the single filter: a name that does not fit the pattern is
    # skipped instead of raising AttributeError on a failed match.
    found = _feature_file_re.fullmatch(file)
    if found is None:
        continue
    measure = found.group(1)
    files.append(file)
    _load.append(dict(filename=file, measure=measure))
    print(f"Loading features for: {measure}")
load = pd.DataFrame(_load)
load

Loading features for: Fx
Loading features for: Fy
Loading features for: Fz
Loading features for: Mz


Unnamed: 0,filename,measure
0,S01_af_features_Fx.xlsx,Fx
1,S01_af_features_Fy.xlsx,Fy
2,S01_af_features_Fz.xlsx,Fz
3,S01_af_features_Mz.xlsx,Mz


In [259]:
# Load both phase sheets of every feature workbook into
# features_dict[measure][phase], where phase is "dwell" or "weld".
features_dict = {}
sheet_by_phase = {"dwell": "af_dwell", "weld": "af_weld"}
for entry in load.itertuples(index=False):
    measure = entry.measure
    file = entry.filename
    filepath = DATA_DIR / file
    print(f"Loading features for: {measure}, file: {file}")

    # A list-valued sheet_name returns {sheet name: DataFrame}, so each
    # workbook is opened and parsed once rather than once per sheet.
    sheets = pd.read_excel(filepath, sheet_name=list(sheet_by_phase.values()))
    # Store in dictionary
    features_dict[measure] = {
        phase: sheets[sheet] for phase, sheet in sheet_by_phase.items()
    }

Loading features for: Fx, file: S01_af_features_Fx.xlsx
Loading features for: Fy, file: S01_af_features_Fy.xlsx
Loading features for: Fz, file: S01_af_features_Fz.xlsx
Loading features for: Mz, file: S01_af_features_Mz.xlsx


In [260]:
# Pull the value array of every selected feature out of features_dict and key
# it by the phase-qualified name; a repeated name keeps the last array seen.
_selected_features = {
    feature_rename: features_dict[measure_type][phase_type][feature_name].values
    for phase_type, measure_type, feature_name, feature_rename in zip(
        rels_selected["phase_type"],
        rels_selected["measure_type"],
        rels_selected["feature"],
        rels_selected["feature_rename"],
    )
}

# Convert to DataFrame (one column per selected feature)
selected_features = pd.DataFrame(_selected_features)

# Add sample number column as the first column.
# NOTE(review): the feature arrays above are placed positionally, so this
# assumes every sheet lists its rows in the same order as the Fx dwell sheet
# — TODO confirm.
sample_numbers = features_dict["Fx"]["dwell"]["sample_no"]
selected_features.insert(loc=0, column="sample_no", value=sample_numbers)

In [261]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [262]:
def calculate_vif(dfX: pd.DataFrame, threshold=10):
    """Compute variance inflation factors and drop the features above a threshold.

    A constant column is added with ``sm.add_constant`` before the VIF of every
    column (the constant included) is computed. Every feature whose VIF exceeds
    ``threshold`` is dropped in a single pass; the VIFs are not recomputed
    after each removal.

    Args:
        dfX: Feature matrix, one column per feature.
        threshold: Features with a VIF strictly greater than this are dropped.

    Returns:
        A tuple ``(dfX_dropped, vif, drop_list)``:
        ``dfX_dropped`` is ``dfX`` without the dropped columns,
        ``vif`` is a frame with columns ``feature`` and ``VIF`` sorted by VIF
        descending (it includes the ``const`` row), and
        ``drop_list`` is the list of dropped feature names.
    """
    design = sm.add_constant(dfX)
    design_values = design.values

    # One VIF per column of the design matrix, in column order.
    vif_values = [
        variance_inflation_factor(design_values, col_idx)
        for col_idx in range(design.shape[1])
    ]
    vif = pd.DataFrame({"feature": design.columns, "VIF": vif_values})
    vif = vif.sort_values(by="VIF", ascending=False)

    # Names above the threshold; "const" is not a column of dfX, so it is
    # never part of the drop list.
    too_collinear = vif.loc[vif["VIF"] > threshold, "feature"].values.tolist()
    drop_list = [name for name in too_collinear if name != "const"]

    return dfX.drop(columns=drop_list), vif, drop_list

In [263]:
threshold_first = 5
threshold_second = 5
# Every column after the first, i.e. the features without the leading sample_no
dfX = selected_features.iloc[:, 1:]

# Two VIF rounds: the initial calculation, then a second calculation on the
# features that survived the first one.
for round_no, round_threshold in enumerate(
    (threshold_first, threshold_second), start=1
):
    if round_no > 1:
        # Feed the previous round's survivors into this round
        dfX = dfX_dropped.copy()
    print(f"Shape of selected features before VIF: {dfX.shape}")
    dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=round_threshold)
    print(f"Shape of selected features after VIF: {dfX_dropped.shape}")
    display(vif)
    print(drop_list)

Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF: (54, 14)


Unnamed: 0,feature,VIF
0,const,705.940765
18,Mz__weld__energy_ratio_by_chunks__num_segments...,11.134191
17,Mz__weld__quantile__q_0.1,9.93642
15,Fz__weld__ratio_beyond_r_sigma__r_1,5.43327
16,Fz__weld__ratio_beyond_r_sigma__r_2,5.37356
9,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.697028
2,Fx__dwell__ar_coefficient__coeff_0__k_10,3.490421
12,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",3.255932
14,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.899069
1,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.704902


['Mz__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_4', 'Mz__weld__quantile__q_0.1', 'Fz__weld__ratio_beyond_r_sigma__r_1', 'Fz__weld__ratio_beyond_r_sigma__r_2']
Shape of selected features before VIF: (54, 14)
Shape of selected features after VIF: (54, 14)


Unnamed: 0,feature,VIF
0,const,365.719316
9,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.088283
12,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",2.987018
14,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.555362
1,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.44963
7,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.247095
4,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.208328
6,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.204094
5,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.15367
8,"Mz__dwell__augmented_dickey_fuller__attr_""pval...",1.848237


[]


In [264]:
# Attach the relevance metadata to each VIF row by matching the VIF feature
# name against `feature_rename`. The left join leaves unmatched rows (e.g. the
# "const" row) filled with NaN, and dropna() then removes them — along with
# any row that has a NaN in any other column.
vif.merge(rels_selected, left_on="feature", right_on="feature_rename", how="left").dropna()

Unnamed: 0,feature_x,VIF,feature_y,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename
1,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.088283,"Fx__change_quantiles__f_agg_""var""__isabs_True_...",real,7.126935e-10,True,weld,Fx,stress_value_5052,"Fx__weld__change_quantiles__f_agg_""var""__isabs..."
2,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",2.987018,"Fy__change_quantiles__f_agg_""mean""__isabs_True...",real,3.179828e-07,True,weld,Fy,stress_value_5052,"Fy__weld__change_quantiles__f_agg_""mean""__isab..."
3,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.555362,"Fz__agg_linear_trend__attr_""rvalue""__chunk_len...",real,7.167726e-07,True,weld,Fz,stress_value_5052,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu..."
4,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.44963,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11"
5,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.247095,"Mz__augmented_dickey_fuller__attr_""teststat""__...",real,4.459153e-11,True,dwell,Mz,stress_value_5052,"Mz__dwell__augmented_dickey_fuller__attr_""test..."
6,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.208328,"Fy__fft_coefficient__attr_""abs""__coeff_59",real,5.597517e-08,True,dwell,Fy,stress_value_5052,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59"
7,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.204094,"Fz__fft_coefficient__attr_""real""__coeff_4",real,2.398588e-07,True,dwell,Fz,stress_value_5052,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4"
8,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.15367,"Fz__fft_coefficient__attr_""angle""__coeff_4",real,1.840083e-12,True,dwell,Fz,stress_value_5052,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4"
9,"Mz__dwell__augmented_dickey_fuller__attr_""pval...",1.848237,"Mz__augmented_dickey_fuller__attr_""pvalue""__au...",real,4.459153e-11,True,dwell,Mz,stress_value_5052,"Mz__dwell__augmented_dickey_fuller__attr_""pval..."
10,"Fx__weld__fft_coefficient__attr_""abs""__coeff_5",1.734066,"Fx__fft_coefficient__attr_""abs""__coeff_5",real,2.20046e-09,True,weld,Fx,stress_value_5052,"Fx__weld__fft_coefficient__attr_""abs""__coeff_5"


In [265]:
# Load location info
# locs = pd.read_excel(DATA_DIR / "S04_loc_values.xlsx").rename(
#     columns={
#         "Location": "location",
#         "Fx": "Fx_location",
#         "Fy": "Fy_location",
#         "Fz": "Fz_location",
#         "Mz": "Mz_location",
#     }
# )
# locs = locs.drop(columns=["loc_idx", "loc_time"])
# locs

In [266]:
# Load combined data
# combined1 = pd.read_excel(DATA_DIR / "S02_data_exp.xlsx")
# display(combined1)

In [267]:
# combined2 = combined1.merge(locs, on=["sample_no", "location"])
# combined2

In [268]:
# combined3 = combined2.merge(selected_features, on=["sample_no"])
# combined3

In [269]:
# colsY = ["stress_value_5052", "stress_value_6061", "stress_value_center"]
# colsX = [c for c in combined3.columns if c not in colsY]
# combined4 = combined3[colsX + colsY]
# combined4

In [270]:
# rels_selected.to_excel("S01_af_feature_list.xlsx", index=False)
# combined4.to_excel("S01_combined_data.xlsx", index=False)