In [120]:
import itertools
import os
import re
from pathlib import Path

import pandas as pd

In [121]:
# Set maximum number of features to select per group
MAX_FEATURES = 2

In [122]:
BASE_DIR = Path.cwd()  # Current directory of the running file
ROOT_DIR = (
    BASE_DIR.parent.parent.parent
)  # Adjust as necessary to reach the project root
DATA_DIR = ROOT_DIR / "run1" / "data"

In [123]:
rels = pd.read_excel(DATA_DIR / "S03_af_feature_relevances.xlsx")
rels


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052
2,Fx__variation_coefficient,real,1.952901e-07,True,dwell,Fx,stress_value_5052
3,"Fx__fft_coefficient__attr_""abs""__coeff_51",real,5.246075e-07,True,dwell,Fx,stress_value_5052
4,"Fx__fft_coefficient__attr_""angle""__coeff_75",real,1.000460e-06,True,dwell,Fx,stress_value_5052
...,...,...,...,...,...,...,...
18763,Mz__value_count__value_1,constant,,False,weld,Mz,stress_value_center
18764,Mz__value_count__value_-1,constant,,False,weld,Mz,stress_value_center
18765,Mz__ratio_beyond_r_sigma__r_6,constant,,False,weld,Mz,stress_value_center
18766,Mz__ratio_beyond_r_sigma__r_7,constant,,False,weld,Mz,stress_value_center


In [124]:
# Filter to only relevant features
rel = rels[rels["relevant"]]
print(f"Shape of relevant features: {rel.shape}")
rel.groupby(by=["stress_type", "measure_type", "phase_type"]).size()


Shape of relevant features: (717, 7)


stress_type          measure_type  phase_type
stress_value_5052    Fx            dwell         155
                                   weld           71
                     Fy            dwell         159
                                   weld           86
                     Fz            dwell          50
                                   weld            6
                     Mz            dwell          93
                                   weld           90
stress_value_center  Fz            weld            7
dtype: int64

In [125]:
def rename_feature(cur_feature_name: str, phase_type: str) -> str:
    sp = cur_feature_name.split("__")
    sp.insert(1, phase_type)
    new_feature_name = "__".join(sp)
    return new_feature_name


# cur_feature_name = 'Fx__fft_coefficient__attr_"abs"__coeff_11'
# sp = cur_feature_name.split("__")
# sp.insert(1, "weld")
# new_feature_name = "__".join(sp)
# print(new_feature_name)

In [126]:
phase_types = ["dwell", "weld"]
measure_types = ["Fx", "Fy", "Fz", "Mz"]
stress_types = ["stress_value_5052", "stress_value_6061", "stress_value_center"]
iter_list = list(itertools.product(phase_types, measure_types, stress_types))
df_arr = []
for phase_type, measure_type, stress_type in iter_list:
    filt1 = rels["phase_type"] == phase_type
    filt2 = rels["measure_type"] == measure_type
    filt3 = rels["stress_type"] == stress_type
    rel = rels[filt1 & filt2 & filt3].sort_values(by="p_value", ascending=True)

    rel["feature_rename"] = rel["feature"].apply(
        lambda x: rename_feature(x, phase_type)
    )

    # Select top N features
    rel_top = rel.head(MAX_FEATURES)

    df_arr.append(rel_top)

rels_selected = pd.concat(df_arr, axis=0).reset_index(drop=True)
rels_selected

Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11"
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052,Fx__dwell__ar_coefficient__coeff_0__k_10
2,Fx__quantile__q_0.7,real,0.0005089553,False,dwell,Fx,stress_value_6061,Fx__dwell__quantile__q_0.7
3,"Fx__fft_coefficient__attr_""real""__coeff_62",real,0.0007296259,False,dwell,Fx,stress_value_6061,"Fx__dwell__fft_coefficient__attr_""real""__coeff_62"
4,Fx__partial_autocorrelation__lag_6,real,8.023118e-05,False,dwell,Fx,stress_value_center,Fx__dwell__partial_autocorrelation__lag_6
5,"Fx__change_quantiles__f_agg_""mean""__isabs_True...",real,0.0003438416,False,dwell,Fx,stress_value_center,"Fx__dwell__change_quantiles__f_agg_""mean""__isa..."
6,"Fy__fft_coefficient__attr_""real""__coeff_71",real,2.016897e-08,True,dwell,Fy,stress_value_5052,"Fy__dwell__fft_coefficient__attr_""real""__coeff_71"
7,"Fy__fft_coefficient__attr_""abs""__coeff_59",real,5.597517e-08,True,dwell,Fy,stress_value_5052,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59"
8,Fy__last_location_of_minimum,real,2.455827e-05,False,dwell,Fy,stress_value_6061,Fy__dwell__last_location_of_minimum
9,Fy__first_location_of_minimum,real,2.455827e-05,False,dwell,Fy,stress_value_6061,Fy__dwell__first_location_of_minimum


In [127]:
# Check for duplicates
filt = rels_selected.duplicated(subset=["feature", "phase_type"])
display(rels_selected[filt])

# Remove duplicates if any
rels_selected = rels_selected[~filt]


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename


In [128]:
# Summary of relevant features selected
display(rels_selected["relevant"].value_counts())

relevant
False    30
True     18
Name: count, dtype: int64

### Only use relevant features. This is different from previous feature selection


In [129]:
rels_selected = rels_selected[rels_selected["relevant"]]

In [130]:
# Load features
_filts = os.listdir(DATA_DIR)
files = [f for f in _filts if f.startswith("S01_af_features_") and f.endswith(".xlsx")]
_load = []
for file in files:
    measure = re.match(r"S01_af_features_(\w+)\.xlsx", file).group(1)
    _load.append(dict(filename=file, measure=measure))
    print(f"Loading features for: {measure}")
load = pd.DataFrame(_load)
load

Loading features for: Fx
Loading features for: Fy
Loading features for: Fz
Loading features for: Mz


Unnamed: 0,filename,measure
0,S01_af_features_Fx.xlsx,Fx
1,S01_af_features_Fy.xlsx,Fy
2,S01_af_features_Fz.xlsx,Fz
3,S01_af_features_Mz.xlsx,Mz


In [131]:
# Load features
features_dict = {}
for idx, row in load.iterrows():
    measure = row["measure"]
    file = row["filename"]
    filepath = DATA_DIR / file
    print(f"Loading features for: {measure}, file: {file}")

    ext_dwell = pd.read_excel(
        filepath,
        sheet_name="af_dwell",
    )
    ext_weld = pd.read_excel(filepath, sheet_name="af_weld")
    # Store in dictionary
    data = {
        "dwell": ext_dwell,
        "weld": ext_weld,
    }
    features_dict[measure] = data

Loading features for: Fx, file: S01_af_features_Fx.xlsx
Loading features for: Fy, file: S01_af_features_Fy.xlsx
Loading features for: Fz, file: S01_af_features_Fz.xlsx
Loading features for: Mz, file: S01_af_features_Mz.xlsx


In [132]:
_selected_features = {}
for idx, row in rels_selected.iterrows():
    phase_type = row["phase_type"]
    measure_type = row["measure_type"]
    stress_type = row["stress_type"]
    feature_name = row["feature"]
    feature_rename = row["feature_rename"]
    # print(
    #     f"Phase: {phase_type}, Measure: {measure_type}, Stress: {stress_type}, Feature: {feature_name}"
    # )
    # Extract feature values
    features_value = features_dict[measure_type][phase_type][feature_name].values
    # Store in selected features dictionary
    _selected_features[feature_rename] = features_value

# Convert to DataFrame
selected_features = pd.DataFrame(_selected_features)

# Add sample number column
selected_features.insert(
    loc=0, column="sample_no", value=features_dict["Fx"]["dwell"]["sample_no"]
)

In [133]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [134]:
def calculate_vif(dfX: pd.DataFrame, threshold=10):
    # Add constant
    dfX_const = sm.add_constant(dfX)
    # Calculate VIF for each feature
    vif = pd.DataFrame()
    vif["feature_rename"] = dfX_const.columns
    vif["VIF"] = [
        variance_inflation_factor(dfX_const.values, i)
        for i in range(dfX_const.shape[1])
    ]
    # Sort by VIF descending
    vif = vif.sort_values(by="VIF", ascending=False)
    # Remove "const"
    vif = vif[vif["feature_rename"] != "const"]
    # Identify features to drop
    vif_to_drop = vif[vif["VIF"] > threshold]
    # Create drop list
    drop_list = vif_to_drop["feature_rename"].values.tolist()
    # Drop features from original DataFrame
    dfX_dropped = dfX.drop(columns=drop_list)
    return dfX_dropped, vif, drop_list

### Prototype VIF


In [135]:
threshold_vif = 9
target_max_vif = 5

dfX = selected_features.iloc[:, 1:]

# Initial VIF calculation
print(f"Shape of selected features before VIF: {dfX.shape}")
dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=threshold_vif)
print(f"Shape of selected features after VIF: {dfX_dropped.shape}")
display(vif)
print(drop_list)

# Second VIF calculation after dropping high VIF features
dfX = dfX_dropped.copy()
dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=threshold_vif)
print(f"Shape of selected features after VIF: {dfX_dropped.shape}")
actual_max_vif = vif["VIF"].max()
display(vif)
print(actual_max_vif)

Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF: (54, 16)


Unnamed: 0,feature_rename,VIF
18,Mz__weld__energy_ratio_by_chunks__num_segments...,11.134191
17,Mz__weld__quantile__q_0.1,9.93642
15,Fz__weld__ratio_beyond_r_sigma__r_1,5.43327
16,Fz__weld__ratio_beyond_r_sigma__r_2,5.37356
9,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.697028
2,Fx__dwell__ar_coefficient__coeff_0__k_10,3.490421
12,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",3.255932
14,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.899069
1,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.704902
5,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.582828


['Mz__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_4', 'Mz__weld__quantile__q_0.1']
Shape of selected features after VIF: (54, 16)


Unnamed: 0,feature_rename,VIF
15,Fz__weld__ratio_beyond_r_sigma__r_1,5.397359
16,Fz__weld__ratio_beyond_r_sigma__r_2,5.349838
9,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.185366
12,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",3.071974
14,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.768545
1,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.679633
5,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.56332
4,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.348972
6,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.300315
7,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.293884


5.397359012172896


In [136]:
import numpy as np

threshold_vif = 20
target_max_vif = 5
actual_max_vif = np.inf

while actual_max_vif > target_max_vif:
    threshold_vif -= 1
    dfX = selected_features.iloc[:, 1:]

    # Initial VIF calculation
    print(f"Shape of selected features before VIF: {dfX.shape}")
    dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=threshold_vif)
    print(f"Shape of selected features after VIF(1): {dfX_dropped.shape}")

    # Second VIF calculation after dropping high VIF features
    dfX = dfX_dropped.copy()
    dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=threshold_vif)
    print(f"Shape of selected features after VIF(2): {dfX_dropped.shape}")
    actual_max_vif = vif["VIF"].max()
    print(f"Current max VIF: {actual_max_vif} at threshold: {threshold_vif}")

    print("-----")


Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF(1): (54, 18)
Shape of selected features after VIF(2): (54, 18)
Current max VIF: 11.134190920720416 at threshold: 19
-----
Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF(1): (54, 18)
Shape of selected features after VIF(2): (54, 18)
Current max VIF: 11.134190920720416 at threshold: 18
-----
Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF(1): (54, 18)
Shape of selected features after VIF(2): (54, 18)
Current max VIF: 11.134190920720416 at threshold: 17
-----
Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF(1): (54, 18)
Shape of selected features after VIF(2): (54, 18)
Current max VIF: 11.134190920720416 at threshold: 16
-----
Shape of selected features before VIF: (54, 18)
Shape of selected features after VIF(1): (54, 18)
Shape of selected features after VIF(2): (54, 18)
Current max VIF: 11.1341

In [137]:
display(vif)

Unnamed: 0,feature_rename,VIF
9,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.088283
12,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",2.987018
14,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.555362
1,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.44963
7,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.247095
4,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.208328
6,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.204094
5,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.15367
8,"Mz__dwell__augmented_dickey_fuller__attr_""pval...",1.848237
10,"Fx__weld__fft_coefficient__attr_""abs""__coeff_5",1.734066


In [138]:
rels_selected_vif = vif.merge(
    rels_selected,
    left_on="feature_rename",
    right_on="feature_rename",
    how="left",
    suffixes=("", "_y"),
)
rels_selected_vif = rels_selected_vif.drop(
    columns=[c for c in vif.columns if c.endswith("_y")]
)
display(rels_selected_vif)

Unnamed: 0,feature_rename,VIF,feature,type,p_value,relevant,phase_type,measure_type,stress_type
0,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.088283,"Fx__change_quantiles__f_agg_""var""__isabs_True_...",real,7.126935e-10,True,weld,Fx,stress_value_5052
1,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",2.987018,"Fy__change_quantiles__f_agg_""mean""__isabs_True...",real,3.179828e-07,True,weld,Fy,stress_value_5052
2,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.555362,"Fz__agg_linear_trend__attr_""rvalue""__chunk_len...",real,7.167726e-07,True,weld,Fz,stress_value_5052
3,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.44963,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052
4,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.247095,"Mz__augmented_dickey_fuller__attr_""teststat""__...",real,4.459153e-11,True,dwell,Mz,stress_value_5052
5,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.208328,"Fy__fft_coefficient__attr_""abs""__coeff_59",real,5.597517e-08,True,dwell,Fy,stress_value_5052
6,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.204094,"Fz__fft_coefficient__attr_""real""__coeff_4",real,2.398588e-07,True,dwell,Fz,stress_value_5052
7,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.15367,"Fz__fft_coefficient__attr_""angle""__coeff_4",real,1.840083e-12,True,dwell,Fz,stress_value_5052
8,"Mz__dwell__augmented_dickey_fuller__attr_""pval...",1.848237,"Mz__augmented_dickey_fuller__attr_""pvalue""__au...",real,4.459153e-11,True,dwell,Mz,stress_value_5052
9,"Fx__weld__fft_coefficient__attr_""abs""__coeff_5",1.734066,"Fx__fft_coefficient__attr_""abs""__coeff_5",real,2.20046e-09,True,weld,Fx,stress_value_5052


In [139]:
cols = ["sample_no"] + rels_selected_vif["feature_rename"].tolist()
selected_features_vif = selected_features[cols]
print(f"Shape of selected features before VIF final: {selected_features.shape}")
print(f"Shape of selected features after VIF final: {selected_features_vif.shape}")

Shape of selected features before VIF final: (54, 19)
Shape of selected features after VIF final: (54, 15)


In [140]:
# Load location info
locs = pd.read_excel(DATA_DIR / "S04_loc_values.xlsx").rename(
    columns={
        "Location": "location",
        "Fx": "Fx_location",
        "Fy": "Fy_location",
        "Fz": "Fz_location",
        "Mz": "Mz_location",
    }
)
locs = locs.drop(columns=["loc_idx", "loc_time"])
locs

Unnamed: 0,sample_no,location,Fx_location,Fy_location,Fz_location,Mz_location
0,1,1,-0.077671,0.143026,1.244326,1.873865
1,1,2,-0.027450,0.207481,1.625039,4.615337
2,1,3,-0.038704,0.272508,1.799653,5.209334
3,1,4,-0.023484,0.278517,1.882699,5.764727
4,1,5,-0.038251,0.280276,1.955412,6.097497
...,...,...,...,...,...,...
373,54,3,-0.079764,0.196450,1.352393,5.512565
374,54,4,-0.091477,0.259717,1.529089,6.898179
375,54,5,-0.068128,0.241906,1.657737,7.940597
376,54,6,-0.066880,0.261006,1.712860,8.849661


In [141]:
# Load combined data
combined1 = pd.read_excel(DATA_DIR / "S02_data_exp.xlsx")
display(combined1)

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0
...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0


In [142]:
combined2 = combined1.merge(locs, on=["sample_no", "location"])
combined2

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center,Fx_location,Fy_location,Fz_location,Mz_location
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0,-0.077671,0.143026,1.244326,1.873865
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0,-0.133276,0.164254,1.203367,-1.054677
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0,-0.059639,0.269418,1.444542,2.940728
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0,-0.051020,0.211907,1.601667,3.661974
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0,-0.100744,0.179582,1.095031,-0.016799
...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0,-0.084714,0.284958,1.905742,8.950274
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0,-0.094956,0.257101,1.669120,9.627879
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0,-0.203323,0.173404,1.671576,4.696642
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0,-0.099644,0.266207,1.686495,10.497974


In [143]:
combined3 = combined2.merge(selected_features, on=["sample_no"])
combined3

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center,Fx_location,...,"Fx__weld__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","Fx__weld__fft_coefficient__attr_""abs""__coeff_5",Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5,"Fy__weld__change_quantiles__f_agg_""mean""__isabs_True__qh_0.4__ql_0.0","Fz__weld__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","Fz__weld__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""",Fz__weld__ratio_beyond_r_sigma__r_1,Fz__weld__ratio_beyond_r_sigma__r_2,Mz__weld__quantile__q_0.1,Mz__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_4
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0,-0.077671,...,0.006729,60.230153,0.113272,0.142512,0.000409,-0.446735,0.278363,0.063841,-6.949290,0.034026
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0,-0.133276,...,0.008800,64.371976,0.110881,0.158888,0.000723,-0.401224,0.204523,0.086839,-5.712890,0.021547
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0,-0.059639,...,0.006253,88.143882,0.107506,0.149730,0.000582,-0.611343,0.172679,0.076763,-5.951800,0.024692
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0,-0.051020,...,0.008120,32.083086,0.108980,0.140542,-0.000489,-0.472874,0.146626,0.065057,-4.033597,0.032070
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0,-0.100744,...,0.008490,56.841945,0.111608,0.116390,0.001675,-0.344166,0.284189,0.049533,-7.551309,0.018795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0,-0.084714,...,0.005643,83.967538,0.108628,0.144391,0.000259,-0.350354,0.352746,0.051059,-6.994062,0.024102
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0,-0.094956,...,0.006474,8.788741,0.106932,0.128832,0.000801,-0.370529,0.367193,0.024856,-8.050406,0.019972
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0,-0.203323,...,0.005295,41.193473,0.089559,0.142455,0.000469,-0.460999,0.314019,0.047995,-18.062500,0.024198
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0,-0.099644,...,0.003532,27.703044,0.103169,0.118852,-0.000155,-0.226563,0.231258,0.072198,-4.259820,0.027400


In [144]:
combined3

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center,Fx_location,...,"Fx__weld__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","Fx__weld__fft_coefficient__attr_""abs""__coeff_5",Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5,"Fy__weld__change_quantiles__f_agg_""mean""__isabs_True__qh_0.4__ql_0.0","Fz__weld__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","Fz__weld__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""",Fz__weld__ratio_beyond_r_sigma__r_1,Fz__weld__ratio_beyond_r_sigma__r_2,Mz__weld__quantile__q_0.1,Mz__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_4
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0,-0.077671,...,0.006729,60.230153,0.113272,0.142512,0.000409,-0.446735,0.278363,0.063841,-6.949290,0.034026
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0,-0.133276,...,0.008800,64.371976,0.110881,0.158888,0.000723,-0.401224,0.204523,0.086839,-5.712890,0.021547
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0,-0.059639,...,0.006253,88.143882,0.107506,0.149730,0.000582,-0.611343,0.172679,0.076763,-5.951800,0.024692
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0,-0.051020,...,0.008120,32.083086,0.108980,0.140542,-0.000489,-0.472874,0.146626,0.065057,-4.033597,0.032070
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0,-0.100744,...,0.008490,56.841945,0.111608,0.116390,0.001675,-0.344166,0.284189,0.049533,-7.551309,0.018795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0,-0.084714,...,0.005643,83.967538,0.108628,0.144391,0.000259,-0.350354,0.352746,0.051059,-6.994062,0.024102
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0,-0.094956,...,0.006474,8.788741,0.106932,0.128832,0.000801,-0.370529,0.367193,0.024856,-8.050406,0.019972
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0,-0.203323,...,0.005295,41.193473,0.089559,0.142455,0.000469,-0.460999,0.314019,0.047995,-18.062500,0.024198
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0,-0.099644,...,0.003532,27.703044,0.103169,0.118852,-0.000155,-0.226563,0.231258,0.072198,-4.259820,0.027400


In [145]:
colsG = ["sample_no", "location"]
colsY = ["stress_value_5052", "stress_value_6061", "stress_value_center"]
combined3X = combined3.drop(columns=colsG + colsY)
combined3X

Unnamed: 0,position,R,W,D,Fx_location,Fy_location,Fz_location,Mz_location,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",Fx__dwell__ar_coefficient__coeff_0__k_10,...,"Fx__weld__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","Fx__weld__fft_coefficient__attr_""abs""__coeff_5",Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5,"Fy__weld__change_quantiles__f_agg_""mean""__isabs_True__qh_0.4__ql_0.0","Fz__weld__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","Fz__weld__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""",Fz__weld__ratio_beyond_r_sigma__r_1,Fz__weld__ratio_beyond_r_sigma__r_2,Mz__weld__quantile__q_0.1,Mz__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_4
0,0.153846,1400,60,10,-0.077671,0.143026,1.244326,1.873865,1.910672,0.007489,...,0.006729,60.230153,0.113272,0.142512,0.000409,-0.446735,0.278363,0.063841,-6.949290,0.034026
1,0.153846,1400,60,15,-0.133276,0.164254,1.203367,-1.054677,1.497020,0.013134,...,0.008800,64.371976,0.110881,0.158888,0.000723,-0.401224,0.204523,0.086839,-5.712890,0.021547
2,0.153846,1400,60,20,-0.059639,0.269418,1.444542,2.940728,0.999252,0.009040,...,0.006253,88.143882,0.107506,0.149730,0.000582,-0.611343,0.172679,0.076763,-5.951800,0.024692
3,0.153846,1400,70,10,-0.051020,0.211907,1.601667,3.661974,0.905469,0.005442,...,0.008120,32.083086,0.108980,0.140542,-0.000489,-0.472874,0.146626,0.065057,-4.033597,0.032070
4,0.153846,1400,70,15,-0.100744,0.179582,1.095031,-0.016799,8.439772,0.005019,...,0.008490,56.841945,0.111608,0.116390,0.001675,-0.344166,0.284189,0.049533,-7.551309,0.018795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,0.846154,1600,70,15,-0.084714,0.284958,1.905742,8.950274,3.389169,0.004357,...,0.005643,83.967538,0.108628,0.144391,0.000259,-0.350354,0.352746,0.051059,-6.994062,0.024102
374,0.846154,1600,70,20,-0.094956,0.257101,1.669120,9.627879,1.694796,0.001533,...,0.006474,8.788741,0.106932,0.128832,0.000801,-0.370529,0.367193,0.024856,-8.050406,0.019972
375,0.846154,1600,80,10,-0.203323,0.173404,1.671576,4.696642,1.366042,0.001576,...,0.005295,41.193473,0.089559,0.142455,0.000469,-0.460999,0.314019,0.047995,-18.062500,0.024198
376,0.846154,1600,80,15,-0.099644,0.266207,1.686495,10.497974,0.781104,-0.004785,...,0.003532,27.703044,0.103169,0.118852,-0.000155,-0.226563,0.231258,0.072198,-4.259820,0.027400


In [146]:
# Test VIF on combined data

threshold_vif = 20
target_max_vif = 5
actual_max_vif = np.inf

while actual_max_vif > target_max_vif:
    threshold_vif -= 1
    dfX = combined3X.copy()

    # Initial VIF calculation
    print(f"Shape of selected features before VIF: {dfX.shape}")
    dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=threshold_vif)
    print(f"Shape of selected features after VIF(1): {dfX_dropped.shape}")

    # Second VIF calculation after dropping high VIF features
    dfX = dfX_dropped.copy()
    dfX_dropped, vif, drop_list = calculate_vif(dfX, threshold=threshold_vif)
    print(f"Shape of selected features after VIF(2): {dfX_dropped.shape}")
    actual_max_vif = vif["VIF"].max()
    print(f"Current max VIF: {actual_max_vif} at threshold: {threshold_vif}")

    print("-----")

Shape of selected features before VIF: (378, 26)
Shape of selected features after VIF(1): (378, 23)
Shape of selected features after VIF(2): (378, 23)
Current max VIF: 6.22064377015975 at threshold: 19
-----
Shape of selected features before VIF: (378, 26)
Shape of selected features after VIF(1): (378, 23)
Shape of selected features after VIF(2): (378, 23)
Current max VIF: 6.22064377015975 at threshold: 18
-----
Shape of selected features before VIF: (378, 26)
Shape of selected features after VIF(1): (378, 22)
Shape of selected features after VIF(2): (378, 22)
Current max VIF: 6.190489138337215 at threshold: 17
-----
Shape of selected features before VIF: (378, 26)
Shape of selected features after VIF(1): (378, 22)
Shape of selected features after VIF(2): (378, 22)
Current max VIF: 6.190489138337215 at threshold: 16
-----
Shape of selected features before VIF: (378, 26)
Shape of selected features after VIF(1): (378, 22)
Shape of selected features after VIF(2): (378, 22)
Current max VIF

In [147]:
vif

Unnamed: 0,feature_rename,VIF
15,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",4.270136
6,Fz_location,3.33189
18,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",3.288097
20,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",3.287007
12,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.925117
11,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.852854
7,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.808927
13,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.704683
10,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.376432
1,position,2.188598


In [None]:
rels_selected_vif_2 = vif.merge(
    rels_selected,
    left_on="feature_rename",
    right_on="feature_rename",
    how="left",
    suffixes=("", "_y"),
)
rels_selected_vif_2 = rels_selected_vif_2.drop(
    columns=[c for c in vif.columns if c.endswith("_y")]
)
display(rels_selected_vif_2)

Unnamed: 0,feature_rename,VIF,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_y,type_y,p_value_y,relevant_y,phase_type_y,measure_type_y,stress_type_y
0,"Fx__weld__change_quantiles__f_agg_""var""__isabs...",3.088283,"Fx__change_quantiles__f_agg_""var""__isabs_True_...",real,7.126935e-10,True,weld,Fx,stress_value_5052,"Fx__change_quantiles__f_agg_""var""__isabs_True_...",real,7.126935e-10,True,weld,Fx,stress_value_5052
1,"Fy__weld__change_quantiles__f_agg_""mean""__isab...",2.987018,"Fy__change_quantiles__f_agg_""mean""__isabs_True...",real,3.179828e-07,True,weld,Fy,stress_value_5052,"Fy__change_quantiles__f_agg_""mean""__isabs_True...",real,3.179828e-07,True,weld,Fy,stress_value_5052
2,"Fz__weld__agg_linear_trend__attr_""rvalue""__chu...",2.555362,"Fz__agg_linear_trend__attr_""rvalue""__chunk_len...",real,7.167726e-07,True,weld,Fz,stress_value_5052,"Fz__agg_linear_trend__attr_""rvalue""__chunk_len...",real,7.167726e-07,True,weld,Fz,stress_value_5052
3,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11",2.44963,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052
4,"Mz__dwell__augmented_dickey_fuller__attr_""test...",2.247095,"Mz__augmented_dickey_fuller__attr_""teststat""__...",real,4.459153e-11,True,dwell,Mz,stress_value_5052,"Mz__augmented_dickey_fuller__attr_""teststat""__...",real,4.459153e-11,True,dwell,Mz,stress_value_5052
5,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59",2.208328,"Fy__fft_coefficient__attr_""abs""__coeff_59",real,5.597517e-08,True,dwell,Fy,stress_value_5052,"Fy__fft_coefficient__attr_""abs""__coeff_59",real,5.597517e-08,True,dwell,Fy,stress_value_5052
6,"Fz__dwell__fft_coefficient__attr_""real""__coeff_4",2.204094,"Fz__fft_coefficient__attr_""real""__coeff_4",real,2.398588e-07,True,dwell,Fz,stress_value_5052,"Fz__fft_coefficient__attr_""real""__coeff_4",real,2.398588e-07,True,dwell,Fz,stress_value_5052
7,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4",2.15367,"Fz__fft_coefficient__attr_""angle""__coeff_4",real,1.840083e-12,True,dwell,Fz,stress_value_5052,"Fz__fft_coefficient__attr_""angle""__coeff_4",real,1.840083e-12,True,dwell,Fz,stress_value_5052
8,"Mz__dwell__augmented_dickey_fuller__attr_""pval...",1.848237,"Mz__augmented_dickey_fuller__attr_""pvalue""__au...",real,4.459153e-11,True,dwell,Mz,stress_value_5052,"Mz__augmented_dickey_fuller__attr_""pvalue""__au...",real,4.459153e-11,True,dwell,Mz,stress_value_5052
9,"Fx__weld__fft_coefficient__attr_""abs""__coeff_5",1.734066,"Fx__fft_coefficient__attr_""abs""__coeff_5",real,2.20046e-09,True,weld,Fx,stress_value_5052,"Fx__fft_coefficient__attr_""abs""__coeff_5",real,2.20046e-09,True,weld,Fx,stress_value_5052


In [152]:
cols = colsG + vif["feature_rename"].sort_values().tolist() + colsY
combined4 = combined3[cols] 
combined4

Unnamed: 0,sample_no,location,D,Fx__dwell__ar_coefficient__coeff_0__k_10,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11","Fx__weld__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","Fx__weld__fft_coefficient__attr_""abs""__coeff_5",Fx_location,"Fy__dwell__fft_coefficient__attr_""abs""__coeff_59","Fy__dwell__fft_coefficient__attr_""real""__coeff_71",...,"Fz__weld__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4",Fz_location,"Mz__dwell__augmented_dickey_fuller__attr_""pvalue""__autolag_""AIC""","Mz__dwell__augmented_dickey_fuller__attr_""teststat""__autolag_""AIC""",R,W,position,stress_value_5052,stress_value_6061,stress_value_center
0,1,1,10,0.007489,1.910672,0.006729,60.230153,-0.077671,0.309552,-1.545507,...,0.000409,1.244326,0.000004,-5.345424,1400,60,0.153846,28.0,51.0,12.0
1,2,1,15,0.013134,1.497020,0.008800,64.371976,-0.133276,0.436635,0.786300,...,0.000723,1.203367,0.000026,-4.966068,1400,60,0.153846,14.0,-21.0,17.0
2,3,1,20,0.009040,0.999252,0.006253,88.143882,-0.059639,1.023505,0.550453,...,0.000582,1.444542,0.008110,-3.495263,1400,60,0.153846,10.0,35.0,12.0
3,4,1,10,0.005442,0.905469,0.008120,32.083086,-0.051020,1.788437,-0.536916,...,-0.000489,1.601667,0.038451,-2.963456,1400,70,0.153846,10.0,-10.0,20.0
4,5,1,15,0.005019,8.439772,0.008490,56.841945,-0.100744,4.068030,3.543262,...,0.001675,1.095031,0.000125,-4.608391,1400,70,0.153846,6.0,41.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,15,0.004357,3.389169,0.005643,83.967538,-0.084714,1.690158,-0.026647,...,0.000259,1.905742,0.000025,-4.975763,1600,70,0.846154,4.0,-23.0,2.0
374,51,7,20,0.001533,1.694796,0.006474,8.788741,-0.094956,0.798427,-0.462940,...,0.000801,1.669120,0.000013,-5.111042,1600,70,0.846154,0.0,-1.0,2.0
375,52,7,10,0.001576,1.366042,0.005295,41.193473,-0.203323,0.326939,0.139781,...,0.000469,1.671576,0.013144,-3.340967,1600,80,0.846154,-2.0,-41.0,5.0
376,53,7,15,-0.004785,0.781104,0.003532,27.703044,-0.099644,0.497165,-0.344797,...,-0.000155,1.686495,0.351520,-1.859140,1600,80,0.846154,10.0,-90.0,1.0


In [None]:
rels_selected_vif_2.to_excel("S01_af_feature_list.xlsx", index=False)
combined4.to_excel("S01_combined_data.xlsx", index=False)