In [1]:
import itertools
import os
import pickle
import re
from pathlib import Path

import pandas as pd

In [2]:
# Maximum number of lowest-p-value features kept for each
# (phase_type, measure_type, stress_type) group in the selection loop below.
# The value is also embedded in the exported file names as the "MF<n>" suffix.
MAX_FEATURES = 1

In [3]:
# Working directory of the kernel process (this is not necessarily the folder
# that contains the notebook file).
BASE_DIR = Path.cwd()
# Project root is taken to be three levels above the working directory;
# adjust the number of `.parent` hops if the notebook is moved.
ROOT_DIR = BASE_DIR.parent.parent.parent
# All input workbooks are read from <root>/run1/data.
DATA_DIR = ROOT_DIR.joinpath("run1", "data")

In [4]:
# Feature-relevance table: one row per (feature, phase, measure, stress target)
relevance_path = DATA_DIR / "S03_af_feature_relevances.xlsx"
rels = pd.read_excel(relevance_path)
rels


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052
1,Fx__ar_coefficient__coeff_0__k_10,real,2.866412e-08,True,dwell,Fx,stress_value_5052
2,Fx__variation_coefficient,real,1.952901e-07,True,dwell,Fx,stress_value_5052
3,"Fx__fft_coefficient__attr_""abs""__coeff_51",real,5.246075e-07,True,dwell,Fx,stress_value_5052
4,"Fx__fft_coefficient__attr_""angle""__coeff_75",real,1.000460e-06,True,dwell,Fx,stress_value_5052
...,...,...,...,...,...,...,...
18763,Mz__value_count__value_1,constant,,False,weld,Mz,stress_value_center
18764,Mz__value_count__value_-1,constant,,False,weld,Mz,stress_value_center
18765,Mz__ratio_beyond_r_sigma__r_6,constant,,False,weld,Mz,stress_value_center
18766,Mz__ratio_beyond_r_sigma__r_7,constant,,False,weld,Mz,stress_value_center


In [5]:
# Keep only the rows flagged as relevant, then count them per
# (stress target, measure, phase) group.
rel = rels.loc[rels["relevant"]]
print(f"Shape of relevant features: {rel.shape}")
rel.groupby(["stress_type", "measure_type", "phase_type"]).size()


Shape of relevant features: (717, 7)


stress_type          measure_type  phase_type
stress_value_5052    Fx            dwell         155
                                   weld           71
                     Fy            dwell         159
                                   weld           86
                     Fz            dwell          50
                                   weld            6
                     Mz            dwell          93
                                   weld           90
stress_value_center  Fz            weld            7
dtype: int64

In [6]:
def rename_feature(cur_feature_name: str, phase_type: str) -> str:
    """Insert the phase name as the second "__"-separated token of a feature name.

    Args:
        cur_feature_name: Original feature name whose tokens are joined by "__".
        phase_type: Phase label to insert directly after the first token.

    Returns:
        The feature name with ``phase_type`` inserted after the first token.

    Example:
        >>> rename_feature('Fx__fft_coefficient__attr_"abs"__coeff_11', "weld")
        'Fx__weld__fft_coefficient__attr_"abs"__coeff_11'
    """
    first_token, *remaining_tokens = cur_feature_name.split("__")
    return "__".join([first_token, phase_type, *remaining_tokens])

In [7]:
# Pick the MAX_FEATURES lowest-p-value features for every
# (phase, measure, stress target) combination.
phase_types = ["dwell", "weld"]
measure_types = ["Fx", "Fy", "Fz", "Mz"]
stress_types = ["stress_value_5052", "stress_value_6061", "stress_value_center"]
iter_list = list(itertools.product(phase_types, measure_types, stress_types))

# NOTE(review): ranking is done over all rows of `rels`, not only rows with
# relevant == True, so a group without any relevant feature still contributes
# its best-ranked (non-relevant) features. Confirm this is intended.
df_arr = []
for phase_type, measure_type, stress_type in iter_list:
    in_group = (
        (rels["phase_type"] == phase_type)
        & (rels["measure_type"] == measure_type)
        & (rels["stress_type"] == stress_type)
    )

    # Rows without a p-value cannot be ranked, so they are excluded instead of
    # being allowed to fill up the top-N when a group has few ranked rows.
    group_top = (
        rels[in_group]
        .dropna(subset=["p_value"])
        .sort_values(by="p_value", ascending=True)
        .head(MAX_FEATURES)
    )

    # Rename only the rows that are kept; `assign` returns a new frame, so the
    # source table is never modified. A dedicated name is used here so the
    # `rel` frame created in an earlier cell is not overwritten.
    group_top = group_top.assign(
        feature_rename=group_top["feature"].map(
            lambda name: rename_feature(name, phase_type)
        )
    )

    df_arr.append(group_top)

rels_selected = pd.concat(df_arr, axis=0).reset_index(drop=True)
rels_selected

Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename
0,"Fx__fft_coefficient__attr_""abs""__coeff_11",real,1.565675e-10,True,dwell,Fx,stress_value_5052,"Fx__dwell__fft_coefficient__attr_""abs""__coeff_11"
1,Fx__quantile__q_0.7,real,0.0005089553,False,dwell,Fx,stress_value_6061,Fx__dwell__quantile__q_0.7
2,Fx__partial_autocorrelation__lag_6,real,8.023118e-05,False,dwell,Fx,stress_value_center,Fx__dwell__partial_autocorrelation__lag_6
3,"Fy__fft_coefficient__attr_""real""__coeff_71",real,2.016897e-08,True,dwell,Fy,stress_value_5052,"Fy__dwell__fft_coefficient__attr_""real""__coeff_71"
4,Fy__last_location_of_minimum,real,2.455827e-05,False,dwell,Fy,stress_value_6061,Fy__dwell__last_location_of_minimum
5,"Fy__change_quantiles__f_agg_""mean""__isabs_True...",real,0.0003033195,False,dwell,Fy,stress_value_center,"Fy__dwell__change_quantiles__f_agg_""mean""__isa..."
6,"Fz__fft_coefficient__attr_""angle""__coeff_4",real,1.840083e-12,True,dwell,Fz,stress_value_5052,"Fz__dwell__fft_coefficient__attr_""angle""__coeff_4"
7,"Fz__fft_coefficient__attr_""angle""__coeff_68",real,0.007103279,False,dwell,Fz,stress_value_6061,"Fz__dwell__fft_coefficient__attr_""angle""__coef..."
8,Fz__approximate_entropy__m_2__r_0.7,real,8.385676e-05,False,dwell,Fz,stress_value_center,Fz__dwell__approximate_entropy__m_2__r_0.7
9,"Mz__augmented_dickey_fuller__attr_""teststat""__...",real,4.459153e-11,True,dwell,Mz,stress_value_5052,"Mz__dwell__augmented_dickey_fuller__attr_""test..."


In [8]:
# A feature column is identified by (feature, phase_type); flag every repeat
# after its first occurrence.
duplicate_keys = ["feature", "phase_type"]
filt = rels_selected.duplicated(subset=duplicate_keys)
display(rels_selected.loc[filt])

# Keep the first occurrence of each (feature, phase_type) pair
rels_selected = rels_selected.loc[~filt]


Unnamed: 0,feature,type,p_value,relevant,phase_type,measure_type,stress_type,feature_rename


In [9]:
# How many of the selected features are flagged relevant vs. not relevant
relevant_flags = rels_selected["relevant"]
relevant_flags.value_counts()

relevant
False    15
True      9
Name: count, dtype: int64

In [10]:
# Discover the per-measure feature workbooks in DATA_DIR.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# which would otherwise make the resulting table order irreproducible.
files = sorted(
    f
    for f in os.listdir(DATA_DIR)
    if f.startswith("S01_af_features_") and f.endswith(".xlsx")
)
_load = []
for file in files:
    matched = re.match(r"S01_af_features_(\w+)\.xlsx", file)
    if matched is None:
        # Prefix/suffix fit but the measure part is not a plain word;
        # report it instead of failing on `.group` of None.
        print(f"Skipping unrecognised feature file: {file}")
        continue
    measure = matched.group(1)
    _load.append(dict(filename=file, measure=measure))
    print(f"Loading features for: {measure}")
# Explicit columns keep the table shape stable even when no file was found.
load = pd.DataFrame(_load, columns=["filename", "measure"])
load

Loading features for: Fx
Loading features for: Fy
Loading features for: Fz
Loading features for: Mz


Unnamed: 0,filename,measure
0,S01_af_features_Fx.xlsx,Fx
1,S01_af_features_Fy.xlsx,Fy
2,S01_af_features_Fz.xlsx,Fz
3,S01_af_features_Mz.xlsx,Mz


In [11]:
# Load the dwell and weld feature sheets of every discovered workbook.
# Result layout: features_dict[measure][phase] -> DataFrame
features_dict = {}
for _, row in load.iterrows():
    measure = row["measure"]
    file = row["filename"]
    filepath = DATA_DIR / file
    print(f"Loading features for: {measure}, file: {file}")

    # Passing a list of sheet names returns a dict of DataFrames and opens
    # the workbook once instead of once per sheet.
    sheets = pd.read_excel(filepath, sheet_name=["af_dwell", "af_weld"])

    # Store in dictionary, keyed by phase
    features_dict[measure] = {
        "dwell": sheets["af_dwell"],
        "weld": sheets["af_weld"],
    }

Loading features for: Fx, file: S01_af_features_Fx.xlsx
Loading features for: Fy, file: S01_af_features_Fy.xlsx
Loading features for: Fz, file: S01_af_features_Fz.xlsx
Loading features for: Mz, file: S01_af_features_Mz.xlsx


In [12]:
# Build one column per selected feature, aligned on sample_no.
# The Fx/dwell table supplies the reference sample order. Every other table is
# looked up by sample_no rather than by row position, so a different row order
# in another sheet cannot silently misalign values.
# NOTE(review): assumes every feature sheet has a "sample_no" column, as the
# Fx/dwell sheet does - confirm against the workbooks.
reference_sample_no = features_dict["Fx"]["dwell"]["sample_no"]

_selected_features = {}
for _, row in rels_selected.iterrows():
    source = features_dict[row["measure_type"]][row["phase_type"]]
    values_by_sample = source.set_index("sample_no")[row["feature"]]
    # `.loc` with a list of labels raises KeyError when a sample is missing,
    # so gaps are reported instead of becoming NaN.
    aligned = values_by_sample.loc[reference_sample_no.to_numpy()]
    # Store in selected features dictionary under the phase-qualified name
    _selected_features[row["feature_rename"]] = aligned.to_numpy()

# Convert to DataFrame (row index matches the reference table)
selected_features = pd.DataFrame(_selected_features, index=reference_sample_no.index)

# Add sample number column
selected_features.insert(loc=0, column="sample_no", value=reference_sample_no)

In [13]:
# Load the per-location values and suffix the measure columns with
# "_location" so they stay distinguishable after merging.
location_column_names = {
    "Location": "location",
    "Fx": "Fx_location",
    "Fy": "Fy_location",
    "Fz": "Fz_location",
    "Mz": "Mz_location",
}
locs = (
    pd.read_excel(DATA_DIR / "S04_loc_values.xlsx")
    .rename(columns=location_column_names)
    .drop(columns=["loc_idx", "loc_time"])
)
locs

Unnamed: 0,sample_no,location,Fx_location,Fy_location,Fz_location,Mz_location
0,1,1,-0.077671,0.143026,1.244326,1.873865
1,1,2,-0.027450,0.207481,1.625039,4.615337
2,1,3,-0.038704,0.272508,1.799653,5.209334
3,1,4,-0.023484,0.278517,1.882699,5.764727
4,1,5,-0.038251,0.280276,1.955412,6.097497
...,...,...,...,...,...,...
373,54,3,-0.079764,0.196450,1.352393,5.512565
374,54,4,-0.091477,0.259717,1.529089,6.898179
375,54,5,-0.068128,0.241906,1.657737,7.940597
376,54,6,-0.066880,0.261006,1.712860,8.849661


In [14]:
# Experiment table: process parameters and stress values per sample/location
experiment_path = DATA_DIR / "S02_data_exp.xlsx"
combined1 = pd.read_excel(experiment_path)
display(combined1)

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0
...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0


In [15]:
# Attach the location values to each (sample_no, location) row.
# `validate` makes pandas raise MergeError if the key pair is duplicated on
# either side, instead of silently multiplying rows.
combined2 = combined1.merge(
    locs, on=["sample_no", "location"], validate="one_to_one"
)
combined2

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center,Fx_location,Fy_location,Fz_location,Mz_location
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0,-0.077671,0.143026,1.244326,1.873865
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0,-0.133276,0.164254,1.203367,-1.054677
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0,-0.059639,0.269418,1.444542,2.940728
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0,-0.051020,0.211907,1.601667,3.661974
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0,-0.100744,0.179582,1.095031,-0.016799
...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0,-0.084714,0.284958,1.905742,8.950274
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0,-0.094956,0.257101,1.669120,9.627879
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0,-0.203323,0.173404,1.671576,4.696642
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0,-0.099644,0.266207,1.686495,10.497974


In [16]:
# Broadcast the per-sample selected features onto every location row.
# `validate` makes pandas raise MergeError if a sample_no occurs more than once
# in `selected_features`, which would otherwise duplicate rows silently.
combined3 = combined2.merge(
    selected_features, on=["sample_no"], validate="many_to_one"
)
combined3

Unnamed: 0,sample_no,location,position,R,W,D,stress_value_5052,stress_value_6061,stress_value_center,Fx_location,...,"Fx__weld__fft_coefficient__attr_""abs""__coeff_58",Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5,"Fy__weld__fft_coefficient__attr_""imag""__coeff_61","Fy__weld__fft_coefficient__attr_""real""__coeff_51","Fz__weld__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","Fz__weld__fft_coefficient__attr_""real""__coeff_84",Fz__weld__ratio_beyond_r_sigma__r_1,Mz__weld__quantile__q_0.1,"Mz__weld__fft_coefficient__attr_""angle""__coeff_15","Mz__weld__fft_coefficient__attr_""abs""__coeff_58"
0,1,1,0.153846,1400,60,10,28.0,51.0,12.0,-0.077671,...,18.493524,0.113272,-0.245535,-5.950581,0.000409,-4.184069,0.278363,-6.949290,93.404989,770.788243
1,2,1,0.153846,1400,60,15,14.0,-21.0,17.0,-0.133276,...,21.514172,0.110881,5.438227,-8.704837,0.000723,1.622507,0.204523,-5.712890,40.096726,358.604392
2,3,1,0.153846,1400,60,20,10.0,35.0,12.0,-0.059639,...,6.610517,0.107506,7.985972,-5.337497,0.000582,-3.515585,0.172679,-5.951800,149.473672,292.417343
3,4,1,0.153846,1400,70,10,10.0,-10.0,20.0,-0.051020,...,2.214507,0.108980,6.712190,-7.740446,-0.000489,1.585279,0.146626,-4.033597,75.561565,311.972632
4,5,1,0.153846,1400,70,15,6.0,41.0,14.0,-0.100744,...,13.253272,0.111608,3.894138,8.372584,0.001675,-6.302522,0.284189,-7.551309,85.348076,601.144018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,4.0,-23.0,2.0,-0.084714,...,20.968411,0.108628,15.052543,4.172921,0.000259,-2.624641,0.352746,-6.994062,87.390014,823.146921
374,51,7,0.846154,1600,70,20,0.0,-1.0,2.0,-0.094956,...,22.920228,0.106932,1.971530,6.294789,0.000801,-4.264308,0.367193,-8.050406,65.122289,979.362101
375,52,7,0.846154,1600,80,10,-2.0,-41.0,5.0,-0.203323,...,13.516468,0.089559,6.184885,-3.004381,0.000469,-5.987022,0.314019,-18.062500,58.113275,663.567303
376,53,7,0.846154,1600,80,15,10.0,-90.0,1.0,-0.099644,...,9.979603,0.103169,7.514874,1.595205,-0.000155,4.261066,0.231258,-4.259820,68.126310,499.039605


In [17]:
# Move the three stress target columns to the end; every other column keeps
# its relative order.
colsY = ["stress_value_5052", "stress_value_6061", "stress_value_center"]
colsX = combined3.drop(columns=colsY).columns.tolist()
combined4 = combined3.loc[:, colsX + colsY]
combined4

Unnamed: 0,sample_no,location,position,R,W,D,Fx_location,Fy_location,Fz_location,Mz_location,...,"Fy__weld__fft_coefficient__attr_""real""__coeff_51","Fz__weld__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","Fz__weld__fft_coefficient__attr_""real""__coeff_84",Fz__weld__ratio_beyond_r_sigma__r_1,Mz__weld__quantile__q_0.1,"Mz__weld__fft_coefficient__attr_""angle""__coeff_15","Mz__weld__fft_coefficient__attr_""abs""__coeff_58",stress_value_5052,stress_value_6061,stress_value_center
0,1,1,0.153846,1400,60,10,-0.077671,0.143026,1.244326,1.873865,...,-5.950581,0.000409,-4.184069,0.278363,-6.949290,93.404989,770.788243,28.0,51.0,12.0
1,2,1,0.153846,1400,60,15,-0.133276,0.164254,1.203367,-1.054677,...,-8.704837,0.000723,1.622507,0.204523,-5.712890,40.096726,358.604392,14.0,-21.0,17.0
2,3,1,0.153846,1400,60,20,-0.059639,0.269418,1.444542,2.940728,...,-5.337497,0.000582,-3.515585,0.172679,-5.951800,149.473672,292.417343,10.0,35.0,12.0
3,4,1,0.153846,1400,70,10,-0.051020,0.211907,1.601667,3.661974,...,-7.740446,-0.000489,1.585279,0.146626,-4.033597,75.561565,311.972632,10.0,-10.0,20.0
4,5,1,0.153846,1400,70,15,-0.100744,0.179582,1.095031,-0.016799,...,8.372584,0.001675,-6.302522,0.284189,-7.551309,85.348076,601.144018,6.0,41.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,50,7,0.846154,1600,70,15,-0.084714,0.284958,1.905742,8.950274,...,4.172921,0.000259,-2.624641,0.352746,-6.994062,87.390014,823.146921,4.0,-23.0,2.0
374,51,7,0.846154,1600,70,20,-0.094956,0.257101,1.669120,9.627879,...,6.294789,0.000801,-4.264308,0.367193,-8.050406,65.122289,979.362101,0.0,-1.0,2.0
375,52,7,0.846154,1600,80,10,-0.203323,0.173404,1.671576,4.696642,...,-3.004381,0.000469,-5.987022,0.314019,-18.062500,58.113275,663.567303,-2.0,-41.0,5.0
376,53,7,0.846154,1600,80,15,-0.099644,0.266207,1.686495,10.497974,...,1.595205,-0.000155,4.261066,0.231258,-4.259820,68.126310,499.039605,10.0,-90.0,1.0


In [18]:
# Write the selected-feature list and the aggregated table to the working
# directory; file names carry the MAX_FEATURES setting.
exports = {
    f"S01_af_feature_list_MF{MAX_FEATURES}.xlsx": rels_selected,
    f"S01_aggregate_data_MF{MAX_FEATURES}.xlsx": combined4,
}
for output_name, frame in exports.items():
    frame.to_excel(output_name, index=False)