In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_errored
from xgboost import XGBRegressor

In [2]:
#load and process data with op settings
df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\test_FD003.txt', delim_whitespace=True, header=None)
df3.head()


  df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\test_FD003.txt', delim_whitespace=True, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,521.89,2387.94,8133.48,8.376,0.03,391,2388,100.0,39.07,23.4468
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.9,14.62,...,521.85,2388.01,8137.44,8.4062,0.03,391,2388,100.0,39.04,23.4807
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,522.1,2387.94,8138.25,8.3553,0.03,391,2388,100.0,39.1,23.4244
3,1,4,0.0027,0.0001,100.0,518.67,642.2,1584.53,1395.34,14.62,...,522.45,2387.96,8137.07,8.3709,0.03,392,2388,100.0,38.97,23.4782
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,521.91,2387.97,8134.2,8.4146,0.03,391,2388,100.0,39.09,23.395


In [3]:
columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(23)]
if len(columns) != df3.shape[1]:
    columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(df3.shape[1] - 5)] #to generate correct number of sensor cols; prevent mismatch error

df3.columns = columns
df3['max_time'] = df3.groupby('unit')['time'].transform('max')
df3['remaining_time'] = df3['max_time'] - df3['time']
df3['label'] = df3['remaining_time'].clip(upper=130)
df3.head()

Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr15,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,8.376,0.03,391,2388,100.0,39.07,23.4468,233,232,130
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.9,14.62,...,8.4062,0.03,391,2388,100.0,39.04,23.4807,233,231,130
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,8.3553,0.03,391,2388,100.0,39.1,23.4244,233,230,130
3,1,4,0.0027,0.0001,100.0,518.67,642.2,1584.53,1395.34,14.62,...,8.3709,0.03,392,2388,100.0,38.97,23.4782,233,229,130
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,8.4146,0.03,391,2388,100.0,39.09,23.395,233,228,130


In [4]:
#rolling windows

window_size = 30

rolled_df3 = roll_time_series(df3, 
                             column_id='unit',
                             column_sort='time',
                             max_timeshift=window_size-1,
                             min_timeshift=window_size-1,
                             rolling_direction=1)

rolled_df3

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [00:04<00:00,  9.05it/s]


Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label,id
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,0.03,391,2388,100.0,39.07,23.4468,233,232,130,"(1, 30)"
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.90,14.62,...,0.03,391,2388,100.0,39.04,23.4807,233,231,130,"(1, 30)"
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,0.03,391,2388,100.0,39.10,23.4244,233,230,130,"(1, 30)"
3,1,4,0.0027,0.0001,100.0,518.67,642.20,1584.53,1395.34,14.62,...,0.03,392,2388,100.0,38.97,23.4782,233,229,130,"(1, 30)"
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,0.03,391,2388,100.0,39.09,23.3950,233,228,130,"(1, 30)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368755,100,243,0.0011,-0.0003,100.0,518.67,643.04,1594.99,1411.28,14.62,...,0.03,395,2388,100.0,39.40,23.4949,247,4,4,"(100, 247)"
368756,100,244,-0.0024,0.0002,100.0,518.67,642.47,1591.27,1417.64,14.62,...,0.03,394,2388,100.0,39.42,23.6011,247,3,3,"(100, 247)"
368757,100,245,-0.0053,0.0002,100.0,518.67,642.70,1593.81,1412.70,14.62,...,0.03,394,2388,100.0,39.43,23.5482,247,2,2,"(100, 247)"
368758,100,246,-0.0006,0.0001,100.0,518.67,642.19,1595.63,1406.58,14.62,...,0.03,395,2388,100.0,39.40,23.6687,247,1,1,"(100, 247)"


In [5]:
#select cols
selected_cols=['id','op1','op2','op3','sr2', 'sr3', 'sr4',
                    'sr7', 'sr8', 'sr9', 'sr11',
                    'sr12', 'sr13', 'sr14', 'sr15',
                    'sr17', 'sr20', 'sr21']

selected_df3=rolled_df3[selected_cols]
selected_df3

Unnamed: 0,id,op1,op2,op3,sr2,sr3,sr4,sr7,sr8,sr9,sr11,sr12,sr13,sr14,sr15,sr17,sr20,sr21
0,"(1, 30)",-0.0017,-0.0004,100.0,641.94,1581.93,1396.93,554.56,2387.93,9048.65,47.09,521.89,2387.94,8133.48,8.3760,391,39.07,23.4468
1,"(1, 30)",0.0006,-0.0002,100.0,642.02,1584.86,1398.90,554.10,2387.94,9046.53,47.08,521.85,2388.01,8137.44,8.4062,391,39.04,23.4807
2,"(1, 30)",0.0014,-0.0003,100.0,641.68,1581.78,1391.92,554.41,2387.97,9054.92,47.15,522.10,2387.94,8138.25,8.3553,391,39.10,23.4244
3,"(1, 30)",0.0027,0.0001,100.0,642.20,1584.53,1395.34,554.58,2387.94,9055.04,47.26,522.45,2387.96,8137.07,8.3709,392,38.97,23.4782
4,"(1, 30)",-0.0001,0.0001,100.0,642.46,1589.03,1395.86,554.16,2388.01,9048.59,46.94,521.91,2387.97,8134.20,8.4146,391,39.09,23.3950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368755,"(100, 247)",0.0011,-0.0003,100.0,643.04,1594.99,1411.28,561.21,2388.24,9088.31,47.61,529.23,2388.24,8162.24,8.2939,395,39.40,23.4949
368756,"(100, 247)",-0.0024,0.0002,100.0,642.47,1591.27,1417.64,561.02,2388.29,9085.35,47.75,529.27,2388.26,8163.24,8.3247,394,39.42,23.6011
368757,"(100, 247)",-0.0053,0.0002,100.0,642.70,1593.81,1412.70,561.16,2388.23,9084.29,47.60,529.48,2388.28,8162.12,8.3264,394,39.43,23.5482
368758,"(100, 247)",-0.0006,0.0001,100.0,642.19,1595.63,1406.58,562.66,2388.24,9092.52,47.69,529.39,2388.33,8164.20,8.2908,395,39.40,23.6687


In [6]:
#extraction of features

efficient_params = EfficientFCParameters()

def batch_feature_extraction(df, batch_size=100):
    feature_batches = []
    for i in range(0, len(df['id'].unique()), batch_size):
        batch_df = df[df['id'].isin(df['id'].unique()[i:i+batch_size])]
        X_batch = extract_features(batch_df,
                                   column_id='id',
                                   default_fc_parameters=efficient_params,
                                   impute_function=impute)
        feature_batches.append(X_batch)
    return pd.concat(feature_batches)

X3 = batch_feature_extraction(selected_df3, batch_size=50)


Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.66it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.62it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.71it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.68it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.78it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.79it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.83it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.79it/s]
Feature Extraction: 100%|███████████████

In [7]:
#select relevant features
y3 = rolled_df3.groupby('id')['label'].last()
X_filtered3 = select_features(X3, y3)
X_filtered3.head()

Unnamed: 0,Unnamed: 1,sr4__c3__lag_3,"sr11__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_11__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_10__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)","sr13__fft_coefficient__attr_""abs""__coeff_0",...,op1__energy_ratio_by_chunks__num_segments_10__segment_focus_3,sr21__ar_coefficient__coeff_2__k_10,"sr21__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",sr14__fourier_entropy__bins_5,op1__energy_ratio_by_chunks__num_segments_10__segment_focus_6,"op2__fft_coefficient__attr_""real""__coeff_2","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4",sr4__mean_second_derivative_central,sr8__fourier_entropy__bins_100,"sr15__fft_coefficient__attr_""imag""__coeff_9"
1,30,2717726000.0,195.146541,138.519284,189.959981,140.661471,184.198669,141.399113,177.866699,140.599306,71639.73,...,0.182027,-0.453461,0.000684,1.511086,0.154181,-0.000862,0.006344,-0.124464,2.339372,-0.07925
1,31,2717025000.0,195.15353,138.521738,189.964859,140.669406,184.196195,141.424304,177.881055,140.597177,71639.81,...,0.177672,-0.330675,0.000702,1.401393,0.131256,9.3e-05,0.006344,0.372857,2.599302,0.062146
1,32,2715805000.0,195.166932,138.514186,189.972948,140.671404,184.202297,141.432077,177.879923,140.622538,71639.79,...,0.126587,-0.333158,0.00066,1.180305,0.127945,0.000606,0.00665,-0.125357,2.599302,0.023342
1,33,2714437000.0,195.17775,138.506477,189.978647,140.667048,184.201818,141.435209,177.876633,140.629119,71639.86,...,0.062745,-0.39969,0.000873,1.04084,0.00517,0.000714,0.00665,-0.1075,2.220025,-0.062211
1,34,2713528000.0,195.146855,138.559047,189.977362,140.664361,184.194054,141.432635,177.861397,140.630377,71639.86,...,0.037766,-0.464552,0.000647,1.04084,0.054619,0.000516,0.006114,-0.106607,2.599302,0.021573


In [8]:
# Data scaling and train-validation split

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
relevant_features_scaled3 = scaler.fit_transform(X_filtered3)
relevant_features_scaled_df3 = pd.DataFrame(relevant_features_scaled3, columns=X_filtered3.columns)
relevant_features_scaled_df3.head()


Unnamed: 0,sr4__c3__lag_3,"sr11__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_11__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_10__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)","sr13__fft_coefficient__attr_""abs""__coeff_0",...,op1__energy_ratio_by_chunks__num_segments_10__segment_focus_3,sr21__ar_coefficient__coeff_2__k_10,"sr21__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",sr14__fourier_entropy__bins_5,op1__energy_ratio_by_chunks__num_segments_10__segment_focus_6,"op2__fft_coefficient__attr_""real""__coeff_2","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4",sr4__mean_second_derivative_central,sr8__fourier_entropy__bins_100,"sr15__fft_coefficient__attr_""imag""__coeff_9"
0,-0.842181,-0.838413,-0.806226,-0.836881,-0.821118,-0.823963,-0.867343,-0.835428,-0.858655,-0.190428,...,1.093325,-1.004748,0.400922,1.252703,0.709052,-0.756658,-0.270937,-0.862584,-1.796369,-1.014155
1,-0.863929,-0.828755,-0.801419,-0.829929,-0.805709,-0.827613,-0.818414,-0.813428,-0.862834,-0.169484,...,1.035431,-0.600183,0.45632,0.783171,0.406773,0.07836,-0.270937,2.567499,0.104514,0.815963
2,-0.901729,-0.810236,-0.816211,-0.818401,-0.80183,-0.818612,-0.803315,-0.815164,-0.813059,-0.17472,...,0.356298,-0.608362,0.327787,-0.163174,0.363115,0.52683,-0.259084,-0.868742,0.104514,0.313723
3,-0.944127,-0.795287,-0.831311,-0.81028,-0.810288,-0.819318,-0.797233,-0.820204,-0.800143,-0.156393,...,-0.492443,-0.827579,0.98076,-0.760139,-1.255755,0.621352,-0.259084,-0.74558,-2.669155,-0.793617
4,-0.972319,-0.837979,-0.728343,-0.812112,-0.815505,-0.830771,-0.802232,-0.843552,-0.797674,-0.156393,...,-0.824522,-1.041292,0.287477,-0.760139,-0.603734,0.448165,-0.279851,-0.739421,0.104514,0.290826


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into training and validation sets
X_train3, X_val, y_train3, y_val = train_test_split(relevant_features_scaled_df3, y3, test_size=0.2, random_state=42)

In [24]:
#model 1
xgb_model3 = XGBRegressor(objective='reg:squarederror', random_state=42) 

# Train the model
xgb_model3.fit(X_train3, y_train3)
xgb_model3

In [20]:
#model 2: reduce model complexity (do not use: gave a RMSE of 31.27)
#xgb_model3 = XGBRegressor(objective='reg:squarederror',
                          random_state=42,
                          max_depth=2,           # Further reduce depth
                          learning_rate=0.01,    # Lower learning rate for smoother convergence
                          n_estimators=500,      # Increase estimators for fine-tuning with lower learning rate
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=5,
                          reg_lambda=5)

xgb_model3.fit(X_train3, y_train3)
xgb_model3

In [22]:
#model 3: increase regularisation (do not use: gave RMSE of 22.12)
xgb_model3 = XGBRegressor(objective='reg:squarederror', 
                          random_state=42,
                          max_depth=3,
                          learning_rate=0.05,
                          n_estimators=200,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=5,           
                          reg_lambda=5)  

In [25]:
# Predict and evaluate
y_pred = xgb_model3.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse**0.5
print(rmse) #use model 1

10.416054684399942
