In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [12]:
#load and process data with op settings
df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt', delim_whitespace=True, header=None)
df3.head()


  df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\test_FD003.txt', delim_whitespace=True, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,521.89,2387.94,8133.48,8.376,0.03,391,2388,100.0,39.07,23.4468
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.9,14.62,...,521.85,2388.01,8137.44,8.4062,0.03,391,2388,100.0,39.04,23.4807
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,522.1,2387.94,8138.25,8.3553,0.03,391,2388,100.0,39.1,23.4244
3,1,4,0.0027,0.0001,100.0,518.67,642.2,1584.53,1395.34,14.62,...,522.45,2387.96,8137.07,8.3709,0.03,392,2388,100.0,38.97,23.4782
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,521.91,2387.97,8134.2,8.4146,0.03,391,2388,100.0,39.09,23.395


In [13]:
columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(23)]
if len(columns) != df3.shape[1]:
    columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(df3.shape[1] - 5)] #to generate correct number of sensor cols; prevent mismatch error

df3.columns = columns
df3['max_time'] = df3.groupby('unit')['time'].transform('max')
df3['remaining_time'] = df3['max_time'] - df3['time']
df3['label'] = df3['remaining_time'].clip(upper=130)
df3.head()

Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr15,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,8.376,0.03,391,2388,100.0,39.07,23.4468,233,232,130
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.9,14.62,...,8.4062,0.03,391,2388,100.0,39.04,23.4807,233,231,130
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,8.3553,0.03,391,2388,100.0,39.1,23.4244,233,230,130
3,1,4,0.0027,0.0001,100.0,518.67,642.2,1584.53,1395.34,14.62,...,8.3709,0.03,392,2388,100.0,38.97,23.4782,233,229,130
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,8.4146,0.03,391,2388,100.0,39.09,23.395,233,228,130


In [14]:
#rolling windows

window_size = 30

rolled_df3 = roll_time_series(df3, 
                             column_id='unit',
                             column_sort='time',
                             max_timeshift=window_size-1,
                             min_timeshift=window_size-1,
                             rolling_direction=1)

rolled_df3

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [00:04<00:00,  9.26it/s]


Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label,id
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,0.03,391,2388,100.0,39.07,23.4468,233,232,130,"(1, 30)"
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.90,14.62,...,0.03,391,2388,100.0,39.04,23.4807,233,231,130,"(1, 30)"
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,0.03,391,2388,100.0,39.10,23.4244,233,230,130,"(1, 30)"
3,1,4,0.0027,0.0001,100.0,518.67,642.20,1584.53,1395.34,14.62,...,0.03,392,2388,100.0,38.97,23.4782,233,229,130,"(1, 30)"
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,0.03,391,2388,100.0,39.09,23.3950,233,228,130,"(1, 30)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368755,100,243,0.0011,-0.0003,100.0,518.67,643.04,1594.99,1411.28,14.62,...,0.03,395,2388,100.0,39.40,23.4949,247,4,4,"(100, 247)"
368756,100,244,-0.0024,0.0002,100.0,518.67,642.47,1591.27,1417.64,14.62,...,0.03,394,2388,100.0,39.42,23.6011,247,3,3,"(100, 247)"
368757,100,245,-0.0053,0.0002,100.0,518.67,642.70,1593.81,1412.70,14.62,...,0.03,394,2388,100.0,39.43,23.5482,247,2,2,"(100, 247)"
368758,100,246,-0.0006,0.0001,100.0,518.67,642.19,1595.63,1406.58,14.62,...,0.03,395,2388,100.0,39.40,23.6687,247,1,1,"(100, 247)"


In [15]:
#select cols
selected_cols=['id','op1','op2','op3','sr2', 'sr3', 'sr4',
                    'sr7', 'sr8', 'sr9', 'sr11',
                    'sr12', 'sr13', 'sr14', 'sr15',
                    'sr17', 'sr20', 'sr21']

selected_df3=rolled_df3[selected_cols]
selected_df3

Unnamed: 0,id,op1,op2,op3,sr2,sr3,sr4,sr7,sr8,sr9,sr11,sr12,sr13,sr14,sr15,sr17,sr20,sr21
0,"(1, 30)",-0.0017,-0.0004,100.0,641.94,1581.93,1396.93,554.56,2387.93,9048.65,47.09,521.89,2387.94,8133.48,8.3760,391,39.07,23.4468
1,"(1, 30)",0.0006,-0.0002,100.0,642.02,1584.86,1398.90,554.10,2387.94,9046.53,47.08,521.85,2388.01,8137.44,8.4062,391,39.04,23.4807
2,"(1, 30)",0.0014,-0.0003,100.0,641.68,1581.78,1391.92,554.41,2387.97,9054.92,47.15,522.10,2387.94,8138.25,8.3553,391,39.10,23.4244
3,"(1, 30)",0.0027,0.0001,100.0,642.20,1584.53,1395.34,554.58,2387.94,9055.04,47.26,522.45,2387.96,8137.07,8.3709,392,38.97,23.4782
4,"(1, 30)",-0.0001,0.0001,100.0,642.46,1589.03,1395.86,554.16,2388.01,9048.59,46.94,521.91,2387.97,8134.20,8.4146,391,39.09,23.3950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368755,"(100, 247)",0.0011,-0.0003,100.0,643.04,1594.99,1411.28,561.21,2388.24,9088.31,47.61,529.23,2388.24,8162.24,8.2939,395,39.40,23.4949
368756,"(100, 247)",-0.0024,0.0002,100.0,642.47,1591.27,1417.64,561.02,2388.29,9085.35,47.75,529.27,2388.26,8163.24,8.3247,394,39.42,23.6011
368757,"(100, 247)",-0.0053,0.0002,100.0,642.70,1593.81,1412.70,561.16,2388.23,9084.29,47.60,529.48,2388.28,8162.12,8.3264,394,39.43,23.5482
368758,"(100, 247)",-0.0006,0.0001,100.0,642.19,1595.63,1406.58,562.66,2388.24,9092.52,47.69,529.39,2388.33,8164.20,8.2908,395,39.40,23.6687


In [6]:
#extraction of features

efficient_params = EfficientFCParameters()

def batch_feature_extraction(df, batch_size=100):
    feature_batches = []
    for i in range(0, len(df['id'].unique()), batch_size):
        batch_df = df[df['id'].isin(df['id'].unique()[i:i+batch_size])]
        X_batch = extract_features(batch_df,
                                   column_id='id',
                                   default_fc_parameters=efficient_params,
                                   impute_function=impute)
        feature_batches.append(X_batch)
    return pd.concat(feature_batches)

X3 = batch_feature_extraction(selected_df3, batch_size=50)


Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.64it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.77it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.83it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.74it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.75it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.79it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.83it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.76it/s]
Feature Extraction: 100%|███████████████

In [7]:
#select relevant features
y3 = rolled_df3.groupby('id')['label'].last()
X_filtered3 = select_features(X3, y3)
X_filtered3.head()

Unnamed: 0,Unnamed: 1,sr12__longest_strike_above_mean,sr9__autocorrelation__lag_4,sr9__autocorrelation__lag_5,sr9__autocorrelation__lag_6,"sr9__agg_autocorrelation__f_agg_""mean""__maxlag_40","sr9__agg_autocorrelation__f_agg_""median""__maxlag_40","sr9__agg_autocorrelation__f_agg_""var""__maxlag_40",sr11__index_mass_quantile__q_0.9,sr12__autocorrelation__lag_1,sr9__quantile__q_0.7,...,"sr12__fft_coefficient__attr_""abs""__coeff_15","op2__fft_coefficient__attr_""imag""__coeff_7","sr15__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","op2__fft_coefficient__attr_""real""__coeff_4","sr9__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",sr11__binned_entropy__max_bins_10,"sr7__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","sr13__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0"
1,30,4.0,-0.156993,-0.242751,0.024429,0.02566,0.00598,0.179223,0.9,0.172593,9073.08,...,2.73,-0.000931,0.015089,0.056214,0.000313,2.320262,2.064017,-0.14,0.237654,-0.05
1,31,4.0,-0.06092,-0.158389,0.024689,0.03198,-0.010781,0.154344,0.933333,0.175388,9073.08,...,2.69,0.000304,0.010929,0.056354,-0.000764,2.413649,2.070729,-0.14,0.242215,-0.05
1,32,4.0,0.064658,-0.086831,-0.031135,-0.052343,-0.00477,0.062722,0.933333,0.188228,9073.08,...,2.99,0.000795,0.010929,0.05435,-0.00117,1.779132,2.079151,-0.14,0.246094,-0.05
1,33,4.0,0.066106,0.116911,0.043965,-0.06244,0.043965,0.201844,0.9,0.152381,9073.08,...,3.07,-0.000238,0.012367,0.05435,-0.000668,4.29839,2.070729,-0.14,0.246094,-0.05
1,34,3.0,0.03108,0.106668,0.070959,-0.151259,0.027827,0.156176,0.933333,0.157945,9073.08,...,2.61,-0.001143,0.018943,0.056078,0.000175,3.702011,2.092785,-0.14,0.234375,-0.05


In [9]:
# Data scaling and train-validation split

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
relevant_features_scaled3 = scaler.fit_transform(X_filtered3)
relevant_features_scaled_df3 = pd.DataFrame(relevant_features_scaled3, columns=X_filtered3.columns)
relevant_features_scaled_df3.head()


Unnamed: 0,sr12__longest_strike_above_mean,sr9__autocorrelation__lag_4,sr9__autocorrelation__lag_5,sr9__autocorrelation__lag_6,"sr9__agg_autocorrelation__f_agg_""mean""__maxlag_40","sr9__agg_autocorrelation__f_agg_""median""__maxlag_40","sr9__agg_autocorrelation__f_agg_""var""__maxlag_40",sr11__index_mass_quantile__q_0.9,sr12__autocorrelation__lag_1,sr9__quantile__q_0.7,...,"sr12__fft_coefficient__attr_""abs""__coeff_15","op2__fft_coefficient__attr_""imag""__coeff_7","sr15__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","op2__fft_coefficient__attr_""real""__coeff_4","sr9__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",sr11__binned_entropy__max_bins_10,"sr7__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","sr13__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0"
0,-0.619156,-0.870997,-1.27764,-0.094784,0.948874,0.789166,-0.192067,-1.591191,0.005542,0.478429,...,0.825313,-0.822523,-0.048105,-0.142579,0.282397,0.117663,0.01358,-0.604524,0.181284,-1.029983
1,-0.619156,-0.495395,-0.925039,-0.093653,0.987854,0.658851,-0.29005,0.62846,0.014141,0.478429,...,0.794691,0.26983,-0.976611,-0.139919,-0.686943,0.200114,0.074486,-0.604524,0.22101,-1.029983
2,-0.619156,-0.004446,-0.625957,-0.335752,0.467765,0.705587,-0.650895,0.62846,0.053644,0.478429,...,1.024357,0.704036,-0.976611,-0.177874,-1.051769,-0.360104,0.150914,-0.604524,0.254803,-1.029983
3,-0.619156,0.001218,0.225605,-0.010059,0.405486,1.084512,-0.102975,-1.591191,-0.056644,0.478429,...,1.085602,-0.209583,-0.655655,-0.177874,-0.600428,1.864161,0.074486,-0.604524,0.254803,-1.029983
4,-0.939913,-0.135721,0.182795,0.107007,-0.142333,0.959033,-0.282836,0.62846,-0.039525,0.478429,...,0.733447,-1.010708,0.812031,-0.145156,0.15805,1.337616,0.274635,-0.604524,0.152716,-1.029983


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split data into training and validation sets
X_train3, X_val, y_train3, y_val = train_test_split(relevant_features_scaled_df3, y3, test_size=0.2, random_state=42)

In [34]:
#model 1: without tweaking the parameters 
xgb_model3 = XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
xgb_model3.fit(X_train3, y_train3)
xgb_model3

In [30]:
#model 2: reducing complexity of model (gave an rmse of 13.00)
#xgb_model3 = XGBRegressor(objective='reg:squarederror',
                          random_state=42,
                          max_depth=2,           # Further reduce depth
                          learning_rate=0.01,    # Lower learning rate for smoother convergence
                          n_estimators=500,      # Increase estimators for fine-tuning with lower learning rate
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=5,
                          reg_lambda=5)

In [31]:
#model 3: increase regularisation (do not use as RMSE maybe 10.01 but not great in the test data)
xgb_model3 = XGBRegressor(objective='reg:squarederror', 
                          random_state=42,
                          max_depth=3,
                          learning_rate=0.05,
                          n_estimators=200,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=5,           
                          reg_lambda=5)  

In [35]:
# Predict and evaluate
y_pred = xgb_model3.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse**0.5
print(rmse) # choose model 1

5.675786797304427
