In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
#load and process data (with op settings)
df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt', delim_whitespace=True, header=None)
df3.head()


  df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt', delim_whitespace=True, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.8,8.3869,0.03,392,2388,100.0,39.14,23.4583


In [3]:
#Define columns (with op settings)
columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(23)]
if len(columns) != df3.shape[1]:
    columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(df3.shape[1] - 5)] #to generate correct number of sensor cols; prevent mismatch error

df3.columns = columns
df3['max_time'] = df3.groupby('unit')['time'].transform('max')
df3['remaining_time'] = df3['max_time'] - df3['time']
df3['label'] = df3['remaining_time'].clip(upper=130)
df3.head()

Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr15,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,8.4246,0.03,391,2388,100.0,39.11,23.3537,259,258,130
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,8.4403,0.03,392,2388,100.0,38.99,23.4491,259,257,130
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,8.3901,0.03,391,2388,100.0,38.85,23.3669,259,256,130
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,8.3878,0.03,392,2388,100.0,38.96,23.2951,259,255,130
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,8.3869,0.03,392,2388,100.0,39.14,23.4583,259,254,130


In [4]:
#Rolling windows

window_size = 30

rolled_df3 = roll_time_series(df3, 
                             column_id='unit',
                             column_sort='time',
                             max_timeshift=window_size-1,
                             min_timeshift=window_size-1,
                             rolling_direction=1)

rolled_df3

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [00:05<00:00,  6.61it/s]


Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label,id
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,0.03,391,2388,100.0,39.11,23.3537,259,258,130,"(1, 30)"
1,1,2,0.0008,-0.0003,100.0,518.67,642.50,1584.69,1396.89,14.62,...,0.03,392,2388,100.0,38.99,23.4491,259,257,130,"(1, 30)"
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,0.03,391,2388,100.0,38.85,23.3669,259,256,130,"(1, 30)"
3,1,4,-0.0020,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,0.03,392,2388,100.0,38.96,23.2951,259,255,130,"(1, 30)"
4,1,5,0.0016,0.0000,100.0,518.67,641.68,1588.63,1397.65,14.62,...,0.03,392,2388,100.0,39.14,23.4583,259,254,130,"(1, 30)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480145,100,148,-0.0016,-0.0003,100.0,518.67,643.78,1596.01,1424.11,14.62,...,0.03,394,2388,100.0,38.44,22.9631,152,4,4,"(100, 152)"
480146,100,149,0.0034,-0.0003,100.0,518.67,643.29,1596.38,1429.14,14.62,...,0.03,395,2388,100.0,38.50,22.9746,152,3,3,"(100, 152)"
480147,100,150,-0.0016,0.0004,100.0,518.67,643.84,1604.53,1431.41,14.62,...,0.03,396,2388,100.0,38.39,23.0682,152,2,2,"(100, 152)"
480148,100,151,-0.0023,0.0004,100.0,518.67,643.94,1597.56,1426.57,14.62,...,0.03,395,2388,100.0,38.31,23.0753,152,1,1,"(100, 152)"


In [5]:
#Select features (with op settings)
selected_cols=['id','op1','op2','op3','sr2', 'sr3', 'sr4',
                    'sr7', 'sr8', 'sr9', 'sr11',
                    'sr12', 'sr13', 'sr14', 'sr15',
                    'sr17', 'sr20', 'sr21']

selected_df3=rolled_df3[selected_cols]
selected_df3

Unnamed: 0,id,op1,op2,op3,sr2,sr3,sr4,sr7,sr8,sr9,sr11,sr12,sr13,sr14,sr15,sr17,sr20,sr21
0,"(1, 30)",-0.0005,0.0004,100.0,642.36,1583.23,1396.84,553.97,2387.96,9062.17,47.30,522.31,2388.01,8145.32,8.4246,391,39.11,23.3537
1,"(1, 30)",0.0008,-0.0003,100.0,642.50,1584.69,1396.89,554.55,2388.00,9061.78,47.23,522.42,2388.03,8152.85,8.4403,392,38.99,23.4491
2,"(1, 30)",-0.0014,-0.0002,100.0,642.18,1582.35,1405.61,554.43,2388.03,9070.23,47.22,522.03,2388.00,8150.17,8.3901,391,38.85,23.3669
3,"(1, 30)",-0.0020,0.0001,100.0,642.92,1585.61,1392.27,555.21,2388.00,9064.57,47.24,522.49,2388.08,8146.56,8.3878,392,38.96,23.2951
4,"(1, 30)",0.0016,0.0000,100.0,641.68,1588.63,1397.65,554.74,2388.04,9076.14,47.15,522.58,2388.03,8147.80,8.3869,392,39.14,23.4583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480145,"(100, 152)",-0.0016,-0.0003,100.0,643.78,1596.01,1424.11,551.86,2388.25,9070.82,48.27,519.66,2388.30,8138.08,8.5036,394,38.44,22.9631
480146,"(100, 152)",0.0034,-0.0003,100.0,643.29,1596.38,1429.14,551.86,2388.23,9064.60,48.13,519.91,2388.28,8144.36,8.5174,395,38.50,22.9746
480147,"(100, 152)",-0.0016,0.0004,100.0,643.84,1604.53,1431.41,551.30,2388.25,9063.45,48.18,519.44,2388.24,8135.95,8.5223,396,38.39,23.0682
480148,"(100, 152)",-0.0023,0.0004,100.0,643.94,1597.56,1426.57,550.69,2388.26,9062.22,48.05,520.01,2388.26,8141.24,8.5148,395,38.31,23.0753


In [7]:
#Extraction of features (with op settings)

efficient_params = EfficientFCParameters()

def batch_feature_extraction(df, batch_size=100):
    feature_batches = []
    for i in range(0, len(df['id'].unique()), batch_size):
        batch_df = df[df['id'].isin(df['id'].unique()[i:i+batch_size])]
        X_batch = extract_features(batch_df,
                                   column_id='id',
                                   default_fc_parameters=efficient_params,
                                   impute_function=impute)
        feature_batches.append(X_batch)
    return pd.concat(feature_batches)

X3 = batch_feature_extraction(selected_df3, batch_size=100)


Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:19<00:00,  2.09it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.13it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.12it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.12it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.11it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:19<00:00,  2.09it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:19<00:00,  2.09it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.11it/s]
Feature Extraction: 100%|███████████████

In [8]:
#Select relevant features (with op settings)
y3 = rolled_df3.groupby('id')['label'].last()
X_filtered3 = select_features(X3, y3)
X_filtered3.head()

Unnamed: 0,Unnamed: 1,sr11__median,"sr14__fft_coefficient__attr_""imag""__coeff_2","sr14__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)",sr11__first_location_of_maximum,sr11__last_location_of_minimum,sr11__first_location_of_minimum,sr11__maximum,sr11__absolute_maximum,sr11__minimum,...,"sr12__fft_coefficient__attr_""abs""__coeff_15","op2__fft_coefficient__attr_""imag""__coeff_7","sr15__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","op2__fft_coefficient__attr_""real""__coeff_4","sr9__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",sr11__binned_entropy__max_bins_10,"sr7__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","sr13__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0"
1,30,47.26,5.226501,26968.10338,8838.463622,0.766667,0.566667,0.533333,47.41,47.41,47.0,...,2.73,-0.000931,0.015089,0.056214,0.000313,2.320262,2.064017,-0.14,0.237654,-0.05
1,31,47.25,3.611017,26969.093525,8838.08901,0.733333,0.533333,0.5,47.41,47.41,47.0,...,2.69,0.000304,0.010929,0.056354,-0.000764,2.413649,2.070729,-0.14,0.242215,-0.05
1,32,47.25,0.850532,26968.930469,8838.718534,0.7,0.5,0.466667,47.41,47.41,47.0,...,2.99,0.000795,0.010929,0.05435,-0.00117,1.779132,2.079151,-0.14,0.246094,-0.05
1,33,47.26,-0.214501,26969.387052,8838.973157,0.666667,0.466667,0.433333,47.41,47.41,47.0,...,3.07,-0.000238,0.012367,0.05435,-0.000668,4.29839,2.070729,-0.14,0.246094,-0.05
1,34,47.265,-1.226175,26970.490326,8838.742339,0.966667,0.433333,0.4,47.47,47.47,47.0,...,2.61,-0.001143,0.018943,0.056078,0.000175,3.702011,2.092785,-0.14,0.234375,-0.05


In [9]:
# Data scaling and train-validation split

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
relevant_features_scaled3 = scaler.fit_transform(X_filtered3)
relevant_features_scaled_df3 = pd.DataFrame(relevant_features_scaled3, columns=X_filtered3.columns)
relevant_features_scaled_df3.head()


Unnamed: 0,sr11__median,"sr14__fft_coefficient__attr_""imag""__coeff_2","sr14__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)",sr11__first_location_of_maximum,sr11__last_location_of_minimum,sr11__first_location_of_minimum,sr11__maximum,sr11__absolute_maximum,sr11__minimum,...,"sr12__fft_coefficient__attr_""abs""__coeff_15","op2__fft_coefficient__attr_""imag""__coeff_7","sr15__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","op2__fft_coefficient__attr_""real""__coeff_4","sr9__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",sr11__binned_entropy__max_bins_10,"sr7__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","sr13__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0"
0,-0.517351,-0.193092,0.586761,0.651022,0.588274,0.579977,0.699543,-0.774756,-0.774756,-0.687739,...,0.825313,-0.822523,-0.048105,-0.142579,0.282397,0.117663,0.01358,-0.604524,0.181284,-1.029983
1,-0.558577,-0.265324,0.61169,0.617902,0.472297,0.460878,0.57815,-0.774756,-0.774756,-0.687739,...,0.794691,0.26983,-0.976611,-0.139919,-0.686943,0.200114,0.074486,-0.604524,0.22101,-1.029983
2,-0.558577,-0.388752,0.607585,0.673559,0.356319,0.341779,0.456756,-0.774756,-0.774756,-0.687739,...,1.024357,0.704036,-0.976611,-0.177874,-1.051769,-0.360104,0.150914,-0.604524,0.254803,-1.029983
3,-0.517351,-0.436373,0.619081,0.69607,0.240342,0.22268,0.335362,-0.774756,-0.774756,-0.687739,...,1.085602,-0.209583,-0.655655,-0.177874,-0.600428,1.864161,0.074486,-0.604524,0.254803,-1.029983
4,-0.496738,-0.481607,0.646858,0.675663,1.284138,0.103581,0.213969,-0.546222,-0.546222,-0.687739,...,0.733447,-1.010708,0.812031,-0.145156,0.15805,1.337616,0.274635,-0.604524,0.152716,-1.029983


In [10]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into training and validation sets
X_train3, X_val, y_train3, y_val = train_test_split(relevant_features_scaled_df3, y3, test_size=0.2, random_state=42)

In [11]:
#model 1: without tweaking the parameters 
xgb_model3 = XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
xgb_model3.fit(X_train3, y_train3)
xgb_model3

In [12]:
# Predict and evaluate
y_pred = xgb_model3.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse**0.5
print(rmse) # choose model 1

5.573059463928551


In [13]:
# Define columns without operation settings (op1, op2, op3)
columns = ['unit', 'time'] + [f'sr{i+1}' for i in range(24)]
if len(columns) != df3.shape[1]:
    columns = ['unit', 'time'] + [f'sr{i+1}' for i in range(df3.shape[1] - 2)]  # Adjust sensor columns dynamically

df3.columns = columns
df3['max_time'] = df3.groupby('unit')['time'].transform('max')
df3['remaining_time'] = df3['max_time'] - df3['time']
df3['label'] = df3['remaining_time'].clip(upper=130)

In [14]:
#Rolling windows
window_size = 40

rolled_df3 = roll_time_series(df3, 
                             column_id='unit',
                             column_sort='time',
                             max_timeshift=window_size-1,
                             min_timeshift=window_size-1,
                             rolling_direction=1)

# Select columns (without op settings)
selected_cols = ['id', 'sr2', 'sr3', 'sr4', 'sr7', 'sr8', 'sr9', 'sr11',
                 'sr12', 'sr13', 'sr14', 'sr15', 'sr17', 'sr20', 'sr21']

selected_df3 = rolled_df3[selected_cols]

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  5.99it/s]


In [15]:
#Extract features
efficient_params = EfficientFCParameters()

def batch_feature_extraction(df, batch_size=100):
    feature_batches = []
    for i in range(0, len(df['id'].unique()), batch_size):
        batch_df = df[df['id'].isin(df['id'].unique()[i:i+batch_size])]
        X_batch = extract_features(batch_df,
                                   column_id='id',
                                   default_fc_parameters=efficient_params,
                                   impute_function=impute)
        feature_batches.append(X_batch)
    return pd.concat(feature_batches)

X3 = batch_feature_extraction(selected_df3, batch_size=100)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.53it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.56it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.55it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.53it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.53it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.59it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.66it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.65it/s]
Feature Extraction: 100%|███████████████

In [16]:
# Select relevant features using tsfresh select_features
y3 = rolled_df3.groupby('id')['label'].last()
X_filtered3 = select_features(X3, y3)


In [27]:
from sklearn.feature_selection import SelectKBest, f_regression
# Data scaling and train-validation split
scaler = StandardScaler()
relevant_features_scaled3 = scaler.fit_transform(X_filtered3)
relevant_features_scaled_df3 = pd.DataFrame(relevant_features_scaled3, columns=X_filtered3.columns)
relevant_features_scaled_df3

Unnamed: 0,sr9__lempel_ziv_complexity__bins_5,"sr11__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_8__w_20__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)","sr9__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.0","sr9__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.0","sr11__cwt_coefficients__coeff_4__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_3__w_20__widths_(2, 5, 10, 20)",...,"sr13__fft_coefficient__attr_""real""__coeff_1","sr2__fft_coefficient__attr_""angle""__coeff_19",sr11__large_standard_deviation__r_0.25,"sr2__fft_coefficient__attr_""imag""__coeff_3","sr2__fft_coefficient__attr_""real""__coeff_3","sr11__change_quantiles__f_agg_""mean""__isabs_False__qh_0.8__ql_0.2","sr12__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.6","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2",sr20__large_standard_deviation__r_0.30000000000000004
0,1.452125,0.089295,-0.809391,-0.817550,-0.811155,-0.800337,1.274559,1.536647,-0.045280,-0.112933,...,-0.036462,0.251281,1.676151,0.124327,-0.556170,0.181426,-0.343460,-0.347273,-0.411692,-0.260695
1,1.452125,0.081847,-0.795291,-0.844228,-0.798424,-0.809520,1.274559,1.536647,0.072887,-0.081530,...,-0.036462,-1.576891,-0.596605,-0.107414,-0.486186,0.103552,-0.343460,-0.347273,-0.303107,-0.260695
2,1.452125,0.067403,-0.781082,-0.735132,-0.781429,-0.835454,1.274559,1.677566,0.128187,-0.056567,...,-0.036462,0.074624,-0.596605,-0.282140,-0.318571,0.034841,-0.343460,-0.347273,-0.303107,-0.260695
3,1.452125,0.065383,-0.784789,-0.729048,-0.782880,-0.733461,1.274559,1.536647,0.194461,-0.047940,...,-0.036462,1.701681,-0.596605,-0.155727,0.314700,0.031448,-0.365017,-0.357987,-0.411692,-0.260695
4,1.452125,0.032568,-0.810676,-0.713567,-0.812258,-0.742570,1.274559,1.536647,0.244696,-0.026149,...,-0.036462,-0.117242,-0.596605,0.038232,0.416968,0.247003,-0.365017,-0.357987,-0.303107,-0.260695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12692,0.597938,0.444021,0.660965,0.218513,0.641801,0.194081,0.898040,0.578398,0.240614,0.562768,...,3.225666,0.790449,-0.596605,-1.931829,-0.239814,-0.209468,-0.274191,-0.025412,-0.198949,-0.260695
12693,0.597938,0.459220,0.682788,0.226472,0.659996,0.208569,0.898040,0.578398,0.337713,0.634557,...,2.149854,-1.063639,1.676151,-1.795909,0.729615,0.301651,-0.274191,-0.025412,3.276512,-0.260695
12694,0.597938,0.516006,0.731985,0.269372,0.708230,0.232064,0.898040,0.578398,0.349955,0.672849,...,1.001537,0.844473,1.676151,-1.028869,1.936210,0.301651,0.164546,1.581101,3.815668,-0.260695
12695,0.597938,0.549031,0.779197,0.322944,0.754793,0.273221,0.898040,0.578398,0.354437,0.699090,...,-0.191010,-1.049803,1.676151,0.030392,2.325760,0.301651,0.404242,2.458791,4.162220,-0.260695


In [28]:
# Split data into training and validation sets
X_train3, X_val, y_train3, y_val = train_test_split(relevant_features_scaled_df3, y3, test_size=0.2, random_state=42)

In [29]:
# Model 1: Without tweaking the parameters
xgb_model3 = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model3.fit(X_train3, y_train3)

# Predict and evaluate
y_pred = xgb_model3.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
print(rmse)

8.783932956534588


In [4]:
test3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\test_FD003.txt', delim_whitespace=True, header=None)
test3.head()

  test3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\test_FD003.txt', delim_whitespace=True, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0017,-0.0004,100.0,518.67,641.94,1581.93,1396.93,14.62,...,521.89,2387.94,8133.48,8.376,0.03,391,2388,100.0,39.07,23.4468
1,1,2,0.0006,-0.0002,100.0,518.67,642.02,1584.86,1398.9,14.62,...,521.85,2388.01,8137.44,8.4062,0.03,391,2388,100.0,39.04,23.4807
2,1,3,0.0014,-0.0003,100.0,518.67,641.68,1581.78,1391.92,14.62,...,522.1,2387.94,8138.25,8.3553,0.03,391,2388,100.0,39.1,23.4244
3,1,4,0.0027,0.0001,100.0,518.67,642.2,1584.53,1395.34,14.62,...,522.45,2387.96,8137.07,8.3709,0.03,392,2388,100.0,38.97,23.4782
4,1,5,-0.0001,0.0001,100.0,518.67,642.46,1589.03,1395.86,14.62,...,521.91,2387.97,8134.2,8.4146,0.03,391,2388,100.0,39.09,23.395


In [7]:
columns = ['unit', 'time'] + [f'sr{i+1}' for i in range(24)]
if len(columns) != test3.shape[1]:
    columns = ['unit', 'time'] + [f'sr{i+1}' for i in range(test3.shape[1] - 2)]  # Adjust sensor columns dynamically

test3.columns = columns
test3['max_time'] = test3.groupby('unit')['time'].transform('max')
test3['remaining_time'] = test3['max_time'] - test3['time']
test3['label'] = test3['remaining_time'].clip(upper=130)

window_size = 40

rolled_dftest3 = roll_time_series(test3, 
                             column_id='unit',
                             column_sort='time',
                             max_timeshift=window_size-1,
                             min_timeshift=window_size-1,
                             rolling_direction=1)

# Select columns (without op settings)
testselected_cols = ['id', 'sr2', 'sr3', 'sr4', 'sr7', 'sr8', 'sr9', 'sr11',
                 'sr12', 'sr13', 'sr14', 'sr15', 'sr17', 'sr20', 'sr21']

selected_dftest3 = rolled_dftest3[testselected_cols]

efficient_params = EfficientFCParameters()

def batch_feature_extraction(df, batch_size=100):
    feature_batches = []
    for i in range(0, len(df['id'].unique()), batch_size):
        batch_df = df[df['id'].isin(df['id'].unique()[i:i+batch_size])]
        X_batch = extract_features(batch_df,
                                   column_id='id',
                                   default_fc_parameters=efficient_params,
                                   impute_function=impute)
        feature_batches.append(X_batch)
    return pd.concat(feature_batches)

Xtest3 = batch_feature_extraction(selected_dftest3, batch_size=100)


Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [00:04<00:00,  8.51it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.52it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.60it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.64it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.60it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.60it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.60it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:15<00:00,  2.64it/s]
Feature Extraction: 100%|███████████████

In [11]:
#select relevant features using tsfresh
y3 = rolled_dftest3.groupby('id')['label'].last()
X_testfiltered3 = select_features(Xtest3, y3)


In [14]:
from sklearn.feature_selection import SelectKBest, f_regression
# Data scaling and train-validation split
scaler = StandardScaler()
relevant_features_scaledtest3 = scaler.fit_transform(X_testfiltered3)
relevant_features_scaled_dftest3 = pd.DataFrame(relevant_features_scaledtest3, columns=X_testfiltered3.columns)
relevant_features_scaled_dftest3

Unnamed: 0,sr9__lempel_ziv_complexity__bins_5,"sr11__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_8__w_20__widths_(2, 5, 10, 20)","sr14__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)","sr9__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.0","sr9__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.0","sr11__cwt_coefficients__coeff_4__w_10__widths_(2, 5, 10, 20)","sr11__cwt_coefficients__coeff_3__w_20__widths_(2, 5, 10, 20)",...,"sr13__fft_coefficient__attr_""real""__coeff_1","sr2__fft_coefficient__attr_""angle""__coeff_19",sr11__large_standard_deviation__r_0.25,"sr2__fft_coefficient__attr_""imag""__coeff_3","sr2__fft_coefficient__attr_""real""__coeff_3","sr11__change_quantiles__f_agg_""mean""__isabs_False__qh_0.8__ql_0.2","sr12__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.6","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2",sr20__large_standard_deviation__r_0.30000000000000004
0,1.452125,0.089295,-0.809391,-0.817550,-0.811155,-0.800337,1.274559,1.536647,-0.045280,-0.112933,...,-0.036462,0.251281,1.676151,0.124327,-0.556170,0.181426,-0.343460,-0.347273,-0.411692,-0.260695
1,1.452125,0.081847,-0.795291,-0.844228,-0.798424,-0.809520,1.274559,1.536647,0.072887,-0.081530,...,-0.036462,-1.576891,-0.596605,-0.107414,-0.486186,0.103552,-0.343460,-0.347273,-0.303107,-0.260695
2,1.452125,0.067403,-0.781082,-0.735132,-0.781429,-0.835454,1.274559,1.677566,0.128187,-0.056567,...,-0.036462,0.074624,-0.596605,-0.282140,-0.318571,0.034841,-0.343460,-0.347273,-0.303107,-0.260695
3,1.452125,0.065383,-0.784789,-0.729048,-0.782880,-0.733461,1.274559,1.536647,0.194461,-0.047940,...,-0.036462,1.701681,-0.596605,-0.155727,0.314700,0.031448,-0.365017,-0.357987,-0.411692,-0.260695
4,1.452125,0.032568,-0.810676,-0.713567,-0.812258,-0.742570,1.274559,1.536647,0.244696,-0.026149,...,-0.036462,-0.117242,-0.596605,0.038232,0.416968,0.247003,-0.365017,-0.357987,-0.303107,-0.260695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12692,0.597938,0.444021,0.660965,0.218513,0.641801,0.194081,0.898040,0.578398,0.240614,0.562768,...,3.225666,0.790449,-0.596605,-1.931829,-0.239814,-0.209468,-0.274191,-0.025412,-0.198949,-0.260695
12693,0.597938,0.459220,0.682788,0.226472,0.659996,0.208569,0.898040,0.578398,0.337713,0.634557,...,2.149854,-1.063639,1.676151,-1.795909,0.729615,0.301651,-0.274191,-0.025412,3.276512,-0.260695
12694,0.597938,0.516006,0.731985,0.269372,0.708230,0.232064,0.898040,0.578398,0.349955,0.672849,...,1.001537,0.844473,1.676151,-1.028869,1.936210,0.301651,0.164546,1.581101,3.815668,-0.260695
12695,0.597938,0.549031,0.779197,0.322944,0.754793,0.273221,0.898040,0.578398,0.354437,0.699090,...,-0.191010,-1.049803,1.676151,0.030392,2.325760,0.301651,0.404242,2.458791,4.162220,-0.260695


In [17]:
# Split data into training and validation sets
X_test3, X_val, y_test3, y_val = train_test_split(relevant_features_scaled_dftest3, y3, test_size=0.2, random_state=42)

In [18]:
# Model 1: Without tweaking the parameters
xgb_modeltest3 = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_modeltest3.fit(X_test3, y_test3)

# Predict and evaluate
y_pred = xgb_modeltest3.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
print(rmse)

8.783932956534588
