In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
#load and process data with op settings
df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt', delim_whitespace=True, header=None)
df3.head()


  df3=pd.read_csv(r'C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt', delim_whitespace=True, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.8,8.3869,0.03,392,2388,100.0,39.14,23.4583


In [3]:
columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(23)]
if len(columns) != df3.shape[1]:
    columns = ['unit', 'time', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(df3.shape[1] - 5)] #to generate correct number of sensor cols; prevent mismatch error

df3.columns = columns
df3['max_time'] = df3.groupby('unit')['time'].transform('max')
df3['remaining_time'] = df3['max_time'] - df3['time']
df3['label'] = df3['remaining_time'].clip(upper=130)
df3.head()

Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr15,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,8.4246,0.03,391,2388,100.0,39.11,23.3537,259,258,130
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,8.4403,0.03,392,2388,100.0,38.99,23.4491,259,257,130
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,8.3901,0.03,391,2388,100.0,38.85,23.3669,259,256,130
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,8.3878,0.03,392,2388,100.0,38.96,23.2951,259,255,130
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,8.3869,0.03,392,2388,100.0,39.14,23.4583,259,254,130


In [4]:
#rolling windows

window_size = 30

rolled_df3 = roll_time_series(df3, 
                             column_id='unit',
                             column_sort='time',
                             max_timeshift=window_size-1,
                             min_timeshift=window_size-1,
                             rolling_direction=1)

rolled_df3

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [00:05<00:00,  6.81it/s]


Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label,id
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,0.03,391,2388,100.0,39.11,23.3537,259,258,130,"(1, 30)"
1,1,2,0.0008,-0.0003,100.0,518.67,642.50,1584.69,1396.89,14.62,...,0.03,392,2388,100.0,38.99,23.4491,259,257,130,"(1, 30)"
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,0.03,391,2388,100.0,38.85,23.3669,259,256,130,"(1, 30)"
3,1,4,-0.0020,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,0.03,392,2388,100.0,38.96,23.2951,259,255,130,"(1, 30)"
4,1,5,0.0016,0.0000,100.0,518.67,641.68,1588.63,1397.65,14.62,...,0.03,392,2388,100.0,39.14,23.4583,259,254,130,"(1, 30)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449005,100,148,-0.0016,-0.0003,100.0,518.67,643.78,1596.01,1424.11,14.62,...,0.03,394,2388,100.0,38.44,22.9631,152,4,4,"(100, 152)"
449006,100,149,0.0034,-0.0003,100.0,518.67,643.29,1596.38,1429.14,14.62,...,0.03,395,2388,100.0,38.50,22.9746,152,3,3,"(100, 152)"
449007,100,150,-0.0016,0.0004,100.0,518.67,643.84,1604.53,1431.41,14.62,...,0.03,396,2388,100.0,38.39,23.0682,152,2,2,"(100, 152)"
449008,100,151,-0.0023,0.0004,100.0,518.67,643.94,1597.56,1426.57,14.62,...,0.03,395,2388,100.0,38.31,23.0753,152,1,1,"(100, 152)"


In [5]:
#select cols
selected_cols=['id','op1','op2','op3','sr2', 'sr3', 'sr4',
                    'sr7', 'sr8', 'sr9', 'sr11',
                    'sr12', 'sr13', 'sr14', 'sr15',
                    'sr17', 'sr20', 'sr21']

selected_df3=rolled_df3[selected_cols]
selected_df3

Unnamed: 0,id,op1,op2,op3,sr2,sr3,sr4,sr7,sr8,sr9,sr11,sr12,sr13,sr14,sr15,sr17,sr20,sr21
0,"(1, 30)",-0.0005,0.0004,100.0,642.36,1583.23,1396.84,553.97,2387.96,9062.17,47.30,522.31,2388.01,8145.32,8.4246,391,39.11,23.3537
1,"(1, 30)",0.0008,-0.0003,100.0,642.50,1584.69,1396.89,554.55,2388.00,9061.78,47.23,522.42,2388.03,8152.85,8.4403,392,38.99,23.4491
2,"(1, 30)",-0.0014,-0.0002,100.0,642.18,1582.35,1405.61,554.43,2388.03,9070.23,47.22,522.03,2388.00,8150.17,8.3901,391,38.85,23.3669
3,"(1, 30)",-0.0020,0.0001,100.0,642.92,1585.61,1392.27,555.21,2388.00,9064.57,47.24,522.49,2388.08,8146.56,8.3878,392,38.96,23.2951
4,"(1, 30)",0.0016,0.0000,100.0,641.68,1588.63,1397.65,554.74,2388.04,9076.14,47.15,522.58,2388.03,8147.80,8.3869,392,39.14,23.4583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449005,"(100, 152)",-0.0016,-0.0003,100.0,643.78,1596.01,1424.11,551.86,2388.25,9070.82,48.27,519.66,2388.30,8138.08,8.5036,394,38.44,22.9631
449006,"(100, 152)",0.0034,-0.0003,100.0,643.29,1596.38,1429.14,551.86,2388.23,9064.60,48.13,519.91,2388.28,8144.36,8.5174,395,38.50,22.9746
449007,"(100, 152)",-0.0016,0.0004,100.0,643.84,1604.53,1431.41,551.30,2388.25,9063.45,48.18,519.44,2388.24,8135.95,8.5223,396,38.39,23.0682
449008,"(100, 152)",-0.0023,0.0004,100.0,643.94,1597.56,1426.57,550.69,2388.26,9062.22,48.05,520.01,2388.26,8141.24,8.5148,395,38.31,23.0753


In [6]:
#extraction of features

efficient_params = EfficientFCParameters()

def batch_feature_extraction(df, batch_size=100):
    feature_batches = []
    for i in range(0, len(df['id'].unique()), batch_size):
        batch_df = df[df['id'].isin(df['id'].unique()[i:i+batch_size])]
        X_batch = extract_features(batch_df,
                                   column_id='id',
                                   default_fc_parameters=efficient_params,
                                   impute_function=impute)
        feature_batches.append(X_batch)
    return pd.concat(feature_batches)

X3 = batch_feature_extraction(selected_df3, batch_size=50)


Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.64it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.77it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.83it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.74it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.75it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.79it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.83it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.76it/s]
Feature Extraction: 100%|███████████████

In [7]:
#select relevant features
y3 = rolled_df3.groupby('id')['label'].last()
X_filtered3 = select_features(X3, y3)
X_filtered3.head()

Unnamed: 0,Unnamed: 1,sr12__longest_strike_above_mean,sr9__autocorrelation__lag_4,sr9__autocorrelation__lag_5,sr9__autocorrelation__lag_6,"sr9__agg_autocorrelation__f_agg_""mean""__maxlag_40","sr9__agg_autocorrelation__f_agg_""median""__maxlag_40","sr9__agg_autocorrelation__f_agg_""var""__maxlag_40",sr11__index_mass_quantile__q_0.9,sr12__autocorrelation__lag_1,sr9__quantile__q_0.7,...,"sr12__fft_coefficient__attr_""abs""__coeff_15","op2__fft_coefficient__attr_""imag""__coeff_7","sr15__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","op2__fft_coefficient__attr_""real""__coeff_4","sr9__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",sr11__binned_entropy__max_bins_10,"sr7__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","sr13__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0"
1,30,4.0,-0.156993,-0.242751,0.024429,0.02566,0.00598,0.179223,0.9,0.172593,9073.08,...,2.73,-0.000931,0.015089,0.056214,0.000313,2.320262,2.064017,-0.14,0.237654,-0.05
1,31,4.0,-0.06092,-0.158389,0.024689,0.03198,-0.010781,0.154344,0.933333,0.175388,9073.08,...,2.69,0.000304,0.010929,0.056354,-0.000764,2.413649,2.070729,-0.14,0.242215,-0.05
1,32,4.0,0.064658,-0.086831,-0.031135,-0.052343,-0.00477,0.062722,0.933333,0.188228,9073.08,...,2.99,0.000795,0.010929,0.05435,-0.00117,1.779132,2.079151,-0.14,0.246094,-0.05
1,33,4.0,0.066106,0.116911,0.043965,-0.06244,0.043965,0.201844,0.9,0.152381,9073.08,...,3.07,-0.000238,0.012367,0.05435,-0.000668,4.29839,2.070729,-0.14,0.246094,-0.05
1,34,3.0,0.03108,0.106668,0.070959,-0.151259,0.027827,0.156176,0.933333,0.157945,9073.08,...,2.61,-0.001143,0.018943,0.056078,0.000175,3.702011,2.092785,-0.14,0.234375,-0.05


In [9]:
# Data scaling and train-validation split

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
relevant_features_scaled3 = scaler.fit_transform(X_filtered3)
relevant_features_scaled_df3 = pd.DataFrame(relevant_features_scaled3, columns=X_filtered3.columns)
relevant_features_scaled_df3.head()


Unnamed: 0,sr12__longest_strike_above_mean,sr9__autocorrelation__lag_4,sr9__autocorrelation__lag_5,sr9__autocorrelation__lag_6,"sr9__agg_autocorrelation__f_agg_""mean""__maxlag_40","sr9__agg_autocorrelation__f_agg_""median""__maxlag_40","sr9__agg_autocorrelation__f_agg_""var""__maxlag_40",sr11__index_mass_quantile__q_0.9,sr12__autocorrelation__lag_1,sr9__quantile__q_0.7,...,"sr12__fft_coefficient__attr_""abs""__coeff_15","op2__fft_coefficient__attr_""imag""__coeff_7","sr15__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","sr12__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","op2__fft_coefficient__attr_""real""__coeff_4","sr9__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",sr11__binned_entropy__max_bins_10,"sr7__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","sr17__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","sr13__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0"
0,-0.619156,-0.870997,-1.27764,-0.094784,0.948874,0.789166,-0.192067,-1.591191,0.005542,0.478429,...,0.825313,-0.822523,-0.048105,-0.142579,0.282397,0.117663,0.01358,-0.604524,0.181284,-1.029983
1,-0.619156,-0.495395,-0.925039,-0.093653,0.987854,0.658851,-0.29005,0.62846,0.014141,0.478429,...,0.794691,0.26983,-0.976611,-0.139919,-0.686943,0.200114,0.074486,-0.604524,0.22101,-1.029983
2,-0.619156,-0.004446,-0.625957,-0.335752,0.467765,0.705587,-0.650895,0.62846,0.053644,0.478429,...,1.024357,0.704036,-0.976611,-0.177874,-1.051769,-0.360104,0.150914,-0.604524,0.254803,-1.029983
3,-0.619156,0.001218,0.225605,-0.010059,0.405486,1.084512,-0.102975,-1.591191,-0.056644,0.478429,...,1.085602,-0.209583,-0.655655,-0.177874,-0.600428,1.864161,0.074486,-0.604524,0.254803,-1.029983
4,-0.939913,-0.135721,0.182795,0.107007,-0.142333,0.959033,-0.282836,0.62846,-0.039525,0.478429,...,0.733447,-1.010708,0.812031,-0.145156,0.15805,1.337616,0.274635,-0.604524,0.152716,-1.029983


In [10]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into training and validation sets
X_train3, X_val, y_train3, y_val = train_test_split(relevant_features_scaled_df3, y3, test_size=0.2, random_state=42)

#model
xgb_model3 = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model3.fit(X_train3, y_train3)

In [11]:
# Predict and evaluate
y_pred = xgb_model3.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse**0.5
rmse

np.float64(5.675786797304427)