In [52]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [53]:
#Function to load and process data with op settings
def load_and_process_data(file_path):
    col_1 = ['unit', 'time', 'op1', 'op2', 'op3']  
    col_2 = [f'sr{i+1}' for i in range(21)] 
    columns = col_1 + col_2
    df = pd.read_csv(file_path, delim_whitespace=True, header=None, names=columns)
    df['max_time'] = df.groupby('unit')['time'].transform('max')
    df['remaining_time'] = df['max_time'] - df['time']
    df['label'] = df['remaining_time'].clip(upper=130)
    return df

train_df_with_op = load_and_process_data(r"C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt")
train_df_with_op

  df = pd.read_csv(file_path, delim_whitespace=True, header=None, names=columns)


Unnamed: 0,unit,time,op1,op2,op3,sr1,sr2,sr3,sr4,sr5,...,sr15,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,8.4246,0.03,391,2388,100.0,39.11,23.3537,259,258,130
1,1,2,0.0008,-0.0003,100.0,518.67,642.50,1584.69,1396.89,14.62,...,8.4403,0.03,392,2388,100.0,38.99,23.4491,259,257,130
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,8.3901,0.03,391,2388,100.0,38.85,23.3669,259,256,130
3,1,4,-0.0020,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,8.3878,0.03,392,2388,100.0,38.96,23.2951,259,255,130
4,1,5,0.0016,0.0000,100.0,518.67,641.68,1588.63,1397.65,14.62,...,8.3869,0.03,392,2388,100.0,39.14,23.4583,259,254,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,100,148,-0.0016,-0.0003,100.0,518.67,643.78,1596.01,1424.11,14.62,...,8.5036,0.03,394,2388,100.0,38.44,22.9631,152,4,4
24716,100,149,0.0034,-0.0003,100.0,518.67,643.29,1596.38,1429.14,14.62,...,8.5174,0.03,395,2388,100.0,38.50,22.9746,152,3,3
24717,100,150,-0.0016,0.0004,100.0,518.67,643.84,1604.53,1431.41,14.62,...,8.5223,0.03,396,2388,100.0,38.39,23.0682,152,2,2
24718,100,151,-0.0023,0.0004,100.0,518.67,643.94,1597.56,1426.57,14.62,...,8.5148,0.03,395,2388,100.0,38.31,23.0753,152,1,1


In [54]:
#summary stats
def display_summary_stats(df):
    print(df.describe())

print(display_summary_stats(train_df_with_op))

               unit          time           op1           op2      op3  \
count  24720.000000  24720.000000  24720.000000  24720.000000  24720.0   
mean      48.631877    139.077063     -0.000024      0.000005    100.0   
std       29.348985     98.846675      0.002194      0.000294      0.0   
min        1.000000      1.000000     -0.008600     -0.000600    100.0   
25%       23.000000     62.000000     -0.001500     -0.000200    100.0   
50%       47.000000    124.000000     -0.000000     -0.000000    100.0   
75%       74.000000    191.000000      0.001500      0.000300    100.0   
max      100.000000    525.000000      0.008600      0.000700    100.0   

            sr1           sr2           sr3           sr4           sr5  ...  \
count  24720.00  24720.000000  24720.000000  24720.000000  2.472000e+04  ...   
mean     518.67    642.457858   1588.079175   1404.471212  1.462000e+01  ...   
std        0.00      0.523031      6.810418      9.773178  3.552786e-15  ...   
min      518.

In [55]:
# Feature Extraction with rolling windows
def feature_extraction(df, window_size=30):
    rolled_train = roll_time_series(df, column_id='unit', column_sort='time', max_timeshift=window_size - 1, min_timeshift=window_size - 1, rolling_direction=1)
    selected_cols = ['unit', 'label', 'op1', 'op2', 'op3'] + [f'sr{i+1}' for i in range(21)]
    selected_train = rolled_train[selected_cols]

    extraction_settings = EfficientFCParameters()
    X_train_features = extract_features(selected_train,
                                        column_id='unit',
                                        impute_function=impute,
                                        default_fc_parameters=extraction_settings)

    #drop cols with NaNs
    X_train_features = X_train_features.dropna(axis=1, how='all')  
    return X_train_features, rolled_train.groupby('unit')['label'].last()

X_trainwithop, y_trainwithop=feature_extraction(train_df_with_op, window_size=30)
X_trainwithop, y_trainwithop

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [00:07<00:00,  5.34it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [05:01<00:00,  7.53s/it]


(     op2__variance_larger_than_standard_deviation  op2__has_duplicate_max  \
 1                                             0.0                     1.0   
 2                                             0.0                     1.0   
 3                                             0.0                     1.0   
 4                                             0.0                     1.0   
 5                                             0.0                     1.0   
 ..                                            ...                     ...   
 96                                            0.0                     1.0   
 97                                            0.0                     1.0   
 98                                            0.0                     1.0   
 99                                            0.0                     1.0   
 100                                           0.0                     1.0   
 
      op2__has_duplicate_min  op2__has_duplicate  op2__sum_val

In [64]:
from tsfresh import select_features
import numpy as np
from sklearn.feature_selection import VarianceThreshold
#feature selection

def perform_feature_selection(X, y):
    if len(np.unique(y)) > 1:
        return select_features(X, y)
    else:
        print("Insufficient label diversity, using variance-based feature selection.")
        selector = VarianceThreshold(threshold=0.1)
        return pd.DataFrame(selector.fit_transform(X), columns=X.columns[selector.get_support()])
        
X_selected_with_op=perform_feature_selection(X_trainwithop, y_trainwithop)
X_selected_with_op

Insufficient label diversity, using variance-based feature selection.


Unnamed: 0,op2__length,op2__variation_coefficient,op2__absolute_sum_of_changes,op2__longest_strike_below_mean,op2__longest_strike_above_mean,op2__count_above_mean,op2__count_below_mean,op2__cid_ce__normalize_True,op2__symmetry_looking__r_0.05,op2__large_standard_deviation__r_0.30000000000000004,...,op1__range_count__max_1__min_-1,op1__range_count__max_0__min_-1000000000000.0,op1__range_count__max_1000000000000.0__min_0,op1__friedrich_coefficients__coeff_0__m_3__r_30,op1__friedrich_coefficients__coeff_1__m_3__r_30,"op1__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""max""","op1__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""min""","op1__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""mean""","op1__augmented_dickey_fuller__attr_""teststat""__autolag_""AIC""",op1__number_crossing_m__m_0
0,6900.0,-5.859920,2.3203,9.0,8.0,3321.0,3579.0,118.223755,0.0,0.0,...,6900.0,2931.0,3969.0,-5051.249444,17.514077,-0.051612,0.540703,0.402569,-3.096623,3473.0
1,6720.0,6.384275,2.1119,9.0,10.0,3603.0,3117.0,113.707392,0.0,0.0,...,6720.0,3071.0,3649.0,443.314253,-1.881706,-0.024701,-0.187904,-0.004462,-2.042638,3153.0
2,5790.0,4.524122,2.0245,10.0,6.0,2853.0,2937.0,107.653785,0.0,1.0,...,5790.0,2518.0,3272.0,9511.308581,14.242838,0.370132,0.203143,0.413902,-2.079941,3281.0
3,7290.0,-18.699426,2.3994,7.0,9.0,4105.0,3185.0,120.483553,1.0,0.0,...,7290.0,3652.0,3638.0,-7287.102338,-2.017187,0.216593,0.008047,0.094423,-3.952707,3478.0
4,5520.0,-7.225012,1.8520,8.0,8.0,2794.0,2726.0,103.578896,1.0,0.0,...,5520.0,2603.0,2917.0,2824.522104,-29.226556,-0.134023,0.539864,0.405162,-2.427724,3209.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,13860.0,15.506874,4.5752,15.0,10.0,6508.0,7352.0,165.211902,1.0,0.0,...,13860.0,6558.0,7302.0,4688.842911,-6.938111,0.102218,-0.032171,0.009563,-4.191204,7131.0
96,7380.0,-18.231683,2.6244,8.0,10.0,3749.0,3631.0,123.953679,1.0,1.0,...,7380.0,3508.0,3872.0,-2236.634982,-19.713795,-0.506654,0.187375,0.227886,-2.578020,3817.0
97,8340.0,-42.122980,2.8492,8.0,12.0,4533.0,3807.0,129.660329,1.0,0.0,...,8340.0,4205.0,4135.0,5826.050141,0.586251,0.011605,-0.099731,0.039155,-1.720758,4124.0
98,3480.0,10.092946,1.2494,8.0,5.0,1579.0,1901.0,89.077878,1.0,0.0,...,3480.0,2082.0,1398.0,648.887603,22.527396,0.420718,0.531478,-0.524941,-3.035601,1747.0


In [62]:
# Data scaling and train-validation split
def scale_split(X, y, test_size=0.2):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return train_test_split(X_scaled, y, test_size=test_size, random_state=42)

X_train_splitwithop, X_val_splitwithop, y_train_splitwithop, y_val_splitwithop = scale_split(X_selected_with_op, y_trainwithop)

In [63]:
# Model training
from sklearn.metrics import mean_squared_error, r2_score
def train_xgb_model(X_train, y_train, X_val, y_val):
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, y_train)

    y_pred_val = xgb_model.predict(X_val)
    mse_val = mean_squared_error(y_val, y_pred_val)
    r2_val = r2_score(y_val, y_pred_val)
    print(mse_val,r2_val)
    return xgb_model, y_pred_val

xgb_modelwithop, y_pred_withop = train_xgb_model(X_train_splitwithop, y_train_splitwithop, X_val_splitwithop, y_val_splitwithop)
print(xgb_modelwithop)
print(y_pred_withop)


0.0 1.0
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [28]:
#Function to load and process data without op settings
def load_and_process_withoutop(file_path):
    col_1 = ['unit', 'time', 'op1', 'op2', 'op3']  
    col_2 = [f'sr{i+1}' for i in range(21)] 
    columns = col_1 + col_2
    df = pd.read_csv(file_path, delim_whitespace=True, header=None, names=columns)
    df = df.drop(columns=['op1', 'op2', 'op3'])
    # Calculating remaining useful life (RUL)
    df['max_time'] = df.groupby('unit')['time'].transform('max')
    df['remaining_time'] = df['max_time'] - df['time']
    df['label'] = df['remaining_time'].clip(upper=130)
    return df

train_df_without_op = load_and_process_withoutop(r"C:\Users\65962\Desktop\JUPYTER\CMAPSSData\train_FD003.txt")
train_df_without_op

  df = pd.read_csv(file_path, delim_whitespace=True, header=None, names=columns)


Unnamed: 0,unit,time,sr1,sr2,sr3,sr4,sr5,sr6,sr7,sr8,...,sr15,sr16,sr17,sr18,sr19,sr20,sr21,max_time,remaining_time,label
0,1,1,518.67,642.36,1583.23,1396.84,14.62,21.61,553.97,2387.96,...,8.4246,0.03,391,2388,100.0,39.11,23.3537,259,258,130
1,1,2,518.67,642.50,1584.69,1396.89,14.62,21.61,554.55,2388.00,...,8.4403,0.03,392,2388,100.0,38.99,23.4491,259,257,130
2,1,3,518.67,642.18,1582.35,1405.61,14.62,21.61,554.43,2388.03,...,8.3901,0.03,391,2388,100.0,38.85,23.3669,259,256,130
3,1,4,518.67,642.92,1585.61,1392.27,14.62,21.61,555.21,2388.00,...,8.3878,0.03,392,2388,100.0,38.96,23.2951,259,255,130
4,1,5,518.67,641.68,1588.63,1397.65,14.62,21.61,554.74,2388.04,...,8.3869,0.03,392,2388,100.0,39.14,23.4583,259,254,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,100,148,518.67,643.78,1596.01,1424.11,14.62,21.61,551.86,2388.25,...,8.5036,0.03,394,2388,100.0,38.44,22.9631,152,4,4
24716,100,149,518.67,643.29,1596.38,1429.14,14.62,21.61,551.86,2388.23,...,8.5174,0.03,395,2388,100.0,38.50,22.9746,152,3,3
24717,100,150,518.67,643.84,1604.53,1431.41,14.62,21.61,551.30,2388.25,...,8.5223,0.03,396,2388,100.0,38.39,23.0682,152,2,2
24718,100,151,518.67,643.94,1597.56,1426.57,14.62,21.61,550.69,2388.26,...,8.5148,0.03,395,2388,100.0,38.31,23.0753,152,1,1


In [29]:
#summary stats
print(display_summary_stats(train_df_without_op))

               unit          time       sr1           sr2           sr3  \
count  24720.000000  24720.000000  24720.00  24720.000000  24720.000000   
mean      48.631877    139.077063    518.67    642.457858   1588.079175   
std       29.348985     98.846675      0.00      0.523031      6.810418   
min        1.000000      1.000000    518.67    640.840000   1564.300000   
25%       23.000000     62.000000    518.67    642.080000   1583.280000   
50%       47.000000    124.000000    518.67    642.400000   1587.520000   
75%       74.000000    191.000000    518.67    642.790000   1592.412500   
max      100.000000    525.000000    518.67    645.110000   1615.390000   

                sr4           sr5           sr6           sr7           sr8  \
count  24720.000000  2.472000e+04  24720.000000  24720.000000  24720.000000   
mean    1404.471212  1.462000e+01     21.595841    555.143808   2388.071555   
std        9.773178  3.552786e-15      0.018116      3.437343      0.158285   
min     

In [66]:
# Feature Extraction with rolling windows
def feature_extraction_withoutop(df, window_size=30):
    rolled_train = roll_time_series(df,column_id='unit',column_sort='time',max_timeshift=window_size - 1,min_timeshift=window_size - 1,rolling_direction=1)
    selected_cols = ['unit', 'label'] + [f'sr{i+1}' for i in range(21)]
    selected_train = rolled_train[selected_cols]

    extraction_settings = EfficientFCParameters()
    X_train_features = extract_features(selected_train,
                                        column_id='unit',
                                        impute_function=impute,
                                        default_fc_parameters=extraction_settings)

    #drop cols with NaNs
    X_train_features = X_train_features.dropna(axis=1, how='all')  
    return X_train_features, rolled_train.groupby('unit')['label'].last()

X_trainwithoutop, y_trainwithoutop=feature_extraction_withoutop(train_df_without_op, window_size=30)
X_trainwithoutop, y_trainwithoutop

Rolling: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [00:05<00:00,  7.18it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [04:27<00:00,  6.70s/it]


(     label__variance_larger_than_standard_deviation  label__has_duplicate_max  \
 1                                               1.0                       1.0   
 2                                               1.0                       1.0   
 3                                               1.0                       1.0   
 4                                               1.0                       1.0   
 5                                               1.0                       1.0   
 ..                                              ...                       ...   
 96                                              1.0                       1.0   
 97                                              1.0                       1.0   
 98                                              1.0                       1.0   
 99                                              1.0                       1.0   
 100                                             1.0                       1.0   
 
      label__h

In [67]:
# Feature Selection
X_selected_without_op=perform_feature_selection(X_trainwithoutop, y_trainwithoutop)
X_selected_without_op

Insufficient label diversity, using variance-based feature selection.


Unnamed: 0,label__sum_values,label__abs_energy,label__mean_abs_change,label__median,label__mean,label__length,label__standard_deviation,label__variance,label__skewness,label__kurtosis,...,"sr21__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""mean""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""max""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""min""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""mean""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""max""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""min""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""mean""","sr21__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""var""","sr21__augmented_dickey_fuller__attr_""teststat""__autolag_""AIC""","sr21__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC"""
0,694040.0,79709780.0,0.956370,129.0,100.585507,6900.0,37.877404,1434.697761,-0.944844,-0.546966,...,-0.875542,-0.913434,-0.857507,-0.904504,-0.948693,-0.927557,-0.934285,-0.131670,4.261607,31.0
1,670640.0,76667780.0,0.981991,126.0,99.797619,6720.0,38.070081,1449.331066,-0.905750,-0.618988,...,0.796642,0.806790,0.807548,0.818857,0.835695,0.820240,0.849757,0.748830,7.697458,33.0
2,549740.0,60950780.0,1.139748,110.5,94.946459,5790.0,38.885407,1512.074854,-0.685496,-0.955446,...,-0.835416,-0.843438,-0.799322,-0.859932,-0.861896,-0.852600,-0.889894,0.118840,4.446881,30.0
3,744740.0,86300780.0,0.905200,130.0,102.159122,7290.0,37.440021,1401.755201,-1.026239,-0.385421,...,-0.774630,-0.682047,-0.677605,-0.817027,-0.566702,-0.738553,-0.871143,0.259986,2.676414,28.0
4,514640.0,56387780.0,1.195506,106.0,93.231884,5520.0,39.025547,1522.993331,-0.614848,-1.037598,...,-0.829269,-0.800570,-0.802428,-0.863858,-0.800386,-0.873682,-0.895283,0.564021,3.992538,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1598840.0,197333780.0,0.476081,130.0,115.356421,13860.0,30.504795,930.542516,-2.016479,2.781403,...,0.776145,0.712299,0.728692,0.804887,0.723372,0.748435,0.825101,0.293774,4.589473,28.0
96,756440.0,87821780.0,0.894159,130.0,102.498645,7380.0,37.336282,1393.997966,-1.044425,-0.347205,...,0.782703,0.801835,0.703882,0.806014,0.858557,0.674227,0.832732,0.610812,6.536191,31.0
97,881240.0,104045780.0,0.791222,130.0,105.664269,8340.0,36.201872,1310.575534,-1.226387,0.077260,...,0.832354,0.778746,0.819230,0.855959,0.835856,0.843542,0.875879,0.071805,4.548924,31.0
98,250000.0,22053180.0,1.836160,72.0,71.839080,3480.0,34.296752,1176.267208,-0.031667,-1.103906,...,-0.768257,-0.669418,-0.696656,-0.823296,-0.696824,-0.794003,-0.905162,0.488940,3.156689,30.0


In [68]:
# Data scaling and train-validation split
X_train_splitnoop, X_val_splitnoop, y_train_splitnoop, y_val_splitnoop = scale_split(X_selected_without_op, y_trainwithoutop)

In [69]:
# Model training
xgb_modelwithoutop, y_pred_withoutop = train_xgb_model(X_train_splitnoop, y_train_splitnoop, X_val_splitnoop, y_val_splitnoop)
print(xgb_modelwithoutop)
print(y_pred_withoutop)

0.0 1.0
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
