# Transform - Data Transformation and Feature Engineering

This notebook handles data transformations, feature engineering, and data quality checks.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Seed NumPy's legacy global RNG so any np.random-based operation is repeatable across runs.
np.random.seed(34)
# NOTE(review): this silences *every* warning category for the whole kernel session,
# including deprecation and dtype warnings from pandas/seaborn — consider narrowing the filter.
warnings.filterwarnings('ignore')


In [None]:
# Load extracted data (run extract.ipynb first).
# Prefer the intermediate CSV files; if any of them is missing, fall back to
# parsing the raw whitespace-delimited source files directly.
try:
    train = pd.read_csv('train_extracted.csv')
    valid = pd.read_csv('valid_extracted.csv')
    y_valid = pd.read_csv('y_valid_extracted.csv')
    print("Loaded from intermediate files")
except FileNotFoundError:
    # Only a missing intermediate file triggers the fallback. Any other failure
    # (corrupt CSV, permission problem, interrupted kernel) is left to propagate
    # instead of being silently masked by re-reading the raw files.
    index_names = ['unit_number', 'time_cycles']
    setting_names = ['setting_1', 'setting_2', 'setting_3']
    sensor_names = ['s_{}'.format(i+1) for i in range(0,21)]
    col_names = index_names + setting_names + sensor_names

    # Raw strings keep the regex separator free of invalid-escape warnings.
    train = pd.read_csv('train_FD001.txt',sep=r'\s+',header=None,index_col=False,names=col_names)
    valid = pd.read_csv('test_FD001.txt',sep=r'\s+',header=None,index_col=False,names=col_names)
    y_valid = pd.read_csv('RUL_FD001.txt',sep=r'\s+',header=None,index_col=False,names=['RUL'])
    print("Loaded directly from source files")


In [None]:
# Data quality checks: dataset dimensions and the validation share of all rows
print('Shape of the train dataset : ',train.shape)
print('Shape of the validation dataset : ',valid.shape)
# The label promises a percentage, so scale the row fraction by 100 and append
# a percent sign instead of printing the raw 0-1 ratio.
valid_share = len(valid) / (len(valid) + len(train))
print('Percentage of the validation dataset : ', '{:.2f}%'.format(100 * valid_share))


In [None]:
# Check for missing values: total count of NaN cells across all columns of each frame
train_nan_total = train.isna().sum().sum()
print('Total None values in the train dataset : ', train_nan_total)

valid_nan_total = valid.isna().sum().sum()
print('Total None values in the validation dataset : ', valid_nan_total)


In [None]:
# Exploratory Data Analysis - summary statistics of the two index columns
# index_names is (re)defined here because the intermediate-file load path does not create it.
index_names = ['unit_number', 'time_cycles']
train[index_names].describe()


In [None]:
# Visualize engine lifetimes: one horizontal bar per unit, bar length = its maximum time_cycles
max_time_cycles = train.groupby('unit_number')[['time_cycles']].max()

# Tall canvas so that every unit gets its own readable bar.
fig, ax = plt.subplots(figsize=(20, 50))
max_time_cycles['time_cycles'].plot(
    kind='barh', width=0.8, stacked=True, align='center', ax=ax
)

ax.set_title('Turbofan Engines LifeTime', fontweight='bold', size=30)
ax.set_xlabel('Time cycle', fontweight='bold', size=20)
ax.set_ylabel('unit', fontweight='bold', size=20)
plt.xticks(size=15)
plt.yticks(size=15)
ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# Distribution of max time cycles: histogram with a KDE overlay
lifetimes = max_time_cycles['time_cycles']
displot_options = dict(kde=True, bins=20, height=6, aspect=2)

sns.displot(lifetimes, **displot_options)
plt.xlabel('max time cycle')
plt.show()


In [None]:
# Feature Engineering: Add RUL (Remaining Useful Life) column
def add_RUL_column(df):
    """Return a copy of ``df`` with a ``RUL`` column appended.

    For every row, ``RUL`` is the largest ``time_cycles`` value recorded for
    that row's ``unit_number`` minus the row's own ``time_cycles``. The row
    holding a unit's maximum cycle therefore gets ``RUL == 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``unit_number`` and ``time_cycles``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index, row order and columns as ``df`` plus
        ``RUL`` (an existing ``RUL`` column is overwritten). ``df`` itself is
        not modified.

    Raises
    ------
    KeyError
        If ``unit_number`` or ``time_cycles`` is missing from ``df``.
    """
    # transform('max') broadcasts each unit's maximum back onto its own rows,
    # aligned to df's index. No temporary helper column is merged in, so an
    # input that already has a 'max_time_cycle' column no longer breaks this.
    last_cycle_per_row = df.groupby('unit_number')['time_cycles'].transform('max')
    return df.assign(RUL=last_cycle_per_row - df['time_cycles'])


In [None]:
# Apply RUL transformation to training data (rebinds `train` to the frame with the RUL column)
train = add_RUL_column(train)
print("RUL column added to training data")

# Preview the first ten rows of the unit / RUL pairing.
train.loc[:, ['unit_number', 'RUL']].head(10)


In [None]:
# Verify RUL transformation: column-wise maxima per unit, with unit_number restored as a column
per_unit_max = train.groupby('unit_number').max()
maxrul_u = per_unit_max.reset_index()
maxrul_u.head(5)


In [None]:
# Correlation analysis: pairwise correlation of all columns, drawn as a lower-triangle heatmap
corr = train.corr()

# Boolean mask that is True on and above the diagonal, hiding the redundant upper triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))
cmap = sns.diverging_palette(230, 10, as_cmap=True)

f, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): vmax=.3 caps the colour scale, so every correlation above 0.3 is drawn
# with the same end colour — confirm this clipping is intended.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Display transformed data summary: final dimensions of both frames
train_shape, valid_shape = train.shape, valid.shape
print("Transformed training data shape:", train_shape)
print("Transformed validation data shape:", valid_shape)
print("\nTransformed data is ready for loading!")


In [None]:
# Save transformed data for next stage (Load)
import os  # not referenced by any statement in this cell

# Target file -> frame; each frame is written as CSV without its index column.
transformed_outputs = {
    'train_transformed.csv': train,
    'valid_transformed.csv': valid,
    'y_valid_transformed.csv': y_valid,
}
for csv_path, frame in transformed_outputs.items():
    frame.to_csv(csv_path, index=False)

print("Transformed data saved for loading stage")
