# Extract - Data Loading

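This notebook extracts the raw data from the source text files (`train_FD*.txt`, `test_FD*.txt` and `RUL_FD*.txt`), combines the train and test files into a single dataset, and saves the result as CSV for the Transform stage.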


In [1]:
import glob
import os
import warnings

import numpy as np
import pandas as pd
# Seed NumPy's legacy global random state so any later random draws repeat.
SEED = 34
np.random.seed(SEED)
# Blanket filter: every warning category is silenced from this point on.
warnings.filterwarnings(action='ignore')


In [2]:
# Column layout applied to the header-less raw files:
# two index columns, three setting columns, then sensor columns s_1 .. s_21.
index_names = ['unit_number', 'time_cycles']
setting_names = [f'setting_{k}' for k in (1, 2, 3)]
sensor_names = [f's_{k}' for k in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]


In [None]:
# Extract raw data from ALL source files (FD001-FD004, train and test)
# and combine them into a single unified dataset.
#
# Names this cell defines for later cells:
#   train_files / test_files / rul_files  - sorted lists of matched file paths
#   train_dataframes / test_dataframes / rul_dataframes - one DataFrame per file
#   all_data - all train + test rows, tagged with `dataset_id` and `source_file`
#   all_rul  - all RUL rows, tagged with `dataset_id` (empty frame if no RUL files)

# Raw string on purpose: '\s' inside a plain literal is an invalid escape sequence.
WHITESPACE_SEP = r'\s+'


def _dataset_id(path, prefix):
    """Return the dataset id encoded in a file name.

    Example: ``_dataset_id('train_FD001.txt', 'train_')`` -> ``'FD001'``.
    Any directory part and the file extension are ignored; if the file name
    does not start with ``prefix`` the bare stem is returned unchanged.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return stem[len(prefix):] if stem.startswith(prefix) else stem


def _load_files(files, prefix, names, source_label=None, verbose=True):
    """Read whitespace-delimited, header-less files into tagged DataFrames.

    Parameters
    ----------
    files : list of str
        Paths to read, in the order the frames should be returned.
    prefix : str
        File-name prefix stripped to obtain the ``dataset_id`` tag.
    names : list of str
        Column names assigned to the file's columns.
    source_label : str, optional
        If given, stored in a ``source_file`` column on every frame.
    verbose : bool
        Print one "Loaded ..." line per file.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input path (empty list when ``files`` is empty).
    """
    frames = []
    for path in files:
        frame = pd.read_csv(path, sep=WHITESPACE_SEP, header=None,
                            index_col=False, names=names)
        frame['dataset_id'] = _dataset_id(path, prefix)
        if source_label is not None:
            frame['source_file'] = source_label
        frames.append(frame)
        if verbose:
            print(f"  Loaded {path}: {frame.shape[0]} rows, {frame.shape[1]} columns")
    return frames


# Find all train, test and RUL files in the current working directory
train_files = sorted(glob.glob('train_FD*.txt'))
test_files = sorted(glob.glob('test_FD*.txt'))
rul_files = sorted(glob.glob('RUL_FD*.txt'))

# os.path.basename instead of split('/') so the listing is also right on Windows
for label, files in (('train', train_files), ('test', test_files), ('RUL', rul_files)):
    print(f"Found {len(files)} {label} files: {[os.path.basename(f) for f in files]}")

# Load every train and test file, tagging each row with where it came from
train_dataframes = _load_files(train_files, 'train_', col_names, source_label='train')
test_dataframes = _load_files(test_files, 'test_', col_names, source_label='test')

# Fail with an actionable message instead of pandas' "No objects to concatenate"
if not (train_dataframes or test_dataframes):
    raise FileNotFoundError(
        "No train_FD*.txt or test_FD*.txt files found in "
        f"{os.getcwd()!r}; run the notebook from the directory holding the raw files."
    )

# Combine all data into a single dataframe
all_data = pd.concat(train_dataframes + test_dataframes, ignore_index=True)

# Count engines per (dataset, split, unit number): a plain nunique() on
# unit_number would count a number that recurs in several files only once.
# NOTE(review): assumes the same unit_number in two different files refers to
# two different engines - confirm against the dataset documentation.
engine_keys = all_data[['dataset_id', 'source_file', 'unit_number']].drop_duplicates()

print(f"\n✓ Combined dataset: {all_data.shape[0]} rows, {all_data.shape[1]} columns")
print(f"  Unique engines: {len(engine_keys)}")
print(f"  Datasets: {sorted(all_data['dataset_id'].unique())}")
print(f"  Source files: {sorted(all_data['source_file'].unique())}")

# Load RUL files for reference (optional, for validation purposes)
rul_dataframes = _load_files(rul_files, 'RUL_', ['RUL'], verbose=False)

if rul_dataframes:
    all_rul = pd.concat(rul_dataframes, ignore_index=True)
    print(f"  RUL reference data: {all_rul.shape[0]} rows")
else:
    # Keep `all_rul` defined so later cells never hit a NameError
    all_rul = pd.DataFrame(columns=['RUL', 'dataset_id'])


In [None]:
# Work on an independent deep copy so that later edits to `data`
# leave the freshly extracted `all_data` frame untouched.
data = all_data.copy(deep=True)


In [None]:
# Sanity-check the extraction: overall shape, row counts per dataset and
# per source split, and the first few rows.
print('Shape of the combined dataset:', data.shape)

for heading, key in (
    ('\nBreakdown by dataset:', 'dataset_id'),
    ('\nBreakdown by source file:', 'source_file'),
):
    print(heading)
    row_counts = data.groupby(key).size()
    print(row_counts)

print('\nSample data:')
preview = data.head()
print(preview)


Shape of the train dataset :  (20631, 26)
Shape of the validation dataset :  (13096, 26)
Shape of the RUL validation dataset :  (100, 1)


In [None]:
# Persist the extraction results as CSV for the next stage (Transform).
import os  # not referenced in this cell; kept so the name stays bound for later cells

data.to_csv('data_extracted.csv', index=False)

# The RUL reference is optional: it is written only when RUL files were loaded.
rul_written = bool(rul_dataframes)
if rul_written:
    all_rul.to_csv('rul_extracted.csv', index=False)

status = (
    "Extracted data and RUL reference saved for transformation stage"
    if rul_written
    else "Extracted data saved for transformation stage"
)
print(status)
