# Load - Save Processed Data

This notebook loads the transformed data and saves it to CSV and Parquet files in the `processed_data` directory for downstream use.


In [None]:
import pandas as pd
import numpy as np
import os
import warnings
# Install a global "ignore" filter: every warning category is suppressed from
# this point on (presumably to keep the cell output uncluttered).
# NOTE(review): this also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')


In [None]:
# Load the transformed datasets (run transform.ipynb first to create them).
# All three CSV files are read from the current working directory.
try:
    train = pd.read_csv('train_transformed.csv')
    valid = pd.read_csv('valid_transformed.csv')
    y_valid = pd.read_csv('y_valid_transformed.csv')
except FileNotFoundError:
    # Only a missing input file means "the transform step has not been run".
    # Print the guidance, then re-raise so execution stops here instead of
    # failing later with a NameError on `train` / `valid` / `y_valid`.
    # Any other error (parse failure, keyboard interrupt, ...) propagates
    # untouched rather than being mislabeled as a missing-file problem.
    print("Please run transform.ipynb first to create transformed data")
    print("Or ensure transformed data files exist in the current directory")
    raise
else:
    # Reached only when all three files were read successfully.
    print("Loaded transformed data from files")


In [None]:
# Write each processed dataset to its own CSV file inside `output_dir`.
output_dir = 'processed_data'
os.makedirs(output_dir, exist_ok=True)  # no error if the directory already exists

# One entry per dataset: (label for the log line, output file name, DataFrame).
for label, csv_name, frame in (
    ("training data", 'train_processed.csv', train),
    ("validation data", 'valid_processed.csv', valid),
    ("RUL validation labels", 'y_valid_processed.csv', y_valid),
):
    # index=False: the DataFrame index is not written out as a column.
    frame.to_csv(os.path.join(output_dir, csv_name), index=False)
    print(f"Saved {label}: {frame.shape}")


In [None]:
# Verify the export: list every CSV in the output directory with its size.
import glob

csv_pattern = os.path.join(output_dir, '*.csv')
saved_files = glob.glob(csv_pattern)
bytes_per_mb = 1024 * 1024

print("\nSaved files:")
for csv_path in saved_files:
    size_mb = os.path.getsize(csv_path) / bytes_per_mb  # bytes -> MB
    file_name = os.path.basename(csv_path)
    print(f"  {file_name}: {size_mb:.2f} MB")


In [None]:
# Optional: also save to Parquet format for better performance.
# DataFrame.to_parquet needs a Parquet engine (pyarrow or fastparquet) and
# raises ImportError when none is installed. Because this step is optional,
# report that it was skipped instead of aborting the notebook run.
try:
    train.to_parquet(os.path.join(output_dir, 'train_processed.parquet'), index=False)
    valid.to_parquet(os.path.join(output_dir, 'valid_processed.parquet'), index=False)
    y_valid.to_parquet(os.path.join(output_dir, 'y_valid_processed.parquet'), index=False)
except ImportError as exc:
    # Missing engine: nothing was written in Parquet format.
    print(f"\nSkipping Parquet export (no Parquet engine available): {exc}")
else:
    # Printed only when all three Parquet files were written.
    print("\nData also saved in Parquet format for efficient storage and loading")


In [None]:
# Final summary: dataset dimensions and output location, framed by divider lines.
divider = "=" * 50
summary_lines = [
    "\n" + divider,
    "LOAD COMPLETE",
    divider,
    f"Training data: {train.shape[0]} rows, {train.shape[1]} columns",
    f"Validation data: {valid.shape[0]} rows, {valid.shape[1]} columns",
    f"RUL labels: {y_valid.shape[0]} rows",
    f"\nAll processed data saved to '{output_dir}' directory",
    divider,
]
for summary_line in summary_lines:
    print(summary_line)
