# 🛠 Data Wrangling

Clean, standardize, and enrich SpaceX launch data for downstream analysis.



In [None]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
import os



In [None]:
# Verify that both input files exist and are non-empty
# Each CSV that the load step reads is checked here, so a missing or
# zero-byte file is reported before any parsing is attempted.


def file_status(path):
    """Return a one-line status message for the file at *path*.

    The message reports the size in bytes when the file exists and has
    content, and flags the file explicitly when it is missing or empty.
    """
    if not os.path.exists(path):
        return f"{path} not found!"
    size = os.path.getsize(path)
    if size == 0:
        return f"{path} is empty (0 bytes)!"
    return f"{path} size: {size} bytes"


print("Current files:", os.listdir())
for csv_name in ("spacex_launches_sample.csv", "spacex_launch_site_scrape.csv"):
    print(file_status(csv_name))


In [None]:
# 2. Load raw datasets
# Read the launch sample and the scraped launch-site table from the working
# directory, then report how many rows each one contains.
df_launch = pd.read_csv("spacex_launches_sample.csv")
df_scrape = pd.read_csv("spacex_launch_site_scrape.csv")

for label, frame in (("Launch rows:", df_launch), ("Scrape rows:", df_scrape)):
    print(label, len(frame))


In [None]:
# 3. Handle missing values
# Rows without an 'id' are dropped in place; every remaining gap in a
# numeric column is then filled with 0.
df_launch.dropna(subset=['id'], inplace=True)

numeric_cols = df_launch.select_dtypes(include=['number']).columns
filled_numeric = df_launch[numeric_cols].fillna(value=0)
df_launch[numeric_cols] = filled_numeric

print("After missing-value handling:", df_launch.shape)


In [None]:
# 4. Type conversion & unit standardization
# Parse the launch timestamp column into pandas datetimes and, when a
# kilogram payload column is present, derive the same mass in pounds.
KG_TO_LB = 2.20462  # multiplier applied to kilograms to obtain pounds

df_launch['date_utc'] = pd.to_datetime(df_launch['date_utc'])
if 'payload_mass_kg' in df_launch:
    df_launch['payload_mass_lb'] = df_launch['payload_mass_kg'] * KG_TO_LB

print("Date column type:", df_launch['date_utc'].dtype)


In [None]:
# 5. Merge scraped site info
# Left join on 'name': every launch row is kept, and scraped columns are
# attached wherever a matching name exists in the scrape table.
df = pd.merge(df_launch, df_scrape, how='left', on='name')
print("Merged rows:", len(df))


In [None]:
# 6. Feature engineering
# payload_ratio = payload_mass_kg / mass_returned, computed only when both
# columns are present (step 4 already treats payload_mass_kg as optional);
# otherwise the column is filled with NaN.
if {'payload_mass_kg', 'mass_returned'}.issubset(df.columns):
    # Zero denominators (step 3 fills missing numeric values with 0) are
    # masked to NaN so the ratio comes out as NaN instead of inf.
    df['payload_ratio'] = (
        df['payload_mass_kg'] / df['mass_returned'].replace(0, np.nan)
    )
else:
    df['payload_ratio'] = np.nan

# Integer code per distinct site value; pandas assigns -1 to missing sites.
df['site_code'] = df['site'].astype('category').cat.codes

print("Added features: payload_ratio, site_code")


In [None]:
# 7. Scaling & encoding
# Fit a min-max scaler on payload_mass_kg and store the scaled values in a
# new column, then one-hot encode rocket_name with the first level dropped.
scaler = MinMaxScaler()
scaled_payload = scaler.fit_transform(df[['payload_mass_kg']])
df['payload_mass_kg_scaled'] = scaled_payload[:, 0]

df = pd.get_dummies(df, drop_first=True, columns=['rocket_name'])

print("Scaled & one-hot encoded rocket_name")


In [None]:
# 8. Preview cleaned & enriched data
# Report the number of columns, then render the first five rows.
print("Final columns:", df.shape[1])
display(df.head(5))


In [None]:
# 9. Save wrangled data
# Write the final frame to Parquet in the working directory; index=False
# keeps the row index out of the file.
output_path = "spacex_launches_wrangled.parquet"
df.to_parquet(output_path, index=False)
print("Saved spacex_launches_wrangled.parquet")


In [None]:
# 9. Save wrangled data
# NOTE(review): this cell repeats the save step of the preceding cell; it
# writes the same frame to the same Parquet path a second time.
df.to_parquet(path="spacex_launches_wrangled.parquet", index=False)
print("Saved spacex_launches_wrangled.parquet")
