In [2]:
import boto3
import pandas as pd
import sagemaker

# Default bucket of the current SageMaker session (expected home of the saved features)
sess = sagemaker.Session()
bucket = sess.default_bucket()

print("INSPECTING EXISTING PARQUET FEATURES")
print(f"Bucket: {bucket}")
print("Prefix: aai540-group1/features/")
print()

# Ask S3 for the objects stored under the features prefix.
# NOTE(review): a single list_objects_v2 call returns one page of keys only.
s3 = boto3.client('s3')
response = s3.list_objects_v2(Bucket=bucket, Prefix='aai540-group1/features/')

print("Files found:")
if 'Contents' not in response:
    # S3 omits the 'Contents' key entirely when nothing matches the prefix.
    print("  No files found! Features need to be regenerated.")
else:
    for obj in response['Contents']:
        size_mb = obj['Size'] / 2**20  # bytes -> MB
        print(f"  {obj['Key']} ({size_mb:.2f} MB)")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
INSPECTING EXISTING PARQUET FEATURES
Bucket: sagemaker-us-east-1-786869526001
Prefix: aai540-group1/features/

Files found:
  No files found! Features need to be regenerated.


In [3]:
# Public bucket (where raw data is stored)
PUBLIC_BUCKET = "sagemaker-us-east-1-425709451100"


def _print_listing(listing, empty_message):
    """Print every object in a list_objects_v2 response with its size in MB.

    Args:
        listing: response dict returned by ``s3.list_objects_v2``.
        empty_message: line printed when the response has no 'Contents' key.
    """
    if 'Contents' not in listing:
        print(empty_message)
        return
    for entry in listing['Contents']:
        megabytes = entry['Size'] / (1024 * 1024)
        print(f"  {entry['Key']} ({megabytes:.2f} MB)")


print("Checking PUBLIC bucket for features...")
print(f"Bucket: {PUBLIC_BUCKET}")
print()

# Engineered feature files
response = s3.list_objects_v2(Bucket=PUBLIC_BUCKET, Prefix='aai540-group1/features/')
print("Features folder:")
_print_listing(response, "  No features found in public bucket either.")

print()

# Raw inputs, listed to confirm the source data is still present
print("Raw data folder:")
response_raw = s3.list_objects_v2(Bucket=PUBLIC_BUCKET, Prefix='aai540-group1/data/raw/')
_print_listing(response_raw, "  No raw data found!")

Checking PUBLIC bucket for features...
Bucket: sagemaker-us-east-1-425709451100

Features folder:
  aai540-group1/features/prod_features.parquet (10.81 MB)
  aai540-group1/features/test_features.parquet (10.58 MB)
  aai540-group1/features/train_features.parquet (98.34 MB)
  aai540-group1/features/val_features.parquet (7.64 MB)

Raw data folder:
  aai540-group1/data/raw/airlines.csv (0.00 MB)
  aai540-group1/data/raw/airports.csv (0.02 MB)
  aai540-group1/data/raw/flights.csv (564.96 MB)


In [4]:
import pandas as pd

print("LOADING TRAINING FEATURES")

# The training split is read straight from the public bucket via its s3:// URI.
train_path = f"s3://{PUBLIC_BUCKET}/aai540-group1/features/train_features.parquet"
print(f"Loading: {train_path}")

df_train = pd.read_parquet(train_path)

n_rows, n_cols = df_train.shape
print(f"\nLoaded {n_rows:,} rows × {n_cols} columns")

print("\nSchema:")
print(df_train.dtypes)

print("\nFirst 3 rows:")
print(df_train.head(3))

# Class balance of the binary target, if the column is present.
print("\nTarget distribution:")
if 'DELAYED' not in df_train.columns:
    print("WARNING: DELAYED column not found!")
else:
    target = df_train['DELAYED']
    delay_rate = target.mean() * 100
    print(f"Delayed: {target.eq(1).sum():,} ({delay_rate:.2f}%)")
    print(f"On-time: {target.eq(0).sum():,} ({100-delay_rate:.2f}%)")

# Per-column null counts; only columns with at least one null are shown.
print("\nMissing values:")
missing = df_train.isnull().sum()
if missing.any():
    print(missing[missing > 0])
else:
    print("No missing values")

LOADING TRAINING FEATURES
Loading: s3://sagemaker-us-east-1-425709451100/aai540-group1/features/train_features.parquet

Loaded 4,299,046 rows × 27 columns

Schema:
MONTH                    int64
DAY                      int64
DAY_OF_WEEK              int64
SCHEDULED_DEPARTURE      int64
AIRLINE                 object
ORIGIN_AIRPORT          object
DESTINATION_AIRPORT     object
DISTANCE                 int64
SCHEDULED_TIME         float64
DELAYED                  int64
DEP_HOUR                 int64
HOUR_SIN               float64
HOUR_COS               float64
IS_PEAK_HOUR             int64
IS_WEEKEND               int64
IS_LONG_HAUL             int64
DISTANCE_BUCKET          int64
ROUTE                   object
AIRLINE_DELAY_RATE     float64
ORIGIN_DELAY_RATE      float64
DEST_DELAY_RATE        float64
ROUTE_DELAY_RATE       float64
ORIGIN_FLIGHTS         float64
DEST_FLIGHTS           float64
ROUTE_FLIGHTS          float64
flight_id               object
event_time             float64

In [5]:
print("VERIFYING ALL SPLITS")

# Splits still to be loaded; the train split is expected to already be in `df_train`.
splits = ['val', 'test', 'prod']
datasets = {}

# Column order of the training frame is the reference schema for every other split.
train_columns = list(df_train.columns)

for split in splits:
    path = f"s3://{PUBLIC_BUCKET}/aai540-group1/features/{split}_features.parquet"
    print(f"\nLoading {split}...")
    df = pd.read_parquet(path)
    datasets[split] = df

    delay_rate = df['DELAYED'].mean() * 100
    n_delayed = (df['DELAYED'] == 1).sum()

    print(f"  Rows: {len(df):,}")
    print(f"  Columns: {df.shape[1]}")
    print(f"  Delayed: {n_delayed:,} ({delay_rate:.2f}%)")
    print(f"  Missing values: {df.isnull().sum().sum()}")
    print(f"  Schema matches train: {list(df.columns) == train_columns}")

print("\nSUMMARY")
print(f"\n{'Split':<12} {'Rows':<12} {'Delay Rate':<12} {'Columns':<10} {'Status'}")
print("-" * 60)

all_datasets = {'train': df_train, **datasets}
all_ok = True
for split, df in all_datasets.items():
    delay_rate = df['DELAYED'].mean() * 100

    # A split is OK only if it has no nulls AND the same columns (in order) as train.
    no_missing = df.isnull().sum().sum() == 0
    schema_ok = list(df.columns) == train_columns
    status = "OK" if (no_missing and schema_ok) else "WARNING"
    all_ok = all_ok and status == "OK"

    # Format the number and the percent sign together, then pad, so the '%'
    # stays attached to the value instead of trailing after the padding.
    rate_text = f"{delay_rate:.2f}%"
    print(f"{split:<12} {len(df):<12,} {rate_text:<12} {df.shape[1]:<10} {status}")

# Only claim success when every split actually passed the checks above.
if all_ok:
    print("\nAll splits verified and ready for training!")
else:
    print("\nWARNING: one or more splits failed verification; fix them before training.")

VERIFYING ALL SPLITS

Loading val...
  Rows: 482,878
  Columns: 27
  Delayed: 57,237 (11.85%)
  Missing values: 0
  Schema matches train: True

Loading test...
  Rows: 462,367
  Columns: 27
  Delayed: 67,576 (14.62%)
  Missing values: 0
  Schema matches train: True

Loading prod...
  Rows: 469,717
  Columns: 27
  Delayed: 93,313 (19.87%)
  Missing values: 0
  Schema matches train: True

SUMMARY

Split        Rows         Delay Rate   Columns    Status
------------------------------------------------------------
train        4,299,046    18.73      % 27         OK
val          482,878      11.85      % 27         OK
test         462,367      14.62      % 27         OK
prod         469,717      19.87      % 27         OK

All splits verified and ready for training!


In [6]:
print("PREPARING DATA FOR SAGEMAKER XGBOOST")

# Feature columns from section 6 (20 engineered features - all numeric)
FEATURE_COLS = [
    # Temporal
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'DEP_HOUR', 'SCHEDULED_DEPARTURE',
    'HOUR_SIN', 'HOUR_COS', 'IS_PEAK_HOUR', 'IS_WEEKEND',

    # Distance
    'DISTANCE', 'SCHEDULED_TIME', 'IS_LONG_HAUL', 'DISTANCE_BUCKET',

    # Target-encoded
    'AIRLINE_DELAY_RATE', 'ORIGIN_DELAY_RATE', 'DEST_DELAY_RATE', 'ROUTE_DELAY_RATE',

    # Volume
    'ORIGIN_FLIGHTS', 'DEST_FLIGHTS', 'ROUTE_FLIGHTS'
]

TARGET_COL = 'DELAYED'

print(f"\nUsing {len(FEATURE_COLS)} engineered features (matching section 6)")
print(f"Target: {TARGET_COL}")

# Prepare datasets for SageMaker (target first, no headers)
print("\nExtracting features and target...")

# Only train/val/test are exported; any other split in `datasets` is left out here.
all_data = {'train': df_train, 'val': datasets['val'], 'test': datasets['test']}

# SageMaker format: target column first, then features.
# The column list is the same for every split, so build it once.
cols = [TARGET_COL] + FEATURE_COLS

prepared_data = {}
for split, df in all_data.items():
    # Fail early with a readable message if a split lacks an expected column.
    missing_cols = [c for c in cols if c not in df.columns]
    if missing_cols:
        raise KeyError(f"{split} split is missing required columns: {missing_cols}")

    df_prep = df[cols].copy()

    # The headerless CSV handed to XGBoost must be purely numeric; a stray
    # object/string column would otherwise only surface as a training-job failure.
    non_numeric = [c for c in cols if not pd.api.types.is_numeric_dtype(df_prep[c])]
    if non_numeric:
        raise TypeError(f"{split} split has non-numeric columns: {non_numeric}")

    prepared_data[split] = df_prep

    print(f"  {split:<8} {df_prep.shape[0]:>10,} rows × {df_prep.shape[1]:>2} columns (1 target + {len(FEATURE_COLS)} features)")

print("\nSample data (train, first 3 rows):")
print(prepared_data['train'].head(3))

print("\nData ready for SageMaker XGBoost format (CSV, target first, no headers)")

PREPARING DATA FOR SAGEMAKER XGBOOST

Using 20 engineered features (matching section 6)
Target: DELAYED

Extracting features and target...
  train     4,299,046 rows × 21 columns (1 target + 20 features)
  val         482,878 rows × 21 columns (1 target + 20 features)
  test        462,367 rows × 21 columns (1 target + 20 features)

Sample data (train, first 3 rows):
   DELAYED  MONTH  DAY  DAY_OF_WEEK  DEP_HOUR  SCHEDULED_DEPARTURE  HOUR_SIN  \
0        0      1    1            4         0                    5       0.0   
1        0      1    1            4         0                   10       0.0   
2        0      1    1            4         0                   20       0.0   

   HOUR_COS  IS_PEAK_HOUR  IS_WEEKEND  ...  SCHEDULED_TIME  IS_LONG_HAUL  \
0       1.0             0           0  ...           205.0             0   
1       1.0             0           0  ...           280.0             1   
2       1.0             0           0  ...           286.0             1   

   D

In [8]:
import os

print("UPLOADING DATA TO S3")

# S3 paths (fixed path - no timestamp for idempotency)
prefix = "aai540-group1/training/engineered-baseline"
training_prefix = prefix

print(f"\nBucket: {bucket}")
print(f"Prefix: {training_prefix}")

# Check if data already exists in S3.
# NOTE: existence is the only test; an object that is present but stale
# (e.g. produced from an older feature set) is NOT re-uploaded.
print("\nChecking S3 for existing data...")
s3_paths = {}
upload_needed = {}

# Error codes that mean "object does not exist" on a HEAD request.
NOT_FOUND_CODES = ("404", "NoSuchKey", "NotFound")

for split in ['train', 'val', 'test']:
    s3_key = f"{training_prefix}/{split}/{split}.csv"
    s3_path = f"s3://{bucket}/{s3_key}"
    s3_paths[split] = s3_path

    try:
        s3.head_object(Bucket=bucket, Key=s3_key)
    except s3.exceptions.ClientError as err:
        # Only a genuine "not found" means the object must be uploaded.
        # Anything else (access denied, expired credentials, throttling, ...)
        # is a real problem and must not be mistaken for a missing file.
        error_code = err.response.get("Error", {}).get("Code", "")
        if error_code not in NOT_FOUND_CODES:
            raise
        print(f"  {split:<8} missing")
        upload_needed[split] = True
    else:
        print(f"  {split:<8} exists: {s3_path}")
        upload_needed[split] = False

# Upload only if needed
if any(upload_needed.values()):
    # Save locally as CSV (no headers, no index)
    local_dir = "/tmp/sagemaker_data"
    os.makedirs(local_dir, exist_ok=True)

    print("\nSaving CSV files locally...")
    for split, df in prepared_data.items():
        if upload_needed[split]:
            local_path = f"{local_dir}/{split}.csv"
            df.to_csv(local_path, header=False, index=False)
            size_mb = os.path.getsize(local_path) / (1024 * 1024)
            print(f"  {split:<8} {local_path} ({size_mb:.2f} MB)")

    # Upload to S3 (the local CSVs are left in place after the upload)
    print("\nUploading to S3...")
    for split in ['train', 'val', 'test']:
        if upload_needed[split]:
            local_path = f"{local_dir}/{split}.csv"
            s3_key = f"{training_prefix}/{split}/{split}.csv"
            s3.upload_file(local_path, bucket, s3_key)
            print(f"  {split:<8} -> {s3_paths[split]}")
else:
    print("\nAll data already exists in S3, skipping upload")

print("\nDATA READY")
print(f"\nTraining data:   {s3_paths['train']}")
print(f"Validation data: {s3_paths['val']}")
print(f"Test data:       {s3_paths['test']}")
print("\nReady for SageMaker training!")

UPLOADING DATA TO S3

Bucket: sagemaker-us-east-1-786869526001
Prefix: aai540-group1/training/engineered-baseline

Checking S3 for existing data...
  train    missing
  val      missing
  test     missing

Saving CSV files locally...
  train    /tmp/sagemaker_data/train.csv (842.15 MB)
  val      /tmp/sagemaker_data/val.csv (75.64 MB)
  test     /tmp/sagemaker_data/test.csv (91.01 MB)

Uploading to S3...
  train    -> s3://sagemaker-us-east-1-786869526001/aai540-group1/training/engineered-baseline/train/train.csv
  val      -> s3://sagemaker-us-east-1-786869526001/aai540-group1/training/engineered-baseline/val/val.csv
  test     -> s3://sagemaker-us-east-1-786869526001/aai540-group1/training/engineered-baseline/test/test.csv

DATA READY

Training data:   s3://sagemaker-us-east-1-786869526001/aai540-group1/training/engineered-baseline/train/train.csv
Validation data: s3://sagemaker-us-east-1-786869526001/aai540-group1/training/engineered-baseline/val/val.csv
Test data:       s3://sagema