# Process Train Data

This notebook processes the training data and separates labels from features.

## Steps:
1. Load training data from `data/external/train.parquet`
2. Identify the target column (label)
3. Separate features (X), labels (y), and the customer ID column (if present)
4. Save processed data to `data/processed/`
5. Verify that the saved files can be read back


## Import Libraries


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Treat the parent of the current working directory as the project root
# (assumes the notebook is started from a direct subdirectory — TODO confirm)
current_dir = Path().resolve()
project_root = current_dir.parent

# Prepend the root to the import search path so it is searched first
root_entry = str(project_root)
sys.path.insert(0, root_entry)

print(f"Project root: {root_entry}")


## Load Training Data


In [None]:
# Directory layout used by this notebook: <project_root>/data/{external,processed}
data_dir = project_root.joinpath("data")
external_dir = data_dir.joinpath("external")
processed_dir = data_dir.joinpath("processed")

# Input file read by the loading step
train_path = external_dir.joinpath("train.parquet")

# Report the expected location and whether the file is actually present
input_found = train_path.exists()
print(f"Loading data from: {train_path}")
print(f"File exists: {input_found}")


In [None]:
# Read the whole training set into a DataFrame
print("Loading training data...")
df = pd.read_parquet(train_path)

# Report dimensions and the deep memory footprint (bytes -> 1024**2 units)
total_bytes = df.memory_usage(deep=True).sum()
print(f"Data shape: {df.shape}")
print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")


## Explore Data Structure


In [None]:
# Preview the first five rows (rendered as the cell's output)
df.head(n=5)


In [None]:
# List the column names, truncated to the first 20 to keep the output short
column_names = df.columns.tolist()
n_columns = len(column_names)

print(f"Total columns: {n_columns}")
print(f"\nColumn names:")
print(column_names[:20])  # Show first 20 columns

# Tell the reader how many names were left out, if any
n_hidden = n_columns - 20
if n_hidden > 0:
    print(f"... and {n_hidden} more columns")


In [None]:
# Count how many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("Data types:")
print(dtype_counts)


## Identify Target Column


In [None]:
# Names commonly used for the label column, in order of preference
possible_targets = ['target', 'default', 'label', 'y']

# First candidate that is actually a column of df, or None when none matches
target_col = next(
    (candidate for candidate in possible_targets if candidate in df.columns),
    None,
)

if target_col is not None:
    print(f"Found target column: '{target_col}'")
else:
    print("Could not find common target column names.")
    print("Checking for binary columns...")

    # Fallback: collect every column whose distinct values are limited to 0/1
    zero_one = {0, 1, 0.0, 1.0}
    binary_cols = []
    for name in df.columns:
        distinct = df[name].unique()
        # Check the count first so large value arrays are never turned into a set
        if len(distinct) > 2:
            continue
        if set(distinct) <= zero_one:
            binary_cols.append(name)

    # Accept the fallback only when it is unambiguous
    if len(binary_cols) != 1:
        print(f"Found {len(binary_cols)} binary columns: {binary_cols}")
        print("Please manually specify the target column.")
    else:
        target_col = binary_cols[0]
        print(f"Found binary column as target: '{target_col}'")


In [None]:
# If target column was not automatically identified, specify it here
# target_col = 'target'  # Uncomment and set the correct column name

# Show absolute and relative class frequencies, ordered by class value
if target_col:
    target_values = df[target_col]
    absolute_counts = target_values.value_counts().sort_index()
    relative_counts = target_values.value_counts(normalize=True).sort_index()

    print(f"\nTarget column: '{target_col}'")
    print(f"\nTarget value counts:")
    print(absolute_counts)
    print(f"\nTarget distribution:")
    print(relative_counts)


## Separate Features and Labels


In [None]:
# Look for an identifier column under a few known names, in priority order;
# id_col stays None when none of them is present
id_candidates = ('customer_ID', 'id', 'customer_id')
id_col = next((name for name in id_candidates if name in df.columns), None)

print(f"Found ID column: '{id_col}'" if id_col else "No ID column found")


In [None]:
# Fail early with an actionable message: without this guard an unresolved
# target (None) surfaces as an opaque `KeyError: None` from df[target_col].
if target_col is None or target_col not in df.columns:
    raise ValueError(
        f"Target column {target_col!r} is not set or not present in the data. "
        "Set `target_col` manually in the cell above and re-run."
    )

# Separate target (labels)
y = df[target_col].copy()

# Separate features (exclude target and ID columns)
exclude_cols = [target_col]
if id_col:
    exclude_cols.append(id_col)

# Explicit copy so X never shares memory with df
X = df.drop(columns=exclude_cols).copy()

# Store IDs separately if they exist; None signals "no ID column" downstream
ids = df[id_col].copy() if id_col else None

print(f"Features (X) shape: {X.shape}")
print(f"Labels (y) shape: {y.shape}")
if ids is not None:
    print(f"IDs shape: {ids.shape}")


## Data Summary


In [None]:
# Per-column missing-value counts, computed once: every call to
# X.isnull().sum() scans all of X, so the result is reused below.
missing_per_column = X.isnull().sum()
total_missing = missing_per_column.sum()

print("=" * 70)
print("FEATURES SUMMARY")
print("=" * 70)
print(f"\nShape: {X.shape}")
print(f"Columns: {len(X.columns)}")
print(f"Memory usage: {X.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"\nData types:")
print(X.dtypes.value_counts())
print(f"\nMissing values: {total_missing}")
if total_missing > 0:
    # Number of columns that contain at least one missing value
    print(f"Columns with missing values: {int((missing_per_column > 0).sum())}")


In [None]:
# Overview of the label series: size, class counts, class shares, dtype
banner = "=" * 70
label_counts = y.value_counts().sort_index()
label_shares = y.value_counts(normalize=True).sort_index()

print(banner)
print("LABELS SUMMARY")
print(banner)
print(f"\nShape: {y.shape}")
print(f"\nValue counts:")
print(label_counts)
print(f"\nClass distribution:")
print(label_shares)
print(f"\nData type: {y.dtype}")


In [None]:
# Overview of the ID series; skipped entirely when no ID column was found
if ids is not None:
    id_banner = "=" * 70
    n_unique_ids = ids.nunique()

    print(id_banner)
    print("CUSTOMER IDs SUMMARY")
    print(id_banner)
    print(f"\nShape: {ids.shape}")
    print(f"Unique IDs: {n_unique_ids}")
    print(f"Data type: {ids.dtype}")


## Save Processed Data


In [None]:
# Ensure the output directory (and any missing parents) exists; an already
# existing directory is not an error
processed_dir.mkdir(exist_ok=True, parents=True)

print(f"Saving processed data to: {processed_dir}")


In [None]:
# Write the feature matrix as snappy-compressed parquet, without the index
features_path = processed_dir.joinpath("X_train.parquet")
X.to_parquet(features_path, compression='snappy', index=False)

# Report the on-disk size of the written file
features_size_mb = features_path.stat().st_size / 1024**2
print(f"✓ Features saved: {features_path}")
print(f"  Shape: {X.shape}")
print(f"  Size: {features_size_mb:.2f} MB")


In [None]:
# Parquet needs a table, so wrap the label series in a one-column frame
labels_path = processed_dir.joinpath("y_train.parquet")
labels_frame = y.to_frame()
labels_frame.to_parquet(labels_path, compression='snappy', index=False)

# Report the on-disk size of the written file
labels_size_mb = labels_path.stat().st_size / 1024**2
print(f"✓ Labels saved: {labels_path}")
print(f"  Shape: {y.shape}")
print(f"  Size: {labels_size_mb:.2f} MB")


In [None]:
# Write the IDs as a one-column parquet file; nothing is written (and
# ids_path is not defined) when no ID column was found
if ids is not None:
    ids_path = processed_dir.joinpath("train_ids.parquet")
    ids_frame = ids.to_frame()
    ids_frame.to_parquet(ids_path, compression='snappy', index=False)

    ids_size_mb = ids_path.stat().st_size / 1024**2
    print(f"✓ IDs saved: {ids_path}")
    print(f"  Shape: {ids.shape}")
    print(f"  Size: {ids_size_mb:.2f} MB")


## Verification


In [None]:
# Verify saved files can be loaded
print("Verifying saved files...")

X_loaded = pd.read_parquet(features_path)
y_loaded = pd.read_parquet(labels_path)

print(f"✓ Features loaded: {X_loaded.shape}")
print(f"✓ Labels loaded: {y_loaded.shape}")

# Explicit raises rather than `assert` statements, so the checks still run
# when Python is started with -O; the exception type and messages are the
# same as an assert would produce.
if X_loaded.shape != X.shape:
    raise AssertionError("Features shape mismatch!")
# Labels were saved as a one-column frame, so only the row count is compared
if y_loaded.shape[0] != y.shape[0]:
    raise AssertionError("Labels shape mismatch!")

# The IDs file is only written when an ID column was found
if ids is not None:
    ids_loaded = pd.read_parquet(ids_path)
    print(f"✓ IDs loaded: {ids_loaded.shape}")
    if ids_loaded.shape[0] != ids.shape[0]:
        raise AssertionError("IDs shape mismatch!")

print("\n✓ All verifications passed!")


## Summary

The training data has been successfully processed:
- Features (X) saved to: `data/processed/X_train.parquet`
- Labels (y) saved to: `data/processed/y_train.parquet`
- Customer IDs (if available) saved to: `data/processed/train_ids.parquet`

You can now use these processed files for model training.
