In [1]:
# ============================================================================
# DATA PREPROCESSING: Preparing Features for Machine Learning
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

%matplotlib inline

# Set display options
pd.set_option('display.max_columns', None)

print("="*80)
print("DATA PREPROCESSING: MACHINE LEARNING PREPARATION")
print("="*80)
print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)

# ========== LOAD PROCESSED DATASET ==========

print("\n Loading processed dataset with features...")

crashes = pd.read_csv(r'D:\Nairobi-Accident-Severity\data\processed\crashes_with_features.csv')

print(f" Dataset loaded successfully!")
print(f"   Shape: {crashes.shape}")
print(f"   Records: {len(crashes):,}")
print(f"   Features: {len(crashes.columns)}")

# ========== VERIFY DATA QUALITY ==========

print("\n Data Quality Check:")
print(f"   Missing values: {crashes.isnull().sum().sum()}")
print(f"   Duplicate rows: {crashes.duplicated().sum()}")

print("\n Target variable distribution:")
print(crashes['severity'].value_counts())

print("\n" + "="*80)
print(" DATA LOADED - READY FOR PREPROCESSING")
print("="*80)

DATA PREPROCESSING: MACHINE LEARNING PREPARATION
Start Time: 2026-01-28 16:36:53

 Loading processed dataset with features...
 Dataset loaded successfully!
   Shape: (31064, 36)
   Records: 31,064
   Features: 36

 Data Quality Check:
   Missing values: 0
   Duplicate rows: 0

 Target variable distribution:
severity
MINOR       25059
FATAL        2284
MODERATE     2121
SEVERE       1600
Name: count, dtype: int64

 DATA LOADED - READY FOR PREPROCESSING


In [2]:
# ============================================================================
# FEATURE TYPE ANALYSIS
# ============================================================================

print("="*80)
print("ANALYZING FEATURE TYPES FOR PREPROCESSING")
print("="*80)

# ========== IDENTIFY FEATURE TYPES ==========

print("\n Analyzing all features...")

# Display data types
print("\n Data types:")
print(crashes.dtypes)

# ========== CATEGORIZE FEATURES ==========

print("\n" + "="*80)
print("FEATURE CATEGORIZATION")
print("="*80)

# Features to DROP (not needed for ML)
drop_features = [
    'crash_id',           # Unique identifier (not predictive)
    'crash_datetime',     # Already extracted as features
    'crash_date',         # Already extracted as features
    'day_name',           # Redundant with day_of_week
    'month_name',         # Redundant with month
    'lat_grid',           # Redundant with location_grid
    'lon_grid',           # Redundant with location_grid
    'location_grid'       # Already captured in aggregated features
]

# TARGET variable
target = 'severity'

# CATEGORICAL features (need encoding)
categorical_features = [
    'time_of_day',        # Morning/Afternoon/Evening/Night
    'distance_category',  # 0-5km, 5-10km, etc.
    'frequency_category', # Isolated/Low/Moderate/High
    'location_risk'       # Low/Moderate/High/Very High Risk
]

# NUMERICAL features (need normalization)
numerical_features = [
    'latitude',
    'longitude',
    'n_crash_reports',
    'hour',
    'day_of_week',
    'month',
    'year',
    'distance_from_center_km',
    'crashes_at_location',
    'severity_numeric',
    'avg_severity_at_location',
    'max_severity_at_location',
    'fatal_rate_at_location',
    'pedestrian_rate_at_location'
]

# BOOLEAN features (already 0/1, no processing needed)
boolean_features = [
    'contains_fatality_words',
    'contains_pedestrian_words',
    'contains_matatu_words',
    'contains_motorcycle_words',
    'is_morning_rush',
    'is_evening_rush',
    'is_rush_hour',
    'is_weekend',
    'is_hotspot'
]

# ========== DISPLAY CATEGORIZATION ==========

print("\n Feature Categories:")
print(f"\n   Features to DROP: {len(drop_features)}")
for f in drop_features:
    print(f"      â€¢ {f}")

print(f"\n   TARGET variable: 1")
print(f"      â€¢ {target}")

print(f"\n   CATEGORICAL features (need encoding): {len(categorical_features)}")
for f in categorical_features:
    print(f"      â€¢ {f}")

print(f"\n   NUMERICAL features (need normalization): {len(numerical_features)}")
for f in numerical_features:
    print(f"      â€¢ {f}")

print(f"\n   BOOLEAN features (already 0/1): {len(boolean_features)}")
for f in boolean_features:
    print(f"      â€¢ {f}")

# ========== VERIFY COUNTS ==========

total_features = (
    len(drop_features) + 
    1 +  # target
    len(categorical_features) + 
    len(numerical_features) + 
    len(boolean_features)
)

print(f"\n Total features accounted for: {total_features}")
print(f"   Original features in dataset: {len(crashes.columns)}")

if total_features == len(crashes.columns):
    print("    All features categorized!")
else:
    print(f"    Missing {len(crashes.columns) - total_features} features")

print("\n" + "="*80)
print(" FEATURE ANALYSIS COMPLETE")
print("="*80)

ANALYZING FEATURE TYPES FOR PREPROCESSING

 Analyzing all features...

 Data types:
crash_id                         int64
crash_datetime                  object
crash_date                      object
latitude                       float64
longitude                      float64
n_crash_reports                  int64
contains_fatality_words          int64
contains_pedestrian_words        int64
contains_matatu_words            int64
contains_motorcycle_words        int64
severity                        object
hour                             int64
day_of_week                      int64
day_name                        object
month                            int64
month_name                      object
year                             int64
is_morning_rush                   bool
is_evening_rush                   bool
is_rush_hour                      bool
is_weekend                        bool
time_of_day                     object
distance_from_center_km        float64
distance_category  

In [3]:
# ============================================================================
# DROP UNNECESSARY FEATURES
# ============================================================================

print("="*80)
print("DROPPING UNNECESSARY FEATURES")
print("="*80)

print(f"\n Original dataset shape: {crashes.shape}")

# ========== DROP FEATURES ==========

print(f"\n Dropping {len(drop_features)} unnecessary features...")

crashes_clean = crashes.drop(columns=drop_features)

print(f" Features dropped!")
print(f"\n Cleaned dataset shape: {crashes_clean.shape}")
print(f"   Rows: {crashes_clean.shape[0]:,}")
print(f"   Columns: {crashes_clean.shape[1]}")

# ========== VERIFY REMAINING FEATURES ==========

print(f"\n Remaining features ({len(crashes_clean.columns)}):")
for i, col in enumerate(crashes_clean.columns, 1):
    print(f"   {i}. {col}")

# ========== SEPARATE FEATURES AND TARGET ==========

print("\n Separating features (X) and target (y)...")

X = crashes_clean.drop(columns=['severity'])
y = crashes_clean['severity']

print(f"\n Separation complete!")
print(f"   Features (X): {X.shape}")
print(f"   Target (y): {y.shape}")

print(f"\n Target distribution:")
print(y.value_counts())
print(f"\nTarget percentages:")
print(y.value_counts(normalize=True).mul(100).round(2))

print("\n" + "="*80)
print(" DATASET PREPARED FOR ENCODING")
print("="*80)
print(f"\n   Total features ready: {X.shape[1]}")
print(f"   â€¢ Categorical (to encode): {len(categorical_features)}")
print(f"   â€¢ Numerical (to normalize): {len(numerical_features)}")
print(f"   â€¢ Boolean (ready): {len(boolean_features)}")

DROPPING UNNECESSARY FEATURES

 Original dataset shape: (31064, 36)

 Dropping 8 unnecessary features...
 Features dropped!

 Cleaned dataset shape: (31064, 28)
   Rows: 31,064
   Columns: 28

 Remaining features (28):
   1. latitude
   2. longitude
   3. n_crash_reports
   4. contains_fatality_words
   5. contains_pedestrian_words
   6. contains_matatu_words
   7. contains_motorcycle_words
   8. severity
   9. hour
   10. day_of_week
   11. month
   12. year
   13. is_morning_rush
   14. is_evening_rush
   15. is_rush_hour
   16. is_weekend
   17. time_of_day
   18. distance_from_center_km
   19. distance_category
   20. crashes_at_location
   21. is_hotspot
   22. frequency_category
   23. severity_numeric
   24. avg_severity_at_location
   25. max_severity_at_location
   26. fatal_rate_at_location
   27. pedestrian_rate_at_location
   28. location_risk

 Separating features (X) and target (y)...

 Separation complete!
   Features (X): (31064, 27)
   Target (y): (31064,)

 Target dis

In [4]:
# ============================================================================
# ENCODE CATEGORICAL FEATURES
# ============================================================================

print("="*80)
print("ENCODING CATEGORICAL FEATURES")
print("="*80)

from sklearn.preprocessing import LabelEncoder

# ========== LABEL ENCODE TARGET VARIABLE ==========

print("\n Encoding target variable (severity)...")

# Create label encoder for target
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

print(f" Target encoded!")
print(f"\n   Label mapping:")
for i, label in enumerate(le_target.classes_):
    print(f"      {label} â†’ {i}")

print(f"\n   Encoded target distribution:")
unique, counts = np.unique(y_encoded, return_counts=True)
for label_num, count in zip(unique, counts):
    label_name = le_target.classes_[label_num]
    print(f"      {label_num} ({label_name}): {count:,}")

# ========== ONE-HOT ENCODE CATEGORICAL FEATURES ==========

print(f"\n One-hot encoding categorical features...")
print(f"   Features to encode: {categorical_features}")

# Copy X to avoid modifying original
X_encoded = X.copy()

# One-hot encode each categorical feature
encoded_dfs = []
for feature in categorical_features:
    print(f"\n   Encoding: {feature}")
    
    # Get unique values
    unique_vals = X_encoded[feature].unique()
    print(f"      Categories: {list(unique_vals)}")
    
    # One-hot encode
    encoded = pd.get_dummies(X_encoded[feature], prefix=feature, drop_first=False)
    encoded_dfs.append(encoded)
    
    print(f"      Created {len(encoded.columns)} dummy columns")

# Concatenate all encoded features
encoded_features = pd.concat(encoded_dfs, axis=1)

print(f"\n One-hot encoding complete!")
print(f"   Total dummy columns created: {len(encoded_features.columns)}")

# ========== DROP ORIGINAL CATEGORICAL COLUMNS ==========

print(f"\n Dropping original categorical columns...")

X_encoded = X_encoded.drop(columns=categorical_features)

print(f" Original categorical columns dropped!")
print(f"   Remaining columns: {X_encoded.shape[1]}")

# ========== COMBINE ENCODED FEATURES ==========

print(f"\n Combining encoded features with numerical/boolean features...")

X_final = pd.concat([X_encoded, encoded_features], axis=1)

print(f" Final feature matrix created!")
print(f"   Shape: {X_final.shape}")
print(f"   Features: {X_final.shape[1]}")

# ========== DISPLAY SAMPLE ==========

print(f"\n Sample of encoded dataset (first 5 rows, first 15 columns):")
print(X_final.iloc[:5, :15])

print("\n" + "="*80)
print(" CATEGORICAL ENCODING COMPLETE")
print("="*80)
print(f"\n   Original features: 27")
print(f"   Final features (after encoding): {X_final.shape[1]}")
print(f"   Features added: {X_final.shape[1] - 27}")

ENCODING CATEGORICAL FEATURES

 Encoding target variable (severity)...
 Target encoded!

   Label mapping:
      FATAL â†’ 0
      MINOR â†’ 1
      MODERATE â†’ 2
      SEVERE â†’ 3

   Encoded target distribution:
      0 (FATAL): 2,284
      1 (MINOR): 25,059
      2 (MODERATE): 2,121
      3 (SEVERE): 1,600

 One-hot encoding categorical features...
   Features to encode: ['time_of_day', 'distance_category', 'frequency_category', 'location_risk']

   Encoding: time_of_day
      Categories: ['Evening', 'Morning', 'Night', 'Afternoon']
      Created 4 dummy columns

   Encoding: distance_category
      Categories: ['5-10km', '20+km', '0-5km', '10-15km', '15-20km']
      Created 5 dummy columns

   Encoding: frequency_category
      Categories: ['High', 'Low', 'Moderate', 'Isolated']
      Created 4 dummy columns

   Encoding: location_risk
      Categories: ['Low Risk', 'Very High Risk', 'Moderate Risk', 'High Risk']
      Created 4 dummy columns

 One-hot encoding complete!
   Total

In [5]:
# ============================================================================
# NORMALIZE NUMERICAL FEATURES
# ============================================================================

print("="*80)
print("NORMALIZING NUMERICAL FEATURES")
print("="*80)

from sklearn.preprocessing import StandardScaler

# ========== IDENTIFY NUMERICAL COLUMNS IN FINAL DATASET ==========

print("\n Identifying numerical features in final dataset...")

# Original numerical features (that still exist after encoding)
numerical_cols = [col for col in numerical_features if col in X_final.columns]

print(f"\n   Numerical features to normalize: {len(numerical_cols)}")
for col in numerical_cols:
    print(f"      â€¢ {col}")

# ========== CHECK FEATURE DISTRIBUTIONS BEFORE NORMALIZATION ==========

print("\n Sample statistics BEFORE normalization:")
print(X_final[numerical_cols].describe().iloc[:3])  # Show mean, std, min

# ========== APPLY STANDARDIZATION ==========

print(f"\n Applying StandardScaler (zero mean, unit variance)...")

scaler = StandardScaler()
X_final[numerical_cols] = scaler.fit_transform(X_final[numerical_cols])

print(f" Normalization complete!")

# ========== VERIFY NORMALIZATION ==========

print("\n Sample statistics AFTER normalization:")
print(X_final[numerical_cols].describe().iloc[:3])  # Show mean, std, min

print("\n Verification:")
print(f"   Mean values should be ~0: {X_final[numerical_cols].mean().mean():.6f}")
print(f"   Std values should be ~1: {X_final[numerical_cols].std().mean():.6f}")

# ========== DISPLAY SAMPLE ==========

print(f"\n Sample of normalized dataset (first 5 rows, numerical features only):")
print(X_final[numerical_cols].head())

print("\n" + "="*80)
print(" FEATURE NORMALIZATION COMPLETE")
print("="*80)
print(f"\n   Total features in final dataset: {X_final.shape[1]}")
print(f"   Numerical features normalized: {len(numerical_cols)}")
print(f"   Boolean features (unchanged): {len(boolean_features)}")
print(f"   Encoded categorical features: 17")

NORMALIZING NUMERICAL FEATURES

 Identifying numerical features in final dataset...

   Numerical features to normalize: 14
      â€¢ latitude
      â€¢ longitude
      â€¢ n_crash_reports
      â€¢ hour
      â€¢ day_of_week
      â€¢ month
      â€¢ year
      â€¢ distance_from_center_km
      â€¢ crashes_at_location
      â€¢ severity_numeric
      â€¢ avg_severity_at_location
      â€¢ max_severity_at_location
      â€¢ fatal_rate_at_location
      â€¢ pedestrian_rate_at_location

 Sample statistics BEFORE normalization:
           latitude     longitude  n_crash_reports          hour  \
count  31064.000000  31064.000000     31064.000000  31064.000000   
mean      -1.272481     36.852499         1.400914     12.935746   
std        0.118961      0.113650         1.486540      5.525065   

        day_of_week         month          year  distance_from_center_km  \
count  31064.000000  31064.000000  31064.000000             31064.000000   
mean       2.854751      6.509464   2016.998

In [7]:
# ============================================================================
# SPLIT DATA INTO TRAIN/VALIDATION/TEST SETS
# ============================================================================

print("="*80)
print("SPLITTING DATA: TRAIN / VALIDATION / TEST")
print("="*80)

from sklearn.model_selection import train_test_split

# ========== SPLIT STRATEGY ==========

print("\n Split strategy:")
print("   â€¢ Training set: 70% (for model training)")
print("   â€¢ Validation set: 15% (for hyperparameter tuning)")
print("   â€¢ Test set: 15% (for final evaluation)")

# ========== FIRST SPLIT: TRAIN vs (VALIDATION + TEST) ==========

print("\n Step 1: Splitting train (70%) vs temp (30%)...")

X_train, X_temp, y_train, y_temp = train_test_split(
    X_final, 
    y_encoded, 
    test_size=0.30,  # 30% for validation + test
    random_state=42,
    stratify=y_encoded  # Maintain class distribution
)

print(f" First split complete!")
print(f"   Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X_final)*100:.1f}%)")
print(f"   Temp: {X_temp.shape[0]:,} samples ({X_temp.shape[0]/len(X_final)*100:.1f}%)")

# ========== SECOND SPLIT: VALIDATION vs TEST ==========

print("\n Step 2: Splitting temp into validation (15%) and test (15%)...")

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,  # Split temp equally into validation and test
    random_state=42,
    stratify=y_temp  # Maintain class distribution
)

print(f" Second split complete!")
print(f"   Validation: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X_final)*100:.1f}%)")
print(f"   Test: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X_final)*100:.1f}%)")

# ========== VERIFY SPLITS ==========

print("\n" + "="*80)
print(" FINAL DATASET SPLITS")
print("="*80)

print(f"\n Dataset sizes:")
print(f"   Total samples: {len(X_final):,}")
print(f"   Training: {len(X_train):,} ({len(X_train)/len(X_final)*100:.1f}%)")
print(f"   Validation: {len(X_val):,} ({len(X_val)/len(X_final)*100:.1f}%)")
print(f"   Test: {len(X_test):,} ({len(X_test)/len(X_final)*100:.1f}%)")

print(f"\n Class distribution in each set:")

# Train set
print("\n   TRAINING SET:")
train_dist = pd.Series(y_train).value_counts().sort_index()
for class_num, count in train_dist.items():
    class_name = le_target.classes_[class_num]
    pct = count / len(y_train) * 100
    print(f"      {class_num} ({class_name}): {count:,} ({pct:.2f}%)")

# Validation set
print("\n   VALIDATION SET:")
val_dist = pd.Series(y_val).value_counts().sort_index()
for class_num, count in val_dist.items():
    class_name = le_target.classes_[class_num]
    pct = count / len(y_val) * 100
    print(f"      {class_num} ({class_name}): {count:,} ({pct:.2f}%)")

# Test set
print("\n   TEST SET:")
test_dist = pd.Series(y_test).value_counts().sort_index()
for class_num, count in test_dist.items():
    class_name = le_target.classes_[class_num]
    pct = count / len(y_test) * 100
    print(f"      {class_num} ({class_name}): {count:,} ({pct:.2f}%)")

print("\n Class distributions are balanced across all sets!")

print("\n" + "="*80)
print(" DATA SPLITTING COMPLETE")
print("="*80)

SPLITTING DATA: TRAIN / VALIDATION / TEST

 Split strategy:
   â€¢ Training set: 70% (for model training)
   â€¢ Validation set: 15% (for hyperparameter tuning)
   â€¢ Test set: 15% (for final evaluation)

 Step 1: Splitting train (70%) vs temp (30%)...
 First split complete!
   Train: 21,744 samples (70.0%)
   Temp: 9,320 samples (30.0%)

 Step 2: Splitting temp into validation (15%) and test (15%)...
 Second split complete!
   Validation: 4,660 samples (15.0%)
   Test: 4,660 samples (15.0%)

 FINAL DATASET SPLITS

 Dataset sizes:
   Total samples: 31,064
   Training: 21,744 (70.0%)
   Validation: 4,660 (15.0%)
   Test: 4,660 (15.0%)

 Class distribution in each set:

   TRAINING SET:
      0 (FATAL): 1,599 (7.35%)
      1 (MINOR): 17,541 (80.67%)
      2 (MODERATE): 1,484 (6.82%)
      3 (SEVERE): 1,120 (5.15%)

   VALIDATION SET:
      0 (FATAL): 342 (7.34%)
      1 (MINOR): 3,759 (80.67%)
      2 (MODERATE): 319 (6.85%)
      3 (SEVERE): 240 (5.15%)

   TEST SET:
      0 (FATAL): 3

In [8]:
# ============================================================================
# CLASS IMBALANCE ANALYSIS
# ============================================================================

print("="*80)
print("CLASS IMBALANCE ANALYSIS")
print("="*80)

# ========== CALCULATE IMBALANCE RATIOS ==========

print("\n Analyzing class imbalance in training set...")

train_counts = pd.Series(y_train).value_counts().sort_index()
majority_class_count = train_counts.max()
minority_class_count = train_counts.min()

print(f"\n   Majority class (MINOR): {majority_class_count:,} samples")
print(f"   Minority class (SEVERE): {minority_class_count:,} samples")
print(f"   Imbalance ratio: {majority_class_count / minority_class_count:.2f}:1")

# ========== IMBALANCE BY CLASS ==========

print("\n Imbalance ratios for each class:")
for class_num in sorted(train_counts.index):
    class_name = le_target.classes_[class_num]
    count = train_counts[class_num]
    ratio = majority_class_count / count
    print(f"   {class_name}: {ratio:.2f}:1 (majority to this class)")

# ========== DECISION ON SMOTE ==========

print("\n" + "="*80)
print("SMOTE DECISION")
print("="*80)

imbalance_ratio = majority_class_count / minority_class_count

print(f"\n Should we apply SMOTE?")
print(f"\n   Imbalance ratio: {imbalance_ratio:.2f}:1")
print(f"\n   Guidelines:")
print(f"      â€¢ Ratio < 3:1  â†’ SMOTE not needed (balanced)")
print(f"      â€¢ Ratio 3-10:1 â†’ SMOTE recommended")
print(f"      â€¢ Ratio > 10:1 â†’ SMOTE highly recommended")

if imbalance_ratio < 3:
    decision = "NOT NEEDED"
    color = "ðŸŸ¢"
    reason = "Dataset is relatively balanced"
elif imbalance_ratio <= 10:
    decision = "RECOMMENDED"
    color = "ðŸŸ¡"
    reason = "Moderate imbalance exists"
else:
    decision = "HIGHLY RECOMMENDED"
    color = "ðŸ”´"
    reason = "Severe imbalance exists"

print(f"\n   {color} DECISION: SMOTE {decision}")
print(f"   Reason: {reason}")

# ========== ADDITIONAL CONSIDERATIONS ==========

print(f"\n Additional considerations:")
print(f"   â€¢ Minority class (SEVERE) has {minority_class_count:,} samples")
print(f"   â€¢ This is {'sufficient' if minority_class_count >= 500 else 'limited'} for training")
print(f"   â€¢ Class weights can be used as alternative to SMOTE")
print(f"   â€¢ We'll compare both approaches in model training")

print("\n" + "="*80)
print(" IMBALANCE ANALYSIS COMPLETE")
print("="*80)

CLASS IMBALANCE ANALYSIS

 Analyzing class imbalance in training set...

   Majority class (MINOR): 17,541 samples
   Minority class (SEVERE): 1,120 samples
   Imbalance ratio: 15.66:1

 Imbalance ratios for each class:
   FATAL: 10.97:1 (majority to this class)
   MINOR: 1.00:1 (majority to this class)
   MODERATE: 11.82:1 (majority to this class)
   SEVERE: 15.66:1 (majority to this class)

SMOTE DECISION

 Should we apply SMOTE?

   Imbalance ratio: 15.66:1

   Guidelines:
      â€¢ Ratio < 3:1  â†’ SMOTE not needed (balanced)
      â€¢ Ratio 3-10:1 â†’ SMOTE recommended
      â€¢ Ratio > 10:1 â†’ SMOTE highly recommended

   ðŸ”´ DECISION: SMOTE HIGHLY RECOMMENDED
   Reason: Severe imbalance exists

 Additional considerations:
   â€¢ Minority class (SEVERE) has 1,120 samples
   â€¢ This is sufficient for training
   â€¢ Class weights can be used as alternative to SMOTE
   â€¢ We'll compare both approaches in model training

 IMBALANCE ANALYSIS COMPLETE


In [9]:
# ============================================================================
# APPLY SMOTE TO TRAINING SET
# ============================================================================

print("="*80)
print("APPLYING SMOTE (SYNTHETIC MINORITY OVER-SAMPLING)")
print("="*80)

from imblearn.over_sampling import SMOTE

# ========== BEFORE SMOTE ==========

print("\n Class distribution BEFORE SMOTE:")
print(f"   Total training samples: {len(y_train):,}")
for class_num in sorted(train_counts.index):
    class_name = le_target.classes_[class_num]
    count = train_counts[class_num]
    pct = count / len(y_train) * 100
    print(f"   {class_name}: {count:,} ({pct:.2f}%)")

# ========== APPLY SMOTE ==========

print(f"\n Applying SMOTE...")
print(f"   Strategy: Balance all classes to majority class size")
print(f"   Target: ~17,541 samples per class")

smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f" SMOTE applied successfully!")

# ========== AFTER SMOTE ==========

print(f"\n Class distribution AFTER SMOTE:")
print(f"   Total training samples: {len(y_train_smote):,}")

train_smote_counts = pd.Series(y_train_smote).value_counts().sort_index()
for class_num in sorted(train_smote_counts.index):
    class_name = le_target.classes_[class_num]
    count = train_smote_counts[class_num]
    pct = count / len(y_train_smote) * 100
    print(f"   {class_name}: {count:,} ({pct:.2f}%)")

# ========== COMPARISON ==========

print(f"\n Comparison:")
print(f"   Training samples BEFORE SMOTE: {len(y_train):,}")
print(f"   Training samples AFTER SMOTE: {len(y_train_smote):,}")
print(f"   Synthetic samples created: {len(y_train_smote) - len(y_train):,}")

print(f"\n Result: All classes now balanced!")

# ========== IMPORTANT NOTE ==========

print("\n" + "="*80)
print("  IMPORTANT: SMOTE ONLY APPLIED TO TRAINING SET")
print("="*80)

print("\n    Training set: SMOTE applied (balanced)")
print(f"      â€¢ Samples: {len(X_train_smote):,}")
print(f"      â€¢ Classes: Balanced (25% each)")

print("\n    Validation set: Original (imbalanced)")
print(f"      â€¢ Samples: {len(X_val):,}")
print(f"      â€¢ Classes: Original distribution (80.67% MINOR)")

print("\n    Test set: Original (imbalanced)")
print(f"      â€¢ Samples: {len(X_test):,}")
print(f"      â€¢ Classes: Original distribution (80.67% MINOR)")

print("\n    Why?")
print("      â€¢ Train on balanced data â†’ model learns all classes equally")
print("      â€¢ Evaluate on real data â†’ assess real-world performance")

print("\n" + "="*80)
print(" SMOTE APPLICATION COMPLETE")
print("="*80)

APPLYING SMOTE (SYNTHETIC MINORITY OVER-SAMPLING)

 Class distribution BEFORE SMOTE:
   Total training samples: 21,744
   FATAL: 1,599 (7.35%)
   MINOR: 17,541 (80.67%)
   MODERATE: 1,484 (6.82%)
   SEVERE: 1,120 (5.15%)

 Applying SMOTE...
   Strategy: Balance all classes to majority class size
   Target: ~17,541 samples per class
 SMOTE applied successfully!

 Class distribution AFTER SMOTE:
   Total training samples: 70,164
   FATAL: 17,541 (25.00%)
   MINOR: 17,541 (25.00%)
   MODERATE: 17,541 (25.00%)
   SEVERE: 17,541 (25.00%)

 Comparison:
   Training samples BEFORE SMOTE: 21,744
   Training samples AFTER SMOTE: 70,164
   Synthetic samples created: 48,420

 Result: All classes now balanced!

  IMPORTANT: SMOTE ONLY APPLIED TO TRAINING SET

    Training set: SMOTE applied (balanced)
      â€¢ Samples: 70,164
      â€¢ Classes: Balanced (25% each)

    Validation set: Original (imbalanced)
      â€¢ Samples: 4,660
      â€¢ Classes: Original distribution (80.67% MINOR)

    Test s

In [10]:
# ============================================================================
# SAVE PREPROCESSED DATASETS
# ============================================================================

print("="*80)
print("SAVING PREPROCESSED DATASETS")
print("="*80)

import pickle
import os

# Create directory if it doesn't exist
os.makedirs(r'D:\Nairobi-Accident-Severity\data\processed', exist_ok=True)

# ========== SAVE TRAINING DATA (WITH SMOTE) ==========

print("\n Saving training data (with SMOTE)...")

# Save as CSV
pd.DataFrame(X_train_smote, columns=X_final.columns).to_csv(
    r'D:\Nairobi-Accident-Severity\data\processed\X_train_smote.csv',
    index=False
)
pd.DataFrame(y_train_smote, columns=['severity_encoded']).to_csv(
    r'D:\Nairobi-Accident-Severity\data\processed\y_train_smote.csv',
    index=False
)

# Save as pickle (faster for ML)
with open(r'D:\Nairobi-Accident-Severity\data\processed\X_train_smote.pkl', 'wb') as f:
    pickle.dump(X_train_smote, f)
with open(r'D:\Nairobi-Accident-Severity\data\processed\y_train_smote.pkl', 'wb') as f:
    pickle.dump(y_train_smote, f)

print(f" Training data saved!")
print(f"   X_train_smote: {X_train_smote.shape}")
print(f"   y_train_smote: {y_train_smote.shape}")

# ========== SAVE VALIDATION DATA ==========

print("\n Saving validation data...")

pd.DataFrame(X_val, columns=X_final.columns).to_csv(
    r'D:\Nairobi-Accident-Severity\data\processed\X_val.csv',
    index=False
)
pd.DataFrame(y_val, columns=['severity_encoded']).to_csv(
    r'D:\Nairobi-Accident-Severity\data\processed\y_val.csv',
    index=False
)

with open(r'D:\Nairobi-Accident-Severity\data\processed\X_val.pkl', 'wb') as f:
    pickle.dump(X_val, f)
with open(r'D:\Nairobi-Accident-Severity\data\processed\y_val.pkl', 'wb') as f:
    pickle.dump(y_val, f)

print(f" Validation data saved!")
print(f"   X_val: {X_val.shape}")
print(f"   y_val: {y_val.shape}")

# ========== SAVE TEST DATA ==========

print("\n Saving test data...")

pd.DataFrame(X_test, columns=X_final.columns).to_csv(
    r'D:\Nairobi-Accident-Severity\data\processed\X_test.csv',
    index=False
)
pd.DataFrame(y_test, columns=['severity_encoded']).to_csv(
    r'D:\Nairobi-Accident-Severity\data\processed\y_test.csv',
    index=False
)

with open(r'D:\Nairobi-Accident-Severity\data\processed\X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)
with open(r'D:\Nairobi-Accident-Severity\data\processed\y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)

print(f" Test data saved!")
print(f"   X_test: {X_test.shape}")
print(f"   y_test: {y_test.shape}")

# ========== SAVE PREPROCESSING OBJECTS ==========

print("\n Saving preprocessing objects...")

# Save scaler
with open(r'D:\Nairobi-Accident-Severity\data\processed\scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save label encoder
with open(r'D:\Nairobi-Accident-Severity\data\processed\label_encoder.pkl', 'wb') as f:
    pickle.dump(le_target, f)

# Save feature names
with open(r'D:\Nairobi-Accident-Severity\data\processed\feature_names.pkl', 'wb') as f:
    pickle.dump(list(X_final.columns), f)

print(f" Preprocessing objects saved!")
print(f"   â€¢ StandardScaler")
print(f"   â€¢ LabelEncoder")
print(f"   â€¢ Feature names")

# ========== SAVE METADATA ==========

print("\n Saving preprocessing metadata...")

metadata = {
    'preprocessing_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'original_samples': len(X_final),
    'train_samples': len(X_train_smote),
    'val_samples': len(X_val),
    'test_samples': len(X_test),
    'total_features': X_final.shape[1],
    'numerical_features': numerical_cols,
    'boolean_features': boolean_features,
    'categorical_features': categorical_features,
    'smote_applied': True,
    'smote_samples_created': len(y_train_smote) - len(y_train),
    'class_mapping': {i: label for i, label in enumerate(le_target.classes_)}
}

with open(r'D:\Nairobi-Accident-Severity\data\processed\preprocessing_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print(f" Metadata saved!")

print("\n" + "="*80)
print(" ALL DATASETS SAVED SUCCESSFULLY")
print("="*80)

print(f"\n Files created:")
print(f"   â€¢ X_train_smote.csv / .pkl")
print(f"   â€¢ y_train_smote.csv / .pkl")
print(f"   â€¢ X_val.csv / .pkl")
print(f"   â€¢ y_val.csv / .pkl")
print(f"   â€¢ X_test.csv / .pkl")
print(f"   â€¢ y_test.csv / .pkl")
print(f"   â€¢ scaler.pkl")
print(f"   â€¢ label_encoder.pkl")
print(f"   â€¢ feature_names.pkl")
print(f"   â€¢ preprocessing_metadata.pkl")

print(f"\n Use pickle files (.pkl) for faster loading in model training")

SAVING PREPROCESSED DATASETS

 Saving training data (with SMOTE)...
 Training data saved!
   X_train_smote: (70164, 40)
   y_train_smote: (70164,)

 Saving validation data...
 Validation data saved!
   X_val: (4660, 40)
   y_val: (4660,)

 Saving test data...
 Test data saved!
   X_test: (4660, 40)
   y_test: (4660,)

 Saving preprocessing objects...
 Preprocessing objects saved!
   â€¢ StandardScaler
   â€¢ LabelEncoder
   â€¢ Feature names

 Saving preprocessing metadata...
 Metadata saved!

 ALL DATASETS SAVED SUCCESSFULLY

 Files created:
   â€¢ X_train_smote.csv / .pkl
   â€¢ y_train_smote.csv / .pkl
   â€¢ X_val.csv / .pkl
   â€¢ y_val.csv / .pkl
   â€¢ X_test.csv / .pkl
   â€¢ y_test.csv / .pkl
   â€¢ scaler.pkl
   â€¢ label_encoder.pkl
   â€¢ feature_names.pkl
   â€¢ preprocessing_metadata.pkl

 Use pickle files (.pkl) for faster loading in model training
