# Notebook

# Step 5: Predictive Analysis (Modeling)

This is the core machine learning phase. We will build, tune, and evaluate four classifiers (Logistic Regression, SVM, Decision Tree, and Random Forest), then save the best-performing one.

## 5.1: Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import joblib
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Put the parent of the current working directory at the front of the import
# path so that the project's `src` package can be imported below.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.features import create_features, split_and_scale_data
from src.eval import evaluate_model

## 5.2: Load Data and Config

In [2]:
# Load the project configuration and the per-model hyperparameter grids.
# The encoding is pinned to UTF-8 so the YAML files are decoded the same way on
# every platform; without it, open() falls back to the locale's default
# encoding, which can mis-decode non-ASCII characters.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

with open('../config/model_params.yaml', 'r', encoding='utf-8') as f:
    params = yaml.safe_load(f)

# Build the path of the cleaned (wrangled) dataset from the config and load it
INTERIM_DATA_PATH = os.path.join('..', config['data_paths']['interim'])
WRANGLED_DATA_FILE = os.path.join(INTERIM_DATA_PATH, config['data_files']['wrangled_data'])
df = pd.read_csv(WRANGLED_DATA_FILE)

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,flight_number,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,class
0,1,2006-03-24 22:30:00+00:00,FalconSat,20.0,LEO,5e9e4502f5090995de566f86,False,1.0,False,False,False,0
1,2,2007-03-21 01:10:00+00:00,DemoSat,7842.388855,LEO,5e9e4502f5090995de566f86,False,1.0,False,False,False,0
2,3,2008-08-03 03:34:00+00:00,Trailblazer,7842.388855,LEO,5e9e4502f5090995de566f86,False,1.0,False,False,False,0
3,4,2008-09-28 23:15:00+00:00,RatSat,165.0,LEO,5e9e4502f5090995de566f86,False,1.0,False,False,False,0
4,5,2009-07-13 03:35:00+00:00,RazakSat,200.0,LEO,5e9e4502f5090995de566f86,False,1.0,False,False,False,0




## 5.3: Feature Engineering and Data Splitting

In [3]:
# One-hot encode the dataset via the project helper
features_df = create_features(df)

# Target vector (Y) and feature matrix (X).
# The target itself and the columns listed below are excluded from X.
NON_FEATURE_COLUMNS = ['class', 'Date', 'BoosterVersion', 'Outcome']
Y = features_df['class'].values
X = features_df.drop(columns=NON_FEATURE_COLUMNS)

# Train/test split and scaling, driven by the project settings in the config
split_settings = config['project_settings']
X_train, X_test, Y_train, Y_test, scaler = split_and_scale_data(
    X,
    Y,
    test_size=split_settings['test_size'],
    random_state=split_settings['random_state'],
)

# Report the resulting shapes as a quick sanity check
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Shape of X_train: (164, 24)
Shape of X_test: (41, 24)


## 5.4: Model Training and Hyperparameter Tuning

In [4]:
# Seed every estimator that draws random numbers so that re-running the
# notebook reproduces the same tuned models and scores. The same seed from the
# config is already used for the train/test split.
RANDOM_STATE = config['project_settings']['random_state']

# Candidate classifiers, keyed by the name used to look up their grid in `params`
models = {
    'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
    # SVC only uses random_state when probability=True, so it is left unseeded
    'SVM': SVC(),
    'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE)
}

# Best fitted estimator per model name, consumed by the evaluation step
best_estimators = {}
for name, model in models.items():
    print(f"--- Tuning {name} ---")
    # `params` is None when the YAML file is empty; treat that as "no grids"
    param_grid = (params or {}).get(name, {})
    if not param_grid:
        # No grid configured: fit the estimator once with its default settings
        print(f"No parameters found for {name}, using defaults.")
        model.fit(X_train, Y_train)
        best_estimators[name] = model
        continue

    # Exhaustive grid search with 5-fold cross-validation, optimising F1
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, Y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best Score for {name}: {grid_search.best_score_:.4f}\n")

--- Tuning LogisticRegression ---
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Score for LogisticRegression: 0.9335

--- Tuning SVM ---
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Score for SVM: 0.9335

--- Tuning DecisionTree ---
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Score for DecisionTree: 0.9285

--- Tuning RandomForest ---
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Score for RandomForest: 0.9338



## 5.5: Model Evaluation on Test Set

In [5]:
# Score every tuned estimator on the held-out test set, one row per model
results = []
for model_name, estimator in best_estimators.items():
    row = evaluate_model(estimator, X_test, Y_test)
    row['Model'] = model_name
    results.append(row)

# Collect the rows into a table indexed by model name
results_table = pd.DataFrame(results)
results_df = results_table.set_index('Model')

print("--- Model Evaluation on Test Set ---")
display(results_df)

--- Model Evaluation on Test Set ---


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LogisticRegression,0.926829,0.90625,1.0,0.95082
SVM,0.926829,0.90625,1.0,0.95082
DecisionTree,0.926829,0.90625,1.0,0.95082
RandomForest,0.902439,0.878788,1.0,0.935484


## 5.6: Select and Save the Best Model

In [6]:
# Pick the model with the highest F1-Score in the evaluation table.
# idxmax returns the first label when several models tie on the maximum.
# NOTE(review): results_df holds test-set scores, so choosing on it makes the
# reported metrics optimistic for the selected model; consider selecting on
# the cross-validation score instead.
f1_scores = results_df['F1-Score']
best_model_name = f1_scores.idxmax()
best_model = best_estimators[best_model_name]

print(f"The best performing model is: {best_model_name}")

# Output directory (created if missing) and the artifact file paths
MODELS_PATH = os.path.join('..', config['data_paths']['models'])
os.makedirs(MODELS_PATH, exist_ok=True)
model_filepath = os.path.join(MODELS_PATH, 'best_model.pkl')
scaler_filepath = os.path.join(MODELS_PATH, 'scaler.pkl')

# Persist the selected model together with the fitted scaler
for artifact, target_path in ((best_model, model_filepath), (scaler, scaler_filepath)):
    joblib.dump(artifact, target_path)

print(f"Model saved to {model_filepath}")
print(f"Scaler saved to {scaler_filepath}")

The best performing model is: LogisticRegression
Model saved to ..\models\best_model.pkl
Scaler saved to ..\models\scaler.pkl
