In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns  
from datetime import datetime  
import joblib  
import os  
import pickle  
import warnings  
warnings.filterwarnings('ignore')  
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold  
from sklearn.pipeline import Pipeline  
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder  
from sklearn.compose import ColumnTransformer  
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve  
from sklearn.impute import SimpleImputer  

from sklearn.ensemble import RandomForestClassifier  
from sklearn.linear_model import LogisticRegression  
from sklearn.naive_bayes import GaussianNB  
import torch  
import torch.nn as nn  
import torch.optim as optim  
from torch.utils.data import Dataset, DataLoader, TensorDataset  
from torch.nn import functional as F  

In [3]:
def preprocess_data(df, include_injury_features=True):
    """
    Preprocess the accident dataset for model training.

    Rows containing any missing value are dropped, a binary ``target`` column
    is derived from ``crash_type`` (1 when it equals
    'INJURY AND / OR TOW DUE TO CRASH', else 0) and the known feature columns
    that are actually present in the frame are selected.

    Args:
        df: Pandas DataFrame with accident data. The caller's frame is not
            modified.
        include_injury_features: When True (default, the original behaviour)
            the ``injuries_*`` count columns are added to the numerical
            features. NOTE(review): these counts are recorded as an outcome of
            the crash and very likely leak the target; pass False for an
            honest evaluation.

    Returns:
        Tuple ``(X, y, categorical_features, numerical_features)`` where ``X``
        holds the selected feature columns (categorical first), ``y`` is the
        target Series or None when ``crash_type`` is absent, and the two lists
        name the columns that were kept.
    """
    print("Starting data preprocessing...")

    # .copy() gives an independent frame, so adding 'target' below can never
    # write into (or warn about) a view of the caller's DataFrame.
    df = df.dropna().copy()

    if 'crash_type' in df.columns:
        df['target'] = (df['crash_type'] == 'INJURY AND / OR TOW DUE TO CRASH').astype(int)
        print(f"Target distribution: {df['target'].value_counts().to_dict()}")

    # Candidate columns, in the order they should appear in X.
    categorical_candidates = [
        'traffic_control_device', 'weather_condition', 'lighting_condition',
        'first_crash_type', 'trafficway_type', 'alignment', 'roadway_surface_cond',
        'road_defect', 'intersection_related_i', 'damage', 'prim_contributory_cause',
        'time_of_day',
    ]

    numerical_candidates = [
        'num_units',
        # Temporal features
        'crash_hour', 'crash_day_of_week', 'crash_month',
        'is_weekend', 'is_rush_hour',
    ]

    if include_injury_features:
        numerical_candidates += [
            'injuries_total', 'injuries_fatal', 'injuries_incapacitating',
            'injuries_non_incapacitating', 'injuries_reported_not_evident',
            'injuries_no_indication',
        ]

    # Keep only the candidates that exist in this particular dataset.
    available = set(df.columns)
    categorical_features = [f for f in categorical_candidates if f in available]
    numerical_features = [f for f in numerical_candidates if f in available]

    print(f"Using {len(categorical_features)} categorical features and {len(numerical_features)} numerical features")

    X = df[categorical_features + numerical_features]
    y = df['target'] if 'target' in df.columns else None  # None for prediction-only scenarios

    return X, y, categorical_features, numerical_features


**CREATE OUTPUT FOLDERS**

In [4]:
# Make sure every directory the notebook writes to exists (models, plots, metrics)
for _output_dir in ('output', 'output/models', 'output/plots', 'output/metrics'):
    os.makedirs(_output_dir, exist_ok=True)

print("Starting accident prediction system...")

Starting accident prediction system...


 **SAVE & LOAD MODELS**


In [5]:
def save_models(sklearn_models):
    """
    Persist the trained sklearn models and a small metadata record to disk.

    Args:
        sklearn_models: Dictionary holding the fitted models under the keys
            'random_forest', 'logistic_regression' and 'naive_bayes', plus the
            'categorical_features' and 'numerical_features' lists.
    """
    print("\nSaving models to files...")

    # One joblib file per sklearn model, named after its dictionary key
    for model_key in ('random_forest', 'logistic_regression', 'naive_bayes'):
        joblib.dump(sklearn_models[model_key], f'output/models/{model_key}_model.pkl')

    # Metadata: model names, feature lists and the time of saving
    metadata = dict(
        models=['random_forest', 'logistic_regression', 'naive_bayes', 'neural_network'],
        sklearn_categorical_features=sklearn_models['categorical_features'],
        sklearn_numerical_features=sklearn_models['numerical_features'],
        timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    )

    with open('output/models/model_metadata.pkl', 'wb') as metadata_file:
        pickle.dump(metadata, metadata_file)

    print("All models successfully saved!")

def load_models():
    """
    Load all trained models from files under ``output/models``.

    Loading is best-effort: if anything fails, the error is printed and the
    dictionary is returned with whatever was loaded up to that point.

    Returns:
        Dictionary with loaded models. On full success it holds the keys
        'random_forest', 'logistic_regression', 'naive_bayes',
        'neural_network' (in eval mode) and 'metadata'.
    """
    print("\nLoading models from files...")

    models = {}

    try:
        # NOTE(security): joblib.load, pickle.load and torch.load all unpickle
        # data and can execute arbitrary code - only load files you created.

        # Load sklearn models
        for model_key in ('random_forest', 'logistic_regression', 'naive_bayes'):
            models[model_key] = joblib.load(f'output/models/{model_key}_model.pkl')

        # Load PyTorch neural network
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        try:
            # Try to load the fully pickled model first
            models['neural_network'] = torch.load('output/models/full_nn_model.pt', map_location=device)
        except Exception as full_model_error:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            print(f"Could not load full neural network model ({full_model_error}); "
                  "rebuilding it from the saved weights...")

            # Fall back to the preprocessing info + state dict saved during training
            with open('output/models/nn_preprocessing.pkl', 'rb') as f:
                preprocessing = pickle.load(f)

            cat_dims = preprocessing['cat_dims']
            num_dims = len(preprocessing['numerical_features'])

            # Create the model architecture and load weights
            network = EmbeddingNet(cat_dims=cat_dims, num_dims=num_dims).to(device)
            network.load_state_dict(torch.load('output/models/best_nn_model.pt', map_location=device))
            models['neural_network'] = network

        # Set the model to evaluation mode (disables dropout, freezes batch-norm statistics)
        models['neural_network'].eval()

        # Load model metadata
        with open('output/models/model_metadata.pkl', 'rb') as f:
            models['metadata'] = pickle.load(f)

        print("All models successfully loaded!")
    except Exception as e:
        print(f"Error loading models: {e}")

    return models

**RANDOM FOREST, LOGISTIC REGRESSION, NAIVE BAYES**

In [6]:
def train_sklearn_models(X, y, categorical_features, numerical_features):
    """
    Train Random Forest, Logistic Regression, and Naive Bayes models.

    The data is split 80/20 (random_state=42) and the resulting train/test
    index labels are written to 'output/models/train_test_indices.pkl' so other
    models can reuse the same split. Each classifier is wrapped in its own
    pipeline with an independent copy of the preprocessing step.

    Args:
        X: Feature DataFrame
        y: Target Series
        categorical_features: List of categorical feature names
        numerical_features: List of numerical feature names

    Returns:
        Dictionary with the three fitted pipelines ('random_forest',
        'logistic_regression', 'naive_bayes'), the held-out 'X_test'/'y_test',
        a fitted 'preprocessor' and the two feature-name lists.
    """
    # Local import: `clone` is not part of the notebook's top-level imports.
    from sklearn.base import clone

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Save train/test indices for consistency across models
    indices_info = {
        'train_indices': X_train.index,
        'test_indices': X_test.index
    }

    with open('output/models/train_test_indices.pkl', 'wb') as f:
        pickle.dump(indices_info, f)

    # Preprocessing template: one-hot for categoricals, median-impute + scale for numericals
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value='UNKNOWN')),
                ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ]), categorical_features),
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numerical_features)
        ]
    )

    def _build_pipeline(classifier):
        # Pipeline does not copy its steps, so every pipeline gets its own
        # clone of the preprocessor; refitting one model later can then never
        # change the preprocessing state the other models were trained with.
        return Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('classifier', classifier)
        ])

    # 1. Random Forest
    print("\nTraining Random Forest...")
    rf_pipeline = _build_pipeline(RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        min_samples_split=10,
        min_samples_leaf=5,
        class_weight='balanced',
        random_state=42
    ))
    rf_pipeline.fit(X_train, y_train)

    # 2. Logistic Regression
    print("Training Logistic Regression...")
    lr_pipeline = _build_pipeline(LogisticRegression(
        C=1.0,
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ))
    lr_pipeline.fit(X_train, y_train)

    # 3. Naive Bayes
    print("Training Naive Bayes...")
    nb_pipeline = _build_pipeline(GaussianNB())
    nb_pipeline.fit(X_train, y_train)

    # All three preprocessors were fitted on the same X_train, so any of them
    # is a valid fitted preprocessor to persist and return.
    fitted_preprocessor = rf_pipeline.named_steps['preprocessor']

    # Save preprocessor separately for potential reuse
    joblib.dump(fitted_preprocessor, 'output/models/sklearn_preprocessor.pkl')

    # Return models and test data
    return {
        'random_forest': rf_pipeline,
        'logistic_regression': lr_pipeline,
        'naive_bayes': nb_pipeline,
        'X_test': X_test,
        'y_test': y_test,
        'preprocessor': fitted_preprocessor,
        'categorical_features': categorical_features,
        'numerical_features': numerical_features
    }

**NEURAL NETWORK**

In [7]:
# Define the residual block  
class ResidualBlock(nn.Module):
    """
    Fully connected residual block.

    The main path is Linear -> BatchNorm -> ReLU -> Dropout -> Linear ->
    BatchNorm. Its output is added to a shortcut of the input, then passed
    through ReLU and dropout.

    Args:
        in_features: Width of the input.
        out_features: Width of the output.
        dropout: Dropout probability used inside and after the block.
    """
    def __init__(self, in_features, out_features, dropout=0.3):
        super().__init__()
        main_path = [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, out_features),
            nn.BatchNorm1d(out_features),
        ]
        self.block = nn.Sequential(*main_path)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # A linear projection is only needed when the widths differ;
        # otherwise the input is added back unchanged.
        widths_differ = in_features != out_features
        self.shortcut = nn.Linear(in_features, out_features) if widths_differ else nn.Identity()

    def forward(self, x):
        skip = self.shortcut(x)
        summed = self.block(x) + skip
        return self.dropout(self.relu(summed))

# Define a simple self-attention mechanism  
class SelfAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention over a (batch, seq, dim) input.

    A 2-D (batch, dim) input is treated as a sequence of length 1. In that
    case the softmax runs over a single score, so every attention weight is 1
    and the output equals the value projection of the input.
    """
    def __init__(self, dim):
        super().__init__()
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)
        self.scale = dim ** -0.5
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        # Remember whether a sequence axis has to be added (and removed again)
        is_flat = len(x.shape) == 2
        seq = x.unsqueeze(1) if is_flat else x

        queries = self.query(seq)
        keys = self.key(seq)
        values = self.value(seq)

        # Scaled dot-product scores, normalised over the key axis
        weights = self.softmax(torch.bmm(queries, keys.transpose(1, 2)) * self.scale)

        # Weighted sum of the values
        attended = torch.bmm(weights, values)

        return attended.squeeze(1) if is_flat else attended
    
class EmbeddingNet(nn.Module):
    """
    Tabular PyTorch network: one embedding per categorical feature, concatenated
    with the numerical features and fed through a deep stack that ends in a sigmoid.
    """
    def __init__(self, cat_dims, num_dims):
        """
        Build the embeddings and the dense stack.

        Args:
            cat_dims: List with the cardinality of each categorical feature
            num_dims: Number of numerical features
        """
        super().__init__()

        cardinalities = list(cat_dims)

        # Embedding width rule of thumb: min(50, (cardinality+1)//2)
        self.embedding_dims = [min(50, (cardinality + 1) // 2) for cardinality in cardinalities]
        # Each table gets cardinality+1 rows, i.e. one spare index per feature
        self.embeddings = nn.ModuleList(
            nn.Embedding(cardinality + 1, width)
            for cardinality, width in zip(cardinalities, self.embedding_dims)
        )

        # Width of the concatenated embeddings plus the raw numerical features
        input_width = sum(self.embedding_dims) + num_dims

        # NOTE: the order of this list fixes the state_dict keys ("layers.<index>...")
        stack = [
            # Input stage (wider first layer, LeakyReLU instead of ReLU)
            nn.BatchNorm1d(input_width),
            nn.Linear(input_width, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.BatchNorm1d(256),

            # First residual block
            ResidualBlock(256, 256, dropout=0.3),

            # 256 -> 192 with GELU and a higher dropout
            nn.Linear(256, 192),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.BatchNorm1d(192),

            # Self-attention mechanism
            SelfAttention(192),

            # 192 -> 128 with SiLU (Swish)
            nn.Linear(192, 128),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(128),

            # Second residual block
            ResidualBlock(128, 128, dropout=0.3),

            # 128 -> 96 with Mish
            nn.Linear(128, 96),
            nn.Mish(),
            nn.Dropout(0.25),
            nn.BatchNorm1d(96),

            # 96 -> 64
            nn.Linear(96, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(64),

            # Third residual block
            ResidualBlock(64, 64, dropout=0.2),

            # 64 -> 32
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),

            # 32 -> 16
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16),

            # Output head: a single probability
            nn.Linear(16, 1),
            nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, cat_x, num_x):
        """
        Forward pass through the network

        Args:
            cat_x: Integer-coded categorical features, indexed as cat_x[:, i]
                for the i-th categorical feature
            num_x: Numerical features

        Returns:
            Model output (probability, already passed through the sigmoid)
        """
        # Look up column i of cat_x in the i-th embedding table
        embedded = [table(cat_x[:, column]) for column, table in enumerate(self.embeddings)]

        # Concatenate all embeddings and numerical features along the feature axis
        features = torch.cat(embedded + [num_x], dim=1)

        return self.layers(features)

def train_neural_network(X, y, categorical_features, numerical_features):
    """
    Train PyTorch Neural Network model with embedding layers.

    The train/test split saved by the sklearn training step is reused when
    'output/models/train_test_indices.pkl' can be applied to ``X``; otherwise a
    fresh 80/20 split is created. Categorical columns are label-encoded,
    numerical columns are standardised with statistics from the training rows
    only, and the model is trained with a class-weighted binary cross-entropy
    on the predicted probabilities.

    NOTE(review): the test split doubles as the validation set for
    checkpointing and early stopping, so the reported test metrics are
    optimistically biased. Carve out a separate validation split if that matters.

    Args:
        X: Feature DataFrame
        y: Target Series
        categorical_features: List of categorical feature names
        numerical_features: List of numerical feature names

    Returns:
        Dictionary with the trained 'model', the training 'history', the test
        tensors ('X_cat_test', 'X_num_test', 'y_test_tensor'), 'y_test', the
        fitted 'label_encoders' and 'scaler', the feature-name lists and the
        'device' used.
    """
    print("\nPreprocessing data for Neural Network...")

    # Set device (GPU if available, otherwise CPU)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load train/test indices for consistency
    try:
        # NOTE(security): pickle.load can execute arbitrary code - only read files this notebook wrote.
        with open('output/models/train_test_indices.pkl', 'rb') as f:
            indices = pickle.load(f)
        train_indices = indices['train_indices']
        test_indices = indices['test_indices']

        X_train = X.loc[train_indices]
        X_test = X.loc[test_indices]
        y_train = y.loc[train_indices]
        y_test = y.loc[test_indices]
        print("Using consistent train/test split with sklearn models")
    except Exception:
        # Missing/unreadable file, or saved labels that do not match this X: create a new split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # The index labels are needed below to slice the encoded frames.
        train_indices = X_train.index
        test_indices = X_test.index
        print("Created new train/test split for neural network")

    # Preprocess categorical features (label encoding for embeddings).
    # The encoders are fitted on all rows so every category present in X
    # (train or test) has an integer code.
    label_encoders = {}
    cat_encoded_df = pd.DataFrame(index=X.index)

    for feature in categorical_features:
        le = LabelEncoder()
        cat_encoded_df[feature] = le.fit_transform(X[feature].astype(str))
        label_encoders[feature] = le

    # Get cardinalities for embedding dimensions
    cat_dims = [len(label_encoders[col].classes_) for col in categorical_features]

    # Preprocess numerical features: fit the scaler on the training rows only
    # so test-set statistics do not leak into training, then transform all rows.
    scaler = StandardScaler()
    scaler.fit(X_train[numerical_features])
    num_df = pd.DataFrame(scaler.transform(X[numerical_features]),
                          columns=numerical_features,
                          index=X.index)

    # Prepare PyTorch tensors
    X_cat_train = torch.tensor(cat_encoded_df.loc[train_indices].values, dtype=torch.long).to(device)
    X_cat_test = torch.tensor(cat_encoded_df.loc[test_indices].values, dtype=torch.long).to(device)
    X_num_train = torch.tensor(num_df.loc[train_indices].values, dtype=torch.float32).to(device)
    X_num_test = torch.tensor(num_df.loc[test_indices].values, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1).to(device)

    # Create dataset and dataloader
    train_dataset = TensorDataset(X_cat_train, X_num_train, y_train_tensor)
    test_dataset = TensorDataset(X_cat_test, X_num_test, y_test_tensor)

    batch_size = 64
    # BatchNorm1d cannot run in training mode on a batch with a single row, so
    # drop the trailing batch only when it would contain exactly one sample.
    drop_singleton_batch = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_singleton_batch)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Create model
    print("Building Neural Network model...")
    model = EmbeddingNet(cat_dims=cat_dims, num_dims=X_num_train.shape[1]).to(device)
    print(model)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Class weighting for imbalanced data: when positives are the minority they
    # are up-weighted by n_negative / n_positive; otherwise the weight is 1.
    n_positive = float(y_train.sum())
    n_negative = float(len(y_train)) - n_positive
    pos_weight = n_negative / n_positive if 0 < n_positive < n_negative else 1.0

    def criterion(probabilities, targets):
        # The model ends in nn.Sigmoid, so the loss must operate on
        # probabilities. BCEWithLogitsLoss would apply a second sigmoid.
        # The per-sample weight is pos_weight for positives and 1 for negatives.
        sample_weights = 1.0 + (pos_weight - 1.0) * targets
        return F.binary_cross_entropy(probabilities, targets, weight=sample_weights)

    # Training loop
    print("Training Neural Network...")
    epochs = 30
    train_losses = []
    val_losses = []
    train_aucs = []
    val_aucs = []
    # -inf guarantees that the first epoch with a valid AUC writes a checkpoint
    # from this run, so the torch.load after the loop never picks up a stale file.
    best_val_auc = float('-inf')
    patience = 10
    patience_counter = 0

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        train_seen = 0
        train_preds = []
        train_true = []

        for cat_data, num_data, targets in train_loader:
            optimizer.zero_grad()

            outputs = model(cat_data, num_data)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            # Record loss (weighted by batch size) and predictions
            train_loss += loss.item() * len(targets)
            train_seen += len(targets)
            train_preds.extend(outputs.detach().cpu().numpy().ravel())
            train_true.extend(targets.detach().cpu().numpy().ravel())

        # Divide by the samples actually seen (differs from the dataset size
        # by one when the singleton batch was dropped).
        train_loss /= max(train_seen, 1)
        train_auc = roc_auc_score(train_true, train_preds)
        train_losses.append(train_loss)
        train_aucs.append(train_auc)

        # Validation
        model.eval()
        val_loss = 0
        val_preds = []
        val_true = []

        with torch.no_grad():
            for cat_data, num_data, targets in test_loader:
                outputs = model(cat_data, num_data)
                loss = criterion(outputs, targets)

                val_loss += loss.item() * len(targets)
                val_preds.extend(outputs.detach().cpu().numpy().ravel())
                val_true.extend(targets.detach().cpu().numpy().ravel())

        val_loss /= len(test_dataset)
        val_auc = roc_auc_score(val_true, val_preds)
        val_losses.append(val_loss)
        val_aucs.append(val_auc)

        # Print progress
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}, Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}")

        # Check for improvement
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            patience_counter = 0
            # Save best model
            torch.save(model.state_dict(), 'output/models/best_nn_model.pt')
            print(f"Improved! Saved model with validation AUC: {val_auc:.4f}")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping after {epoch+1} epochs")
                break

    # Load best model
    model.load_state_dict(torch.load('output/models/best_nn_model.pt', map_location=device))

    # Always save the final model in addition to the best model
    torch.save(model.state_dict(), 'output/models/final_nn_model.pt')

    # Save the full model (architecture + weights)
    torch.save(model, 'output/models/full_nn_model.pt')

    # Save model configuration and preprocessing objects
    nn_preprocessing = {
        'label_encoders': label_encoders,
        'scaler': scaler,
        'categorical_features': categorical_features,
        'numerical_features': numerical_features,
        'cat_dims': cat_dims,
        'device': str(device),
        'model_architecture': {
            'cat_dims': cat_dims,
            'num_dims': X_num_train.shape[1]
        }
    }

    with open('output/models/nn_preprocessing.pkl', 'wb') as f:
        pickle.dump(nn_preprocessing, f)

    # Save training history
    history = {
        'train_loss': train_losses,
        'val_loss': val_losses,
        'train_auc': train_aucs,
        'val_auc': val_aucs
    }

    with open('output/models/nn_training_history.pkl', 'wb') as f:
        pickle.dump(history, f)

    return {
        'model': model,
        'history': history,
        'X_cat_test': X_cat_test,
        'X_num_test': X_num_test,
        'y_test': y_test,
        'y_test_tensor': y_test_tensor,
        'label_encoders': label_encoders,
        'scaler': scaler,
        'categorical_features': categorical_features,
        'numerical_features': numerical_features,
        'device': device
    }

**MODEL EVALUATION**

In [8]:


def evaluate_sklearn_models(models_dict):
    """
    Score the three fitted sklearn pipelines on the held-out test data.

    For every model this prints accuracy, ROC AUC and the classification
    report, writes a confusion-matrix plot and a ROC-curve plot to
    'output/plots', and finally writes a one-row-per-model summary to
    'output/metrics/sklearn_models_results.csv'.

    Args:
        models_dict: Dictionary with the fitted pipelines ('random_forest',
            'logistic_regression', 'naive_bayes') and 'X_test' / 'y_test'

    Returns:
        Dictionary keyed by display name with 'accuracy', 'auc', 'report',
        'y_pred' and 'y_proba' for each model
    """
    X_test = models_dict['X_test']
    y_test = models_dict['y_test']

    # Display name -> fitted pipeline, in evaluation order
    named_models = (
        ('Random Forest', models_dict['random_forest']),
        ('Logistic Regression', models_dict['logistic_regression']),
        ('Naive Bayes', models_dict['naive_bayes']),
    )

    results = {}
    summary_rows = []  # rows of the CSV export

    for name, model in named_models:
        print(f"\nEvaluating {name}...")
        file_slug = name.lower().replace(" ", "_")

        # Hard predictions and probability of the positive class
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        accuracy = (y_pred == y_test).mean()
        auc = roc_auc_score(y_test, y_proba)
        report = classification_report(y_test, y_pred, output_dict=True)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"ROC AUC: {auc:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))

        weighted = report['weighted avg']
        summary_rows.append({
            'model': name,
            'accuracy': accuracy,
            'auc': auc,
            'precision': weighted['precision'],
            'recall': weighted['recall'],
            'f1': weighted['f1-score']
        })

        # Confusion matrix heatmap
        plt.figure(figsize=(8, 6))
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues',
                    xticklabels=['No Injury', 'Injury/Tow'],
                    yticklabels=['No Injury', 'Injury/Tow'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix - {name}')
        plt.tight_layout()
        plt.savefig(f'output/plots/cm_{file_slug}.png')
        plt.close()

        # ROC curve against the chance diagonal
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {name}')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'output/plots/roc_{file_slug}.png')
        plt.close()

        results[name] = {
            'accuracy': accuracy,
            'auc': auc,
            'report': report,
            'y_pred': y_pred,
            'y_proba': y_proba
        }

    # Save results summary to CSV
    summary_frame = pd.DataFrame(summary_rows)
    summary_frame.to_csv('output/metrics/sklearn_models_results.csv', index=False)

    return results

def evaluate_neural_network(nn_dict):
    """
    Evaluate the trained neural network model.

    Prints accuracy, ROC AUC and the classification report, writes the metrics
    to 'output/metrics/neural_network_results.csv' and saves the training
    history, confusion matrix and ROC curve plots under 'output/plots'.

    Args:
        nn_dict: Dictionary with the neural network 'model', its training
            'history', the test tensors 'X_cat_test' / 'X_num_test' and the
            test labels 'y_test'

    Returns:
        Dictionary with 'accuracy', 'auc', 'report' and the 1-D arrays
        'y_pred' (0/1 at a 0.5 threshold) and 'y_proba'
    """
    print("\nEvaluating Neural Network...")

    # Unpack dictionary
    model = nn_dict['model']
    history = nn_dict['history']
    X_cat_test = nn_dict['X_cat_test']
    X_num_test = nn_dict['X_num_test']
    y_test = nn_dict['y_test']
    y_true = np.asarray(y_test)

    # Evaluation mode
    model.eval()

    # Make predictions. The model returns an (n, 1) column of probabilities;
    # flatten it once so every metric below works on plain 1-D arrays.
    with torch.no_grad():
        y_pred_proba = model(X_cat_test, X_num_test).cpu().numpy().ravel()

    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate metrics
    accuracy = (y_pred == y_true).mean()
    auc = roc_auc_score(y_test, y_pred_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {auc:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save metrics to CSV
    results_summary = [{
        'model': 'Neural Network',
        'accuracy': accuracy,
        'auc': auc,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }]

    pd.DataFrame(results_summary).to_csv('output/metrics/neural_network_results.csv', index=False)

    # Plot training history (loss on the left, AUC on the right)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history['train_loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history['train_auc'], label='Training AUC')
    plt.plot(history['val_auc'], label='Validation AUC')
    plt.title('Training and Validation AUC')
    plt.xlabel('Epoch')
    plt.ylabel('AUC')
    plt.legend()
    plt.tight_layout()
    plt.savefig('output/plots/nn_training_history.png')
    plt.close()

    # Plot confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
               xticklabels=['No Injury', 'Injury/Tow'],
               yticklabels=['No Injury', 'Injury/Tow'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix - Neural Network')
    plt.tight_layout()
    plt.savefig('output/plots/cm_neural_network.png')
    plt.close()

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Neural Network')
    plt.legend()
    plt.tight_layout()
    plt.savefig('output/plots/roc_neural_network.png')
    plt.close()

    return {
        'accuracy': accuracy,
        'auc': auc,
        'report': report,
        'y_pred': y_pred,
        'y_proba': y_pred_proba
    }

def compare_models(sklearn_results, nn_results, y_test):
    """
    Rank every trained model side by side and export the comparison

    The neural network results are merged into the sklearn results under the
    key 'Neural Network'. A metrics table is written to
    'output/metrics/model_comparison.csv', a grouped bar chart and an overlaid
    ROC plot are saved under 'output/plots/', and the table is printed.

    Args:
        sklearn_results: Mapping of model name to a result dict; the keys
            'accuracy', 'auc', 'report' and 'y_proba' are read from each one
        nn_results: Result dict for the neural network with the same keys
        y_test: True labels that every model's 'y_proba' is scored against
            for the ROC curves (assumes all models were evaluated on this
            same test set - TODO confirm against the callers)
    """
    # The neural network joins the sklearn models under a fixed display name
    all_results = dict(sklearn_results)
    all_results['Neural Network'] = nn_results

    # Pull each metric out once, in model order
    models = list(all_results)
    accuracies = [all_results[name]['accuracy'] for name in models]
    aucs = [all_results[name]['auc'] for name in models]
    weighted_rows = [all_results[name]['report']['weighted avg'] for name in models]
    f1_scores = [row['f1-score'] for row in weighted_rows]

    # Metrics table, best ROC AUC first
    comparison_df = pd.DataFrame({
        'Model': models,
        'Accuracy': accuracies,
        'ROC AUC': aucs,
        'Precision': [row['precision'] for row in weighted_rows],
        'Recall': [row['recall'] for row in weighted_rows],
        'F1 Score': f1_scores,
    }).sort_values('ROC AUC', ascending=False)
    comparison_df.to_csv('output/metrics/model_comparison.csv', index=False)

    # Grouped bar chart: one (offset, values, legend label) entry per metric
    plt.figure(figsize=(12, 6))
    x = np.arange(len(models))
    width = 0.3
    bar_series = (
        (-width, accuracies, 'Accuracy'),
        (0, aucs, 'ROC AUC'),
        (width, f1_scores, 'F1 Score'),
    )
    for shift, scores, series_label in bar_series:
        plt.bar(x + shift, scores, width, label=series_label)

    plt.xlabel('Models')
    plt.ylabel('Score')
    plt.title('Model Performance Comparison')
    plt.xticks(x, models)
    plt.legend()

    # Value labels sit slightly above each bar
    for shift, scores, _ in bar_series:
        for slot, score in enumerate(scores):
            plt.text(slot + shift, score + 0.01, f'{score:.3f}', ha='center')

    plt.tight_layout()
    plt.savefig('output/plots/model_comparison.png')
    plt.close()

    # Overlaid ROC curves, one line per model
    plt.figure(figsize=(10, 8))
    for model_name in models:
        scores = all_results[model_name]['y_proba']
        false_pos, true_pos, _ = roc_curve(y_test, scores)
        model_auc = roc_auc_score(y_test, scores)
        plt.plot(false_pos, true_pos, label=f'{model_name} (AUC = {model_auc:.4f})')

    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve Comparison')
    plt.legend()
    plt.tight_layout()
    plt.savefig('output/plots/roc_comparison.png')
    plt.close()

    # Print comparison table
    print("\nModel Performance Comparison:")
    print(comparison_df)

def analyze_feature_importance(rf_model, categorical_features, numerical_features):
    """
    Rank the Random Forest's feature importances and export them

    Writes the full ranking to 'output/metrics/feature_importance.csv', saves
    a bar chart of the 20 highest-ranked features to
    'output/plots/feature_importance.png' and prints the 10 highest-ranked.

    Args:
        rf_model: Trained Random Forest pipeline with 'preprocessor' and
            'classifier' steps; the preprocessor must hold a 'cat' transformer
            whose 'encoder' step provides get_feature_names_out
        categorical_features: List of categorical feature names
        numerical_features: List of numerical feature names
    """
    # Pull the two pipeline stages apart
    pipeline_steps = rf_model.named_steps
    preprocessor = pipeline_steps['preprocessor']
    forest = pipeline_steps['classifier']

    # Names after preprocessing: encoded categorical columns, then numerical.
    # NOTE(review): assumes the preprocessor emits its columns in that same
    # order - confirm against the ColumnTransformer definition.
    encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
    encoded_names = encoder.get_feature_names_out(categorical_features)
    all_features = np.append(encoded_names, numerical_features)

    # Positions of the features from most to least important
    importances = forest.feature_importances_
    ranking = np.argsort(importances)[::-1]
    ranked_names = [all_features[pos] for pos in ranking]

    # Full ranking goes to disk
    importance_df = pd.DataFrame({
        'feature': ranked_names,
        'importance': importances[ranking]
    })
    importance_df.to_csv('output/metrics/feature_importance.csv', index=False)

    # Bar chart of the top 20 (or fewer when there are not that many features)
    bar_slots = range(min(20, len(all_features)))
    plt.figure(figsize=(12, 8))
    plt.title('Feature Importance (Random Forest)')
    plt.bar(bar_slots, importances[ranking[:20]], align='center')
    plt.xticks(bar_slots, ranked_names[:20], rotation=90)
    plt.tight_layout()
    plt.savefig('output/plots/feature_importance.png')
    plt.close()

    # Console summary of the top 10
    print("\nTop 10 Most Important Features:")
    for pos in ranking[:min(10, len(all_features))]:
        print(f"{all_features[pos]}: {importances[pos]:.4f}")

# ==================  
# PREDICTION FUNCTIONS  
# ==================  

def predict_with_sklearn_model(model, new_data):
    """
    Score new data with a trained sklearn model and return one probability

    Args:
        model: Trained sklearn pipeline (anything exposing predict_proba)
        new_data: New data for prediction, passed unchanged to predict_proba

    Returns:
        The value at row 0, column 1 of the predict_proba output, i.e. the
        second-class probability of the first sample only
    """
    class_probabilities = model.predict_proba(new_data)
    first_sample, second_class = 0, 1
    return class_probabilities[first_sample, second_class]

def predict_with_pytorch_network(new_data):
    """
    Make prediction with the PyTorch neural network model

    Reads the preprocessing objects from 'output/models/nn_preprocessing.pkl',
    loads the network from 'output/models/' and scores the FIRST row of
    new_data with a single forward pass.

    Args:
        new_data: New data for prediction (DataFrame or dict). A dict is
            wrapped into a one-row DataFrame. Every saved categorical and
            numerical feature must be present as a column/key.

    Returns:
        The model output for the first row as a Python float (presumably the
        positive-class probability - confirm against the network's output
        layer), or None when loading or preprocessing fails. Failures are
        printed, not raised.
    """
    # Load the model and preprocessing objects
    try:
        # Load preprocessing info.
        # NOTE: pickle.load can execute arbitrary code - only point this at
        # artifacts written by this notebook's own training run.
        with open('output/models/nn_preprocessing.pkl', 'rb') as f:
            preprocessing = pickle.load(f)

        label_encoders = preprocessing['label_encoders']
        scaler = preprocessing['scaler']
        categorical_features = preprocessing['categorical_features']
        numerical_features = preprocessing['numerical_features']
        cat_dims = preprocessing['cat_dims']
        num_dims = len(numerical_features)

        # Set device
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Load model
        try:
            # Try to load full model first.
            # NOTE(review): recent PyTorch releases default torch.load to
            # weights_only=True, under which a fully pickled model may be
            # rejected and the fallback below is used - confirm against the
            # installed version.
            model = torch.load('output/models/full_nn_model.pt', map_location=device)
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C / SystemExit are
            # not mistaken for "full model not available".
            # If full model not available, load architecture and weights separately
            model = EmbeddingNet(cat_dims=cat_dims, num_dims=num_dims).to(device)
            model.load_state_dict(torch.load('output/models/best_nn_model.pt', map_location=device))

        model.eval()

        # Convert dict to DataFrame if needed
        if isinstance(new_data, dict):
            new_data = pd.DataFrame([new_data])

        # Preprocess categorical features (first row only)
        cat_processed = []
        for feature in categorical_features:
            le = label_encoders[feature]
            value = new_data[feature].iloc[0]
            try:
                encoded = le.transform([str(value)])[0]
            except Exception:
                # Handle unseen categories without swallowing interpreter-level
                # interrupts: fall back to index 0 (the first encoded category)
                encoded = 0
            cat_processed.append(encoded)

        # Preprocess numerical features (first row only), scaled as one sample
        num_values = [new_data[feature].iloc[0] for feature in numerical_features]
        num_processed = scaler.transform(np.array([num_values]))

        # Convert to PyTorch tensors, each with a leading batch dimension of 1
        cat_tensor = torch.tensor([cat_processed], dtype=torch.long).to(device)
        num_tensor = torch.tensor(num_processed, dtype=torch.float32).to(device)

        # Make prediction; .item() requires the model to emit a single value
        with torch.no_grad():
            prediction = model(cat_tensor, num_tensor).item()

        return prediction

    except Exception as e:
        print(f"Error making prediction with PyTorch neural network: {e}")
        return None


In [9]:

# Load the raw accident records from the Kaggle input directory
df = pd.read_csv('/kaggle/input/traffic-accidents/traffic_accidents.csv')

# NOTE(review): make_classification is not referenced anywhere in this cell;
# it looks like a leftover from a synthetic-data demo - confirm before removing.
from sklearn.datasets import make_classification  


# Preprocess data: yields the features, the target and the two feature-name lists
X, y, categorical_features, numerical_features = preprocess_data(df)  

# Train sklearn models (the result is indexed below by 'random_forest' and 'y_test')
sklearn_models = train_sklearn_models(X, y, categorical_features, numerical_features)  

# Save sklearn models  
save_models(sklearn_models)  

# Evaluate sklearn models  
sklearn_results = evaluate_sklearn_models(sklearn_models)  

# Train PyTorch neural network  
nn_dict = train_neural_network(X, y, categorical_features, numerical_features)  

# Evaluate neural network  
nn_results = evaluate_neural_network(nn_dict)  

# Compare all models.
# NOTE(review): the ROC comparison scores the neural network's probabilities
# against sklearn_models['y_test']; that is only valid if train_neural_network
# evaluated on the same test rows in the same order - confirm.
compare_models(sklearn_results, nn_results, sklearn_models['y_test'])  

# Analyze feature importance from Random Forest  
analyze_feature_importance(sklearn_models['random_forest'],   
                          categorical_features,   
                          numerical_features)  

# Example for prediction  
print("\nExample Predictions:")  
# Sample data point: the first row of X, kept as a one-row frame via the 0:1 slice
sample = X.iloc[0:1]  

# Load models (to demonstrate the loading functionality)  
loaded_models = load_models()  

# Use loaded models for prediction; skipped entirely when load_models() returns a falsy value
if loaded_models:  
    rf_prob = predict_with_sklearn_model(loaded_models['random_forest'], sample)  
    lr_prob = predict_with_sklearn_model(loaded_models['logistic_regression'], sample)  
    nb_prob = predict_with_sklearn_model(loaded_models['naive_bayes'], sample)  
    
    print(f"Random Forest: {rf_prob:.4f}")  
    print(f"Logistic Regression: {lr_prob:.4f}")  
    print(f"Naive Bayes: {nb_prob:.4f}")  
    
    # For neural network, use the dedicated prediction function, which reloads
    # its own artifacts from disk and returns None on failure
    nn_prob = predict_with_pytorch_network(sample)  
    if nn_prob is not None:  
        print(f"Neural Network: {nn_prob:.4f}")  

print("\nDone! All models trained, evaluated, and saved.")  
print(f"Models and results saved in the 'output' directory")  



Starting data preprocessing...
Target distribution: {0: 117376, 1: 91930}
Using 11 categorical features and 10 numerical features

Training Random Forest...
Training Logistic Regression...
Training Naive Bayes...

Saving models to files...
All models successfully saved!

Evaluating Random Forest...
Accuracy: 0.8451
ROC AUC: 0.9251
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.92      0.87     23350
           1       0.88      0.75      0.81     18512

    accuracy                           0.85     41862
   macro avg       0.85      0.84      0.84     41862
weighted avg       0.85      0.85      0.84     41862


Evaluating Logistic Regression...
Accuracy: 0.8434
ROC AUC: 0.9270
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86     23350
           1       0.85      0.78      0.82     18512

    accuracy                           0.84     41862
   macro