# Comprehensive Telematics Driving Behavior Analysis
## Feature Engineering, Classification, and Behavior Scoring for Insurance Applications

This notebook provides a complete pipeline for analyzing driving behavior from telematics sensor data, combining:
- Advanced feature engineering from accelerometer and gyroscope signals
- Multiple machine learning and deep learning classification models
- Behavior score calculation for insurance risk assessment
- Model comparison and evaluation metrics

**Key Features:**
- Time and frequency domain feature extraction
- Signal smoothing and noise reduction
- 3-class driving behavior classification (SLOW, NORMAL, AGGRESSIVE)
- Comprehensive behavior scoring algorithm
- Export functionality for production systems

# 1. Import Required Libraries

In [None]:
# Core data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from datetime import datetime
from dataclasses import dataclass
warnings.filterwarnings('ignore')

# Signal processing
from scipy import signal
from scipy.ndimage import gaussian_filter1d

# Machine Learning libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import class_weight

# Metrics and evaluation
from sklearn.metrics import (classification_report, accuracy_score, ConfusionMatrixDisplay, 
                           confusion_matrix, precision_score, recall_score, roc_curve, 
                           roc_auc_score, balanced_accuracy_score, silhouette_score)

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier

# XGBoost
import xgboost as xgb

# Set random seeds for reproducibility
# Both NumPy's global RNG and TensorFlow's global seed are fixed to the same value.
np.random.seed(42)
tf.random.set_seed(42)

# Report the library versions in use so results can be tied to an environment
print("All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
# Only sklearn submodules are imported above, so the top-level package is
# imported dynamically here just to read its version string.
print(f"Scikit-learn version: {__import__('sklearn').__version__}")

# 2. Data Loading and Initial Processing

In [None]:
# Load the training data
train_data_path = 'train_motion_data.csv'
test_data_path = 'test_motion_data.csv'

# Candidate locations, tried in order; the first file that loads wins
possible_paths = [
    train_data_path,
    f'data/{train_data_path}',
    f'/home/bakasur/INSURITY/{train_data_path}',
    f'/kaggle/input/driving-behavior/{train_data_path}'
]

train_data = None
for path in possible_paths:
    if not os.path.exists(path):
        continue
    try:
        train_data = pd.read_csv(path)
    except (OSError, ValueError) as exc:
        # OSError covers permission/IO problems; pandas parser, empty-file and
        # decoding errors all derive from ValueError. Report and try the next
        # candidate instead of silently swallowing every exception.
        print(f"Could not read training data from {path}: {exc}")
        continue
    print(f"Successfully loaded training data from: {path}")
    break

if train_data is None:
    print("Could not find training data. Please ensure the file is in the correct location.")
    # Create sample data for demonstration: random sensor values with random
    # labels, so any model trained on it is meaningless beyond smoke-testing.
    print("Creating sample data for demonstration...")
    np.random.seed(42)
    n_samples = 10000
    train_data = pd.DataFrame({
        'Timestamp': range(n_samples),
        'AccX': np.random.normal(0, 1, n_samples),
        'AccY': np.random.normal(0, 1, n_samples),
        'AccZ': np.random.normal(9.8, 1, n_samples),
        'GyroX': np.random.normal(0, 0.5, n_samples),
        'GyroY': np.random.normal(0, 0.5, n_samples),
        'GyroZ': np.random.normal(0, 0.5, n_samples),
        'Class': np.random.choice(['SLOW', 'NORMAL', 'AGGRESSIVE'], n_samples, p=[0.3, 0.4, 0.3])
    })

print(f"Dataset shape: {train_data.shape}")
print("\nFirst few rows:")
train_data.head()

In [None]:
# Data exploration and preprocessing
# Each quantity is computed right before it is printed so the output order
# (and the point at which a missing column would fail) stays the same.
print("=== Dataset Information ===")
dataset_shape = train_data.shape
print(f"Shape: {dataset_shape}")
column_names = list(train_data.columns)
print(f"Columns: {column_names}")
column_dtypes = train_data.dtypes
print(f"Data types:\n{column_dtypes}")
missing_per_column = train_data.isnull().sum()
print(f"\nMissing values:\n{missing_per_column}")
class_counts = train_data['Class'].value_counts()
print(f"\nClass distribution:\n{class_counts}")
class_percentages = train_data['Class'].value_counts(normalize=True) * 100
print(f"\nClass distribution (%):\n{class_percentages}")

# Basic statistics (last expression of the cell, so the notebook renders it)
print("\n=== Statistical Summary ===")
train_data.describe()

In [None]:
# Data preprocessing
df = train_data.copy()

# Ensure data is sorted by timestamp. A stable sort is required: rows that
# share a timestamp must keep their recorded order, otherwise the sample
# sequence (and every windowed feature derived from it) can be scrambled.
df = df.sort_values('Timestamp', kind='mergesort').reset_index(drop=True)

# Replace the raw timestamp with elapsed seconds derived from the row position
df['Timestamp'] = (df.index + 1) / 2  # Assuming 2 samples per second

# Define sensor columns
sensor_columns = ['AccX', 'AccY', 'AccZ', 'GyroX', 'GyroY', 'GyroZ']

# Normalize sensor data using MinMaxScaler (each column rescaled independently)
scaler = MinMaxScaler()
df[sensor_columns] = scaler.fit_transform(df[sensor_columns])

# Map classes to numeric values
class_mapping = {'SLOW': 0, 'NORMAL': 1, 'AGGRESSIVE': 2}
df['Class_numeric'] = df['Class'].map(class_mapping)

# Labels missing from class_mapping become NaN; surface that instead of
# letting them propagate silently into the targets.
unmapped_rows = int(df['Class_numeric'].isna().sum())
if unmapped_rows:
    print(f"Warning: {unmapped_rows} rows have a class label not in {list(class_mapping)}")

# Add derived features (computed on the already normalized sensor values)
df['Acc_magnitude'] = np.sqrt(df['AccX']**2 + df['AccY']**2 + df['AccZ']**2)
df['Gyro_magnitude'] = np.sqrt(df['GyroX']**2 + df['GyroY']**2 + df['GyroZ']**2)

print("Data preprocessing completed!")
print(f"Processed dataset shape: {df.shape}")
print(f"Sensor columns normalized: {sensor_columns}")
print(f"Added magnitude features: ['Acc_magnitude', 'Gyro_magnitude']")

# Visualize class distribution.
# Counts are put in class_mapping order and colored per class, so a given color
# always means the same class regardless of which class is most frequent.
class_colors = {'SLOW': 'green', 'NORMAL': 'blue', 'AGGRESSIVE': 'red'}
class_counts = df['Class'].value_counts()
class_order = [label for label in class_mapping if label in class_counts.index]
class_counts = class_counts.reindex(class_order)
plot_colors = [class_colors[label] for label in class_order]

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
class_counts.plot(kind='bar', color=plot_colors)
plt.title('Class Distribution')
plt.ylabel('Count')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
(class_counts / class_counts.sum()).plot(kind='pie', autopct='%1.1f%%', colors=plot_colors)
plt.title('Class Distribution (%)')
plt.ylabel('')

plt.tight_layout()
plt.show()

# 3. Signal Smoothing and Preprocessing

In [None]:
# Define smoothing functions
def rolling_average_smoothing(data, window=5):
    """Return the centered moving average of `data` over `window` samples.

    Positions without a full window around them (at both ends) come back as NaN.
    """
    centered_windows = data.rolling(window=window, center=True)
    return centered_windows.mean()

def exponential_smoothing(data, alpha=0.1):
    """Return the exponentially weighted mean of `data` with smoothing factor `alpha`."""
    weighted = data.ewm(alpha=alpha)
    return weighted.mean()

def gaussian_smoothing(data, window=5, std=1):
    """Return `data` smoothed by a centered Gaussian-weighted rolling mean.

    `window` is the number of samples per window and `std` the standard
    deviation of the Gaussian weights. Edge positions without a full window
    come back as NaN.
    """
    gaussian_windows = data.rolling(window=window, win_type='gaussian', center=True)
    return gaussian_windows.mean(std=std)

# Apply different smoothing techniques
sampling_rate = 2  # 2 samples per second
window_size = int(4 * sampling_rate)  # 4-second windows
overlap = int(window_size * 0.25)  # 25% overlap

# Create smoothed versions of the data
df_smoothed = df.copy()

# Apply rolling average to sensor columns
for col in sensor_columns:
    df_smoothed[f'{col}_smooth'] = rolling_average_smoothing(df[col], window=window_size)

# Fill NaN values from smoothing: the centered window leaves NaN at both ends,
# so back-fill the head and forward-fill the tail. `fillna(method=...)` is
# deprecated in pandas 2.x; `.bfill().ffill()` is the supported equivalent.
# Note this applies to every column of the frame, not only the smoothed ones.
df_smoothed = df_smoothed.bfill().ffill()

print(f"Applied smoothing with window size: {window_size}")
print(f"Smoothed columns added: {[f'{col}_smooth' for col in sensor_columns]}")

# Visualize original vs smoothed signals
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# The same 200-row slice is shown for every sensor, so take it once
sample_data = df.iloc[1000:1200]  # Sample 200 points
sample_smoothed = df_smoothed.iloc[1000:1200]

for i, col in enumerate(sensor_columns):
    axes[i].plot(sample_data['Timestamp'], sample_data[col], alpha=0.7, label='Original', linewidth=1)
    axes[i].plot(sample_data['Timestamp'], sample_smoothed[f'{col}_smooth'],
                 label='Smoothed', linewidth=2)
    axes[i].set_title(f'{col} - Original vs Smoothed')
    axes[i].set_xlabel('Time')
    axes[i].set_ylabel('Value')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 4. Time Domain Feature Extraction

In [None]:
# Extract time domain features using windowing approach
def extract_time_domain_features(data, window_size=8, overlap_ratio=0.25, columns=None):
    """
    Extract time domain features from sensor data using sliding windows.

    Parameters:
        data: DataFrame holding 'Timestamp', 'Class_numeric' and the sensor columns.
        window_size: number of consecutive rows per window.
        overlap_ratio: fraction of `window_size` used as the stride between
            window starts. NOTE: despite its name this sets the step, not the
            shared portion, so 0.25 with 8 samples advances 2 rows per window.
        columns: sensor column names to summarize; defaults to the module-level
            `sensor_columns`. Names absent from `data` are skipped.

    Returns:
        DataFrame with one row per window: 'window_start', 'timestamp' and
        'class' (both taken from the window's last row), plus ten statistics
        per column. Empty when `data` has fewer than `window_size` rows.
    """
    if columns is None:
        columns = sensor_columns

    # A stride of 0 (small window or ratio) would make range() raise ValueError
    step = max(1, int(window_size * overlap_ratio))
    present_columns = [col for col in columns if col in data.columns]

    features_list = []
    for start in range(0, len(data) - window_size + 1, step):
        window = data.iloc[start:start + window_size]

        features_dict = {
            'window_start': start,
            'timestamp': window['Timestamp'].iloc[-1],
            'class': window['Class_numeric'].iloc[-1]
        }

        for col in present_columns:
            values = window[col].values
            as_series = pd.Series(values)  # pandas gives bias-corrected skew/kurtosis
            lowest, highest = np.min(values), np.max(values)
            features_dict.update({
                f'{col}_mean': np.mean(values),
                f'{col}_std': np.std(values),
                f'{col}_min': lowest,
                f'{col}_max': highest,
                f'{col}_median': np.median(values),
                f'{col}_range': highest - lowest,
                f'{col}_rms': np.sqrt(np.mean(values**2)),
                f'{col}_var': np.var(values),
                f'{col}_skew': as_series.skew(),
                f'{col}_kurtosis': as_series.kurtosis()
            })

        features_list.append(features_dict)

    return pd.DataFrame(features_list)

# Extract features
# Windows are cut from the smoothed frame; the statistics are computed on the
# columns listed in sensor_columns.
print("Extracting time domain features...")
time_features = extract_time_domain_features(df_smoothed, window_size=8, overlap_ratio=0.25)

print(f"Extracted features shape: {time_features.shape}")
# 'window_start', 'timestamp' and 'class' are bookkeeping columns, not features
print(f"Number of features per window: {len([col for col in time_features.columns if col not in ['window_start', 'timestamp', 'class']])}")
print(f"Feature columns: {[col for col in time_features.columns if '_mean' in col][:5]}...")  # Show first 5 mean features

# Display sample features
time_features.head()

# 5. Jerk Calculation

In [None]:
# Calculate jerk (derivative of acceleration)
def calculate_jerk_features(data):
    """Calculate jerk (rate of change of acceleration) from acceleration data.

    Parameters:
        data: DataFrame with 'Timestamp', 'AccX', 'AccY' and 'AccZ' columns.

    Returns:
        A copy of `data` with 'JerkX', 'JerkY', 'JerkZ' and 'Jerk_magnitude'
        added. The first row and any row whose timestamp equals the previous
        one get a jerk of 0 instead of NaN/inf. Remaining NaN values anywhere
        in the frame are replaced by 0.
    """
    jerk_data = data.copy()

    # Time difference for derivative calculation
    time_diff = data['Timestamp'].diff().fillna(0.5)  # Default to 0.5s
    # A zero interval would give +/-inf jerk, which fillna() cannot remove;
    # mask it to NaN so the affected rows fall back to 0 below.
    safe_time_diff = time_diff.where(time_diff != 0)

    # Calculate jerk for each acceleration axis
    for axis in ('X', 'Y', 'Z'):
        jerk_data[f'Jerk{axis}'] = (data[f'Acc{axis}'].diff() / safe_time_diff).fillna(0)

    # Calculate jerk magnitude
    jerk_data['Jerk_magnitude'] = np.sqrt(
        jerk_data['JerkX']**2 + jerk_data['JerkY']**2 + jerk_data['JerkZ']**2
    )

    # Fill any remaining NaN values in the frame
    jerk_data = jerk_data.fillna(0)

    return jerk_data

# Calculate jerk features
# Jerk is derived from the frame produced by the smoothing step.
print("Calculating jerk features...")
df_with_jerk = calculate_jerk_features(df_smoothed)

# Names of the columns added by calculate_jerk_features; read as a global by
# add_jerk_to_time_features below.
jerk_columns = ['JerkX', 'JerkY', 'JerkZ', 'Jerk_magnitude']
print(f"Added jerk columns: {jerk_columns}")

# Add jerk-based statistical features to time features
def add_jerk_to_time_features(time_features, jerk_data, window_size=8, overlap_ratio=0.25,
                              columns=None):
    """Add per-window jerk statistics to existing time domain features.

    Parameters:
        time_features: DataFrame with a 'window_start' column of row positions
            into `jerk_data`; modified in place.
        jerk_data: DataFrame holding the jerk columns.
        window_size: number of rows per window.
        overlap_ratio: unused; kept for signature compatibility.
        columns: jerk column names to summarize; defaults to the module-level
            `jerk_columns`. Names absent from `jerk_data` are skipped.

    Returns:
        `time_features`, with '<col>_mean', '<col>_std', '<col>_max' and
        '<col>_rms' columns added. Windows that would run past the end of
        `jerk_data` are left as NaN.
    """
    if columns is None:
        columns = jerk_columns

    present = [col for col in columns if col in jerk_data.columns]
    signals = {col: jerk_data[col].to_numpy() for col in present}
    n_windows = len(time_features)
    n_samples = len(jerk_data)

    stats = {
        f'{col}_{stat}': np.full(n_windows, np.nan)
        for col in present
        for stat in ('mean', 'std', 'max', 'rms')
    }

    # Read the start positions from the column rather than via iterrows():
    # iterrows() upcasts each row to float, and a float cannot be used as a
    # positional slice bound.
    starts = time_features['window_start'].to_numpy()
    for pos, start in enumerate(starts):
        start_idx = int(start)
        end_idx = start_idx + window_size
        if end_idx > n_samples:
            continue

        for col in present:
            values = signals[col][start_idx:end_idx]
            stats[f'{col}_mean'][pos] = np.mean(values)
            stats[f'{col}_std'][pos] = np.std(values)
            stats[f'{col}_max'][pos] = np.max(values)
            stats[f'{col}_rms'][pos] = np.sqrt(np.mean(values**2))

    # Assign whole columns at once instead of cell-by-cell .loc writes
    for name, column_values in stats.items():
        time_features[name] = column_values

    return time_features

# Add jerk features to time domain features
time_features = add_jerk_to_time_features(time_features, df_with_jerk)

print(f"Updated features shape: {time_features.shape}")

# Visualize jerk signals
# 3x2 grid: left column shows each acceleration axis, right column the jerk
# of the same axis, both over rows 1000-1199.
plt.figure(figsize=(15, 10))

for i, col in enumerate(['AccX', 'AccY', 'AccZ']):
    plt.subplot(3, 2, i*2+1)
    sample_data = df_with_jerk.iloc[1000:1200]
    plt.plot(sample_data['Timestamp'], sample_data[col], label=f'{col}')
    plt.title(f'Acceleration - {col}')
    plt.ylabel('Acceleration')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.subplot(3, 2, i*2+2)
    # col[-1] is the axis letter (X/Y/Z), used to pick the matching Jerk column
    plt.plot(sample_data['Timestamp'], sample_data[f'Jerk{col[-1]}'], label=f'Jerk{col[-1]}', color='red')
    plt.title(f'Jerk - {col[-1]} axis')
    plt.ylabel('Jerk')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 6. Magnitude Calculations

In [None]:
# Calculate magnitude features
def calculate_magnitude_features(data):
    """Return a copy of `data` extended with vector-magnitude features.

    Adds (or overwrites) 'Acc_magnitude' and 'Gyro_magnitude', the Euclidean
    norms of the three accelerometer and gyroscope axes, plus their sum
    ('Total_magnitude') and their quotient ('Magnitude_ratio'). The input
    frame is left untouched.
    """
    acc_norm = np.sqrt(data['AccX']**2 + data['AccY']**2 + data['AccZ']**2)
    gyro_norm = np.sqrt(data['GyroX']**2 + data['GyroY']**2 + data['GyroZ']**2)

    derived = {
        'Acc_magnitude': acc_norm,
        'Gyro_magnitude': gyro_norm,
        'Total_magnitude': acc_norm + gyro_norm,
        # The small epsilon keeps the ratio finite when the gyroscope norm is zero
        'Magnitude_ratio': acc_norm / (gyro_norm + 1e-8),
    }

    enriched = data.copy()
    for column_name, column_values in derived.items():
        enriched[column_name] = column_values
    return enriched

# Calculate magnitude features
# Computed on the jerk-augmented frame, so the result carries both feature sets.
print("Calculating magnitude features...")
df_with_magnitudes = calculate_magnitude_features(df_with_jerk)

# Column names summarized per window by add_magnitude_to_time_features below
# (read there as a global).
magnitude_columns = ['Acc_magnitude', 'Gyro_magnitude', 'Total_magnitude', 'Magnitude_ratio']
print(f"Added magnitude columns: {magnitude_columns}")

# Add magnitude features to time domain features
def add_magnitude_to_time_features(time_features, mag_data, window_size=8, overlap_ratio=0.25,
                                   columns=None):
    """Add per-window magnitude statistics to existing time domain features.

    Parameters:
        time_features: DataFrame with a 'window_start' column of row positions
            into `mag_data`; modified in place.
        mag_data: DataFrame holding the magnitude columns.
        window_size: number of rows per window.
        overlap_ratio: unused; kept for signature compatibility.
        columns: magnitude column names to summarize; defaults to the
            module-level `magnitude_columns`. Names absent from `mag_data`
            are skipped.

    Returns:
        `time_features`, with '<col>_mean', '<col>_std', '<col>_max',
        '<col>_min' and '<col>_range' columns added. Windows that would run
        past the end of `mag_data` are left as NaN.
    """
    if columns is None:
        columns = magnitude_columns

    present = [col for col in columns if col in mag_data.columns]
    signals = {col: mag_data[col].to_numpy() for col in present}
    n_windows = len(time_features)
    n_samples = len(mag_data)

    stats = {
        f'{col}_{stat}': np.full(n_windows, np.nan)
        for col in present
        for stat in ('mean', 'std', 'max', 'min', 'range')
    }

    # Read the start positions from the column rather than via iterrows():
    # iterrows() upcasts each row to float, and a float cannot be used as a
    # positional slice bound.
    starts = time_features['window_start'].to_numpy()
    for pos, start in enumerate(starts):
        start_idx = int(start)
        end_idx = start_idx + window_size
        if end_idx > n_samples:
            continue

        for col in present:
            values = signals[col][start_idx:end_idx]
            lowest, highest = np.min(values), np.max(values)
            stats[f'{col}_mean'][pos] = np.mean(values)
            stats[f'{col}_std'][pos] = np.std(values)
            stats[f'{col}_max'][pos] = highest
            stats[f'{col}_min'][pos] = lowest
            stats[f'{col}_range'][pos] = highest - lowest

    # Assign whole columns at once instead of cell-by-cell .loc writes
    for name, column_values in stats.items():
        time_features[name] = column_values

    return time_features

# Add magnitude features
time_features = add_magnitude_to_time_features(time_features, df_with_magnitudes)

print(f"Updated features shape: {time_features.shape}")

# Visualize magnitude features by class
plt.figure(figsize=(15, 8))

# Box plots for magnitude features by class (top row of the 2x3 grid)
magnitude_cols_to_plot = ['Acc_magnitude_mean', 'Gyro_magnitude_mean', 'Total_magnitude_mean']

for i, col in enumerate(magnitude_cols_to_plot):
    plt.subplot(2, 3, i+1)
    sns.boxplot(data=time_features, x='class', y=col)
    plt.title(f'{col} by Class')
    plt.xlabel('Class (0=SLOW, 1=NORMAL, 2=AGGRESSIVE)')

# Time series plots (bottom row).
# Colors are looked up per class label so each class keeps the same color in
# every panel, regardless of which label happens to appear first in the slice.
class_colors = {'SLOW': 'green', 'NORMAL': 'blue', 'AGGRESSIVE': 'red'}
sample_data = df_with_magnitudes.iloc[1000:1500]
classes = sample_data['Class'].unique()

for i, col in enumerate(['Acc_magnitude', 'Gyro_magnitude', 'Total_magnitude']):
    plt.subplot(2, 3, i+4)

    for cls in classes:
        class_data = sample_data[sample_data['Class'] == cls]
        plt.plot(class_data['Timestamp'], class_data[col],
                 alpha=0.7, color=class_colors.get(cls, 'gray'), label=cls)

    plt.title(f'{col} Time Series')
    plt.xlabel('Time')
    plt.ylabel('Magnitude')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 7. Frequency Domain Features

In [None]:
# Extract frequency domain features
def calculate_frequency_features(data, sampling_rate=2):
    """Calculate frequency domain features of a 1-D signal using the FFT.

    Parameters:
        data: 1-D sequence of samples (must be non-empty; NumPy's FFT rejects
            empty input).
        sampling_rate: samples per second, used to label the FFT bins in Hz.

    Returns:
        dict with 'total_energy' (summed over the full two-sided spectrum),
        'mean_frequency' and 'spectral_centroid' (the same power-weighted mean
        frequency over the first half of the bins), three band energies, and
        the variance, skewness and kurtosis of the first-half power values.
        When the first half carries no power, the mean frequency and the
        centroid are 0.0 rather than NaN.
    """
    samples = np.asarray(data)

    # Apply FFT
    spectrum = np.fft.fft(samples)
    freqs = np.fft.fftfreq(len(samples), 1/sampling_rate)

    # Power spectral density (unnormalized squared magnitude)
    psd = np.abs(spectrum)**2

    # Keep the non-negative half of the spectrum for the spectral statistics
    half = len(freqs) // 2
    pos_freqs = freqs[:half]
    pos_psd = psd[:half]

    # Guard the weighted mean: an all-zero window would otherwise divide 0 by 0
    pos_power = np.sum(pos_psd)
    if pos_power == 0:
        centroid = 0.0
    else:
        centroid = np.sum(pos_freqs * pos_psd) / pos_power

    # Frequency band energy (0-0.5Hz, 0.5-1Hz, 1-2Hz).
    # NOTE: the bins kept above stay below sampling_rate/2, so at the default
    # 2 Hz rate the 1-2 Hz band is always empty and its energy is always 0.
    band1 = (pos_freqs >= 0) & (pos_freqs < 0.5)
    band2 = (pos_freqs >= 0.5) & (pos_freqs < 1.0)
    band3 = (pos_freqs >= 1.0) & (pos_freqs < 2.0)

    pos_psd_series = pd.Series(pos_psd)

    freq_features = {
        'total_energy': np.sum(psd),
        'mean_frequency': centroid,
        'spectral_centroid': centroid,
        'energy_band_0_0.5': np.sum(pos_psd[band1]),
        'energy_band_0.5_1': np.sum(pos_psd[band2]),
        'energy_band_1_2': np.sum(pos_psd[band3]),
        'spectral_variance': np.var(pos_psd),
        'spectral_skewness': pos_psd_series.skew(),
        'spectral_kurtosis': pos_psd_series.kurtosis(),
    }

    return freq_features

# Add frequency features to time domain features
def add_frequency_to_time_features(time_features, signal_data, window_size=8, overlap_ratio=0.25,
                                   columns=None, sampling_rate=2):
    """Add per-window frequency domain features to existing time domain features.

    Parameters:
        time_features: DataFrame with a 'window_start' column of row positions
            into `signal_data`; modified in place.
        signal_data: DataFrame holding the sensor columns.
        window_size: number of rows per window.
        overlap_ratio: unused; kept for signature compatibility.
        columns: sensor column names to analyse; defaults to the module-level
            `sensor_columns`. Names absent from `signal_data` are skipped.
        sampling_rate: forwarded to `calculate_frequency_features`.

    Returns:
        `time_features`, with one '<col>_<feature>' column per sensor column
        and per key returned by `calculate_frequency_features`. Windows that
        would run past the end of `signal_data` are left as NaN.
    """
    if columns is None:
        columns = sensor_columns

    present = [col for col in columns if col in signal_data.columns]
    signals = {col: signal_data[col].to_numpy() for col in present}
    n_windows = len(time_features)
    n_samples = len(signal_data)

    # Feature names are only known once the first window has been analysed,
    # so result columns are created lazily (dict order keeps column order).
    stats = {}

    # Read the start positions from the column rather than via iterrows():
    # iterrows() upcasts each row to float, and a float cannot be used as a
    # positional slice bound.
    starts = time_features['window_start'].to_numpy()
    for pos, start in enumerate(starts):
        start_idx = int(start)
        end_idx = start_idx + window_size
        if end_idx > n_samples:
            continue

        for col in present:
            # Named window_signal to avoid shadowing the imported scipy `signal`
            window_signal = signals[col][start_idx:end_idx]
            freq_features = calculate_frequency_features(window_signal, sampling_rate)

            for freq_name, freq_value in freq_features.items():
                column = stats.setdefault(f'{col}_{freq_name}', np.full(n_windows, np.nan))
                column[pos] = freq_value

    # Assign whole columns at once instead of cell-by-cell .loc writes
    for name, column_values in stats.items():
        time_features[name] = column_values

    return time_features

# Frequency features are computed per window on the raw (unsmoothed) sensor
# columns of the fully augmented frame and appended to time_features.
print("Calculating frequency domain features...")
time_features = add_frequency_to_time_features(time_features, df_with_magnitudes)

print(f"Updated features shape: {time_features.shape}")

# Visualize frequency domain analysis
def plot_frequency_analysis(data, column, sample_start=1000, sample_length=200, sampling_rate=2):
    """Plot a slice of one column in the time domain next to its FFT magnitude.

    Parameters:
        data: DataFrame containing `column`.
        column: name of the signal column to analyse.
        sample_start: first row position of the slice.
        sample_length: number of rows in the slice.
        sampling_rate: samples per second, used to label the frequency axis
            in Hz (defaults to the 2 Hz rate used throughout this notebook).
    """
    sample_data = data.iloc[sample_start:sample_start+sample_length][column].values

    # FFT of the slice and the matching bin frequencies
    spectrum = np.fft.fft(sample_data)
    freqs = np.fft.fftfreq(len(sample_data), 1/sampling_rate)
    half = len(freqs) // 2  # only the non-negative half is plotted

    plt.figure(figsize=(12, 4))

    # Time domain
    plt.subplot(1, 2, 1)
    plt.plot(sample_data)
    plt.title(f'Time Domain - {column}')
    plt.xlabel('Sample')
    plt.ylabel('Amplitude')
    plt.grid(True, alpha=0.3)

    # Frequency domain
    plt.subplot(1, 2, 2)
    plt.plot(freqs[:half], np.abs(spectrum[:half]))
    plt.title(f'Frequency Domain - {column}')
    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Magnitude')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Show frequency analysis for a few sensors
# Uses the function's default slice (200 rows starting at row 1000).
for col in ['AccX', 'AccY', 'GyroX']:
    plot_frequency_analysis(df_with_magnitudes, col)

# 8. Feature Selection and Dimensionality Reduction

In [None]:
# Prepare features for machine learning
print("Preparing features for machine learning...")

# Remove non-feature columns (window bookkeeping and the target)
feature_columns = [col for col in time_features.columns 
                  if col not in ['window_start', 'timestamp', 'class']]

# Missing feature values are zero-filled here, before the inf handling below
X = time_features[feature_columns].fillna(0)
y = time_features['class']

print(f"Total features extracted: {len(feature_columns)}")
print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

# Handle infinite values and missing data
# Since NaN was already replaced by 0 above, the median fill only affects
# entries that were +/-inf.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

# Standardize features
# NOTE(review): this rebinds `scaler`, replacing the MinMaxScaler fitted during
# preprocessing. It is also fitted on all windows before the train/test split
# performed later, so test rows influence the scaling - confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction (all components kept)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Determine number of components for 95% variance
# argmax returns the first index where the cumulative ratio reaches 0.95
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

print(f"Number of components for 95% variance: {n_components_95}")

# Select top features based on PCA
# A feature's score is its mean absolute loading across the retained
# components; the original features (not the PCA projections) are kept.
n_top_features = min(50, len(feature_columns))
feature_importance = np.abs(pca.components_[:n_components_95]).mean(axis=0)
# argsort is ascending, so the highest-scoring features are at the end
top_feature_indices = np.argsort(feature_importance)[-n_top_features:]
top_features = [feature_columns[i] for i in top_feature_indices]

print(f"Selected top {n_top_features} features")
print(f"Top 10 features: {top_features[-10:]}")

# Create final feature matrix (unscaled and scaled views of the same columns)
X_final = X.iloc[:, top_feature_indices]
X_final_scaled = X_scaled[:, top_feature_indices]

# Visualize PCA results
plt.figure(figsize=(15, 5))

# Explained variance (first 50 components at most)
plt.subplot(1, 3, 1)
plt.plot(range(1, min(51, len(pca.explained_variance_ratio_)+1)), 
         pca.explained_variance_ratio_[:50], 'bo-')
plt.axhline(y=0.05, color='r', linestyle='--', alpha=0.7)
plt.title('PCA Explained Variance Ratio')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True, alpha=0.3)

# Cumulative variance, with the 95% threshold and the chosen component count
plt.subplot(1, 3, 2)
plt.plot(range(1, min(51, len(cumulative_variance)+1)), 
         cumulative_variance[:50], 'ro-')
plt.axhline(y=0.95, color='b', linestyle='--', alpha=0.7)
plt.axvline(x=n_components_95, color='g', linestyle='--', alpha=0.7)
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance')
plt.grid(True, alpha=0.3)

# Feature importance (the 20 largest scores, in ascending order)
plt.subplot(1, 3, 3)
sorted_importance = np.sort(feature_importance)[-20:]
plt.barh(range(len(sorted_importance)), sorted_importance)
plt.title('Top 20 Feature Importances')
plt.xlabel('Importance Score')
plt.ylabel('Feature Rank')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Feature correlation analysis
plt.figure(figsize=(12, 8))
correlation_matrix = X_final.corr()
# Mask the upper triangle (including the diagonal) so each pair is shown once
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=False, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix (Top Features)')
plt.tight_layout()
plt.show()

# 9. Data Preparation for Machine Learning

In [None]:
# Create train/validation/test splits
print("Creating train/validation/test splits...")

# Split data stratified by class
# First hold out 20% as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X_final_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# ... then take 25% of the remaining 80% as validation (60/20/20 overall)
# NOTE(review): the rows are windows cut with a stride smaller than the window
# length, so neighbouring windows share samples; a random split can therefore
# place overlapping windows in different sets - confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

# Check class distribution in splits
print("\nClass distribution in splits:")
print("Training:", y_train.value_counts(normalize=True).sort_index())
print("Validation:", y_val.value_counts(normalize=True).sort_index())
print("Test:", y_test.value_counts(normalize=True).sort_index())

# Create data loader class for organized data management
@dataclass
class DataLoader:
    """Container bundling the train/validation/test splits with their metadata.

    Besides holding the arrays it offers helpers that derive 'balanced'
    class and sample weights from the training labels.
    """
    # Feature matrices of the three splits
    X_train: np.ndarray
    X_val: np.ndarray
    X_test: np.ndarray
    # Labels of the three splits (annotations are not enforced at runtime)
    y_train: np.ndarray
    y_val: np.ndarray
    y_test: np.ndarray
    # Names of the feature columns, in matrix column order
    feature_names: list
    # Scaler that was fitted on the features
    scaler: StandardScaler

    def get_class_weights(self):
        """Calculate class weights for handling imbalanced data.

        Returns:
            dict mapping each class label present in `y_train` to its
            'balanced' weight. Keys are the actual labels, not enumeration
            positions, so the mapping stays correct when the labels are not
            exactly 0..n-1.
        """
        classes = np.unique(self.y_train)
        weights = class_weight.compute_class_weight(
            'balanced', classes=classes, y=self.y_train
        )
        # tolist() converts NumPy scalars to plain Python values
        return dict(zip(classes.tolist(), weights.tolist()))

    def get_sample_weights(self):
        """Get per-sample 'balanced' weights for the training labels."""
        return class_weight.compute_sample_weight('balanced', self.y_train)

# Initialize data loader
# `scaler` is whatever object the name is bound to at this point in the notebook.
data_loader = DataLoader(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
    feature_names=top_features,
    scaler=scaler
)

class_weights = data_loader.get_class_weights()
print(f"\nCalculated class weights: {class_weights}")

# Visualize class distributions
# One bar chart per split, bars ordered by class index
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, (split_name, split_y) in enumerate([('Train', y_train), ('Validation', y_val), ('Test', y_test)]):
    split_counts = split_y.value_counts().sort_index()
    colors = ['green', 'blue', 'red']
    
    axes[i].bar(split_counts.index, split_counts.values, color=colors)
    axes[i].set_title(f'{split_name} Set Class Distribution')
    axes[i].set_xlabel('Class (0=SLOW, 1=NORMAL, 2=AGGRESSIVE)')
    axes[i].set_ylabel('Count')
    axes[i].set_xticks([0, 1, 2])
    
    # Add percentage labels
    # The label's x position is the enumeration index j, which coincides with
    # the class index only when every class 0..2 is present in the split.
    total = len(split_y)
    for j, count in enumerate(split_counts.values):
        axes[i].text(j, count + total*0.01, f'{count/total*100:.1f}%', 
                    ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("Data preparation completed successfully!")

# 10. Traditional Machine Learning Models

In [None]:
# Define model evaluation class
class ModelEvaluator:
    """Train classifiers on a DataLoader's splits and collect their metrics.

    Fitted estimators are stored in `self.models` and the per-model metric
    dictionaries in `self.results`, both keyed by model name.
    """

    # Fixed label set and display names, so reports keep the same shape even
    # when a class is absent from the test labels or predictions.
    CLASS_LABELS = [0, 1, 2]
    CLASS_NAMES = ['SLOW', 'NORMAL', 'AGGRESSIVE']

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.models = {}
        self.results = {}

    def train_and_evaluate(self, model, model_name, **model_params):
        """Train and evaluate a model.

        Parameters:
            model: estimator class (not an instance); called with `model_params`.
            model_name: key under which the model and results are stored. The
                names 'SVM' and 'Logistic Regression' additionally wrap the
                estimator in a StandardScaler pipeline.
            **model_params: keyword arguments for the estimator constructor.

        Returns:
            dict with the fitted model, training time in seconds, train /
            validation / test accuracy, test balanced accuracy, weighted test
            precision and recall, test predictions, test probabilities (None
            if the model has no `predict_proba`), the confusion matrix and the
            text classification report.
        """
        separator = '=' * 50
        print(f"\n{separator}")
        print(f"Training {model_name}")
        print(f"{separator}")

        loader = self.data_loader

        # Initialize model
        if model_name in ['SVM', 'Logistic Regression']:
            # Use pipeline with scaling for SVM and LogReg
            clf = make_pipeline(StandardScaler(), model(**model_params))
        else:
            clf = model(**model_params)

        # Train model
        start_time = datetime.now()
        clf.fit(loader.X_train, loader.y_train)
        training_time = (datetime.now() - start_time).total_seconds()

        # Predictions
        y_pred_train = clf.predict(loader.X_train)
        y_pred_val = clf.predict(loader.X_val)
        y_pred_test = clf.predict(loader.X_test)

        # Probabilities for ROC analysis
        if hasattr(clf, 'predict_proba'):
            y_proba_test = clf.predict_proba(loader.X_test)
        else:
            y_proba_test = None

        # Calculate metrics. Passing `labels` explicitly keeps target_names
        # aligned and the confusion matrix 3x3 even if a class is missing;
        # zero_division=0 scores a never-predicted class as 0 without warning.
        results = {
            'model': clf,
            'training_time': training_time,
            'train_accuracy': accuracy_score(loader.y_train, y_pred_train),
            'val_accuracy': accuracy_score(loader.y_val, y_pred_val),
            'test_accuracy': accuracy_score(loader.y_test, y_pred_test),
            'test_balanced_accuracy': balanced_accuracy_score(loader.y_test, y_pred_test),
            'test_precision': precision_score(loader.y_test, y_pred_test,
                                              average='weighted', zero_division=0),
            'test_recall': recall_score(loader.y_test, y_pred_test,
                                        average='weighted', zero_division=0),
            'predictions': y_pred_test,
            'probabilities': y_proba_test,
            'confusion_matrix': confusion_matrix(loader.y_test, y_pred_test,
                                                 labels=self.CLASS_LABELS),
            'classification_report': classification_report(loader.y_test, y_pred_test,
                                                           labels=self.CLASS_LABELS,
                                                           target_names=self.CLASS_NAMES,
                                                           zero_division=0)
        }

        # Store results
        self.models[model_name] = clf
        self.results[model_name] = results

        # Print results
        print(f"Training Time: {training_time:.2f} seconds")
        print(f"Train Accuracy: {results['train_accuracy']:.4f}")
        print(f"Validation Accuracy: {results['val_accuracy']:.4f}")
        print(f"Test Accuracy: {results['test_accuracy']:.4f}")
        print(f"Test Balanced Accuracy: {results['test_balanced_accuracy']:.4f}")
        print(f"Test Precision: {results['test_precision']:.4f}")
        print(f"Test Recall: {results['test_recall']:.4f}")

        return results

# Initialize evaluator
evaluator = ModelEvaluator(data_loader)

# Train multiple models: (estimator class, display name, constructor kwargs)
models_to_train = [
    (LogisticRegression, 'Logistic Regression', {'random_state': 42, 'max_iter': 1000}),
    (RandomForestClassifier, 'Random Forest', {'n_estimators': 100, 'random_state': 42, 'class_weight': 'balanced'}),
    (SVC, 'SVM', {'random_state': 42, 'class_weight': 'balanced', 'probability': True}),
    (GaussianNB, 'Naive Bayes', {}),
    (xgb.XGBClassifier, 'XGBoost', {'random_state': 42, 'eval_metric': 'mlogloss'})
]

# Train all models. One failing model must not abort the rest, so errors are
# reported and remembered rather than raised.
failed_models = []
for model_class, model_name, params in models_to_train:
    try:
        evaluator.train_and_evaluate(model_class, model_name, **params)
    except Exception as e:
        failed_models.append(model_name)
        print(f"Error training {model_name}: {str(e)}")

# Only claim success when every model actually trained
print(f"\n{'='*50}")
if failed_models:
    print(f"Training finished with errors in: {', '.join(failed_models)}")
else:
    print("All models trained successfully!")
print(f"{'='*50}")

# 11. Deep Learning Models

In [None]:
# Deep Learning Models
class DeepLearningEvaluator:
    """Build, train, evaluate and plot Keras MLP classifiers.

    The evaluator reads ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test`` from the supplied ``data_loader`` and asks it for
    class weights via ``get_class_weights()``. Trained models and their Keras
    ``History`` objects are kept in ``self.models`` / ``self.histories``,
    keyed by model name.
    """

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.models = {}
        self.histories = {}

    def create_mlp_model(self, input_dim, hidden_layers=None, dropout_rate=0.3):
        """Create a compiled Multi-Layer Perceptron with a 3-way softmax head.

        Args:
            input_dim: Number of input features.
            hidden_layers: Units per hidden layer; defaults to ``[64, 32, 16]``.
            dropout_rate: Dropout applied after each hidden layer (skipped
                when not greater than 0).

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        # Avoid a mutable default argument shared between calls.
        if hidden_layers is None:
            hidden_layers = [64, 32, 16]

        model = keras.Sequential()
        model.add(keras.layers.Input(shape=(input_dim,)))

        for units in hidden_layers:
            model.add(keras.layers.Dense(units, activation='relu'))
            model.add(keras.layers.BatchNormalization())
            if dropout_rate > 0:
                model.add(keras.layers.Dropout(dropout_rate))

        model.add(keras.layers.Dense(3, activation='softmax'))

        # Only accuracy is tracked by Keras. The built-in Precision/Recall
        # metrics expect labels shaped like the predictions, which does not
        # hold for integer labels with sparse_categorical_crossentropy;
        # precision and recall are computed with scikit-learn after training.
        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def train_neural_network(self, model, model_name, epochs=100, batch_size=32,
                           early_stopping_patience=10):
        """Fit ``model`` on the training split and evaluate it on the test split.

        Args:
            model: A compiled Keras model.
            model_name: Key under which the model and history are stored.
            epochs: Maximum number of training epochs.
            batch_size: Mini-batch size passed to ``fit``.
            early_stopping_patience: Epochs without ``val_accuracy``
                improvement before training stops.

        Returns:
            dict with the model, its history, training time, test metrics
            (accuracy, weighted precision/recall, balanced accuracy),
            predictions, probabilities, confusion matrix and the text
            classification report.
        """
        print(f"\n{'='*50}")
        print(f"Training {model_name}")
        print(f"{'='*50}")

        # Callbacks
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=early_stopping_patience,
                restore_best_weights=True
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-7
            )
        ]

        # Calculate class weights
        class_weights = self.data_loader.get_class_weights()

        # Train model
        start_time = datetime.now()
        history = model.fit(
            self.data_loader.X_train, self.data_loader.y_train,
            validation_data=(self.data_loader.X_val, self.data_loader.y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            class_weight=class_weights,
            verbose=1
        )
        training_time = (datetime.now() - start_time).total_seconds()

        # Predictions
        y_true = self.data_loader.y_test
        y_pred_proba = model.predict(self.data_loader.X_test, verbose=0)
        y_pred = np.argmax(y_pred_proba, axis=1)

        # Metrics are derived from the predicted labels so they do not depend
        # on which metrics the model was compiled with, and use the same
        # weighted averaging as the classical-model evaluator.
        test_accuracy = accuracy_score(y_true, y_pred)
        test_precision = precision_score(y_true, y_pred, average='weighted')
        test_recall = recall_score(y_true, y_pred, average='weighted')

        # Store results
        results = {
            'model': model,
            'history': history,
            'training_time': training_time,
            'test_accuracy': test_accuracy,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
            'predictions': y_pred,
            'probabilities': y_pred_proba,
            'confusion_matrix': confusion_matrix(y_true, y_pred),
            'classification_report': classification_report(
                y_true, y_pred,
                target_names=['SLOW', 'NORMAL', 'AGGRESSIVE']
            )
        }

        self.models[model_name] = model
        self.histories[model_name] = history

        # Print results
        print(f"Training Time: {training_time:.2f} seconds")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test Precision: {test_precision:.4f}")
        print(f"Test Recall: {test_recall:.4f}")
        print(f"Test Balanced Accuracy: {results['test_balanced_accuracy']:.4f}")

        return results

    def plot_training_history(self, model_name):
        """Plot the per-epoch training curves recorded for ``model_name``.

        One panel is drawn for each of accuracy, loss, precision and recall
        that is present in the stored history; the matching ``val_*`` curve
        is added when available. Prints a message and returns if no history
        is stored under ``model_name``.
        """
        if model_name not in self.histories:
            print(f"No history found for {model_name}")
            return

        curves = self.histories[model_name].history

        # Only plot what was actually recorded instead of assuming fixed keys.
        candidates = [
            ('accuracy', 'Accuracy'),
            ('loss', 'Loss'),
            ('precision', 'Precision'),
            ('recall', 'Recall'),
        ]
        panels = [(key, title) for key, title in candidates if key in curves]
        if not panels:
            print(f"No plottable metrics found for {model_name}")
            return

        n_rows = (len(panels) + 1) // 2
        fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
        flat_axes = axes.ravel()

        for ax, (key, title) in zip(flat_axes, panels):
            ax.plot(curves[key], label=f'Train {title}')
            val_key = f'val_{key}'
            if val_key in curves:
                ax.plot(curves[val_key], label=f'Val {title}')
            ax.set_title(f'Model {title}')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(title)
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Hide a trailing empty panel when the metric count is odd.
        for ax in flat_axes[len(panels):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

# Initialize deep learning evaluator
dl_evaluator = DeepLearningEvaluator(data_loader)

# Create and train MLP models.
# The input width is taken from the loader's training matrix because that is
# the data train_neural_network() actually feeds to model.fit().
input_dim = data_loader.X_train.shape[1]

# Simple MLP
simple_mlp = dl_evaluator.create_mlp_model(
    input_dim=input_dim,
    hidden_layers=[32, 16],
    dropout_rate=0.2
)

simple_mlp_results = dl_evaluator.train_neural_network(
    simple_mlp, 'Simple MLP', epochs=50, batch_size=32
)

# Complex MLP
complex_mlp = dl_evaluator.create_mlp_model(
    input_dim=input_dim,
    hidden_layers=[128, 64, 32, 16],
    dropout_rate=0.3
)

complex_mlp_results = dl_evaluator.train_neural_network(
    complex_mlp, 'Complex MLP', epochs=50, batch_size=32
)

# Plot training histories
print("Plotting training histories...")
dl_evaluator.plot_training_history('Simple MLP')
dl_evaluator.plot_training_history('Complex MLP')

# 12. Model Comparison and Evaluation

In [None]:
# Comprehensive Model Comparison
def create_model_comparison():
    """Collect headline test metrics of every trained model into one table.

    Reads the notebook-level ``evaluator.results`` (classical models) and the
    ``simple_mlp_results`` / ``complex_mlp_results`` dicts (deep learning).

    Returns:
        (comparison_df, all_results): a DataFrame with one row per model,
        rounded to 4 decimals, and the unrounded nested dict it was built from.
    """
    # Display label -> key in each per-model results dict
    metric_keys = {
        'Test Accuracy': 'test_accuracy',
        'Test Balanced Accuracy': 'test_balanced_accuracy',
        'Test Precision': 'test_precision',
        'Test Recall': 'test_recall',
        'Training Time': 'training_time',
    }

    # Classical models first, then the two MLPs (same order as before)
    result_sources = {
        **evaluator.results,
        'Simple MLP': simple_mlp_results,
        'Complex MLP': complex_mlp_results,
    }

    all_results = {
        name: {label: model_results[key] for label, key in metric_keys.items()}
        for name, model_results in result_sources.items()
    }

    # Models become rows after the transpose
    comparison_df = pd.DataFrame(all_results).T.round(4)

    return comparison_df, all_results

# Create model comparison
comparison_df, all_results = create_model_comparison()

# Rank once; the ranking is reused for the table, the top-3 and the best model
ranked_df = comparison_df.sort_values('Test Accuracy', ascending=False)

print("Model Performance Comparison:")
print("="*80)
print(ranked_df)

# Visualize model comparison: one bar chart per metric
# (column, title, y-axis label, bar colour)
bar_panels = [
    ('Test Accuracy', 'Test Accuracy Comparison', 'Accuracy', 'skyblue'),
    ('Test Balanced Accuracy', 'Test Balanced Accuracy Comparison', 'Balanced Accuracy', 'lightgreen'),
    ('Test Precision', 'Test Precision Comparison', 'Precision', 'orange'),
    ('Training Time', 'Training Time Comparison', 'Time (seconds)', 'salmon'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, (column, title, ylabel, color) in zip(axes.ravel(), bar_panels):
    ax.bar(comparison_df.index, comparison_df[column], color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Confusion matrices for best models
best_models = ranked_df.head(3).index

# Deep learning results live outside evaluator.results
dl_results_by_name = {
    'Simple MLP': simple_mlp_results,
    'Complex MLP': complex_mlp_results,
}

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, model_name in enumerate(best_models):
    if model_name in evaluator.results:
        cm = evaluator.results[model_name]['confusion_matrix']
    elif model_name in dl_results_by_name:
        cm = dl_results_by_name[model_name]['confusion_matrix']
    else:
        # Never plot a matrix left over from a previous iteration.
        print(f"No confusion matrix available for {model_name}")
        axes[i].set_visible(False)
        continue

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['SLOW', 'NORMAL', 'AGGRESSIVE'],
                yticklabels=['SLOW', 'NORMAL', 'AGGRESSIVE'],
                ax=axes[i])
    axes[i].set_title(f'{model_name} Confusion Matrix')
    axes[i].set_xlabel('Predicted')
    axes[i].set_ylabel('Actual')

# Fewer than three trained models: hide the unused panels
for ax in axes[len(best_models):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Identify best model
best_model_name = ranked_df.index[0]
print(f"\nBest performing model: {best_model_name}")
print(f"Test Accuracy: {comparison_df.loc[best_model_name, 'Test Accuracy']:.4f}")
print(f"Test Balanced Accuracy: {comparison_df.loc[best_model_name, 'Test Balanced Accuracy']:.4f}")

# 13. Behavior Score Calculation

In [None]:
# Comprehensive Behavior Score Calculation
class BehaviorScoreCalculator:
    """Turn classifier output into a 0-100 driving behavior score.

    A higher score means safer driving. The score is a probability-weighted
    base score plus non-positive adjustments for extreme feature values,
    clipped to [0, 100] and mapped to a risk category.

    Class indices are interpreted as 0=SLOW, 1=NORMAL, 2=AGGRESSIVE.
    """

    def __init__(self, best_model, best_model_name, feature_importance=None):
        self.best_model = best_model
        self.best_model_name = best_model_name
        self.feature_importance = feature_importance

    def calculate_base_score(self, predictions, probabilities):
        """Calculate the base behavior score for each sample.

        Args:
            predictions: Predicted class index per sample.
            probabilities: Per-sample class probabilities. A row of length 3
                is used as (P(SLOW), P(NORMAL), P(AGGRESSIVE)); any other
                value makes that sample fall back to its hard prediction.

        Returns:
            np.ndarray of base scores on a 0-100 scale (100 = safest).
        """
        base_scores = []

        for pred, prob in zip(predictions, probabilities):
            if hasattr(prob, '__len__') and len(prob) == 3:
                # Expected score under the predicted class distribution
                prob_slow, prob_normal, prob_aggressive = prob
                base_score = (
                    prob_slow * 100 +           # Full points for safe driving
                    prob_normal * 70 +          # Moderate points for normal driving
                    prob_aggressive * 20        # Low points for aggressive driving
                )
            elif pred == 0:      # SLOW (safe)
                base_score = 85
            elif pred == 1:      # NORMAL
                base_score = 70
            else:                # AGGRESSIVE
                base_score = 30

            base_scores.append(base_score)

        return np.array(base_scores)

    def calculate_feature_based_adjustments(self, features, feature_names):
        """Calculate score penalties from extreme feature values.

        Each feature column is assigned to at most one category by substring
        match on its name, checked in the order jerk, acceleration, magnitude.
        Columns that match none of them (gyroscope features included)
        contribute no penalty. Penalties use the absolute feature value.

        Args:
            features: 2-D array-like (samples x features).
                NOTE(review): thresholds assume standardized features - confirm.
            feature_names: Column names aligned with ``features``.

        Returns:
            np.ndarray with one non-positive adjustment per sample, capped at -20.
        """
        if len(features) == 0:
            return np.zeros(0)

        magnitudes = np.abs(np.asarray(features, dtype=float))
        adjustments = np.zeros(magnitudes.shape[0])

        # (name keywords, [(threshold, penalty), ...] highest threshold first)
        penalty_rules = [
            (('Jerk', 'jerk'), [(2, 5), (1.5, 2)]),
            (('Acc', 'acc'), [(2.5, 3), (2, 1)]),
            (('magnitude', 'Magnitude'), [(2, 2)]),
        ]

        # The category of a column depends only on its name, so it is resolved
        # once per column and the penalty is applied to all rows at once.
        for col_idx, name in zip(range(magnitudes.shape[1]), feature_names):
            name = str(name)
            for keywords, tiers in penalty_rules:
                if not any(keyword in name for keyword in keywords):
                    continue
                column = magnitudes[:, col_idx]
                penalty = np.zeros_like(column)
                # Apply the lowest threshold first so higher tiers overwrite it.
                for threshold, points in reversed(tiers):
                    penalty = np.where(column > threshold, points, penalty)
                adjustments -= penalty
                break  # first matching category wins

        return np.maximum(adjustments, -20)  # Cap negative adjustment

    def calculate_confidence_intervals(self, base_scores, model_confidence):
        """Calculate a heuristic uncertainty band around each score.

        The half-width is ``(1 - confidence) * 10`` points, so higher model
        confidence gives a narrower band; bounds are clamped to [0, 100].

        Returns:
            list of dicts with keys ``lower``, ``upper`` and ``width``
            (``width`` is the half-width before clamping).
        """
        confidence_intervals = []

        for score, confidence in zip(base_scores, model_confidence):
            half_width = (1 - confidence) * 10  # Max ±10 points
            confidence_intervals.append({
                'lower': max(0, score - half_width),
                'upper': min(100, score + half_width),
                'width': half_width
            })

        return confidence_intervals

    @staticmethod
    def _categorize_risk(scores):
        """Map each score to LOW/MEDIUM/HIGH/VERY_HIGH risk (cut-offs 80/60/40)."""
        categories = []
        for score in scores:
            if score >= 80:
                categories.append('LOW_RISK')
            elif score >= 60:
                categories.append('MEDIUM_RISK')
            elif score >= 40:
                categories.append('HIGH_RISK')
            else:
                categories.append('VERY_HIGH_RISK')
        return categories

    def calculate_comprehensive_score(self, X_test, y_test, feature_names=None):
        """Calculate comprehensive behavior scores for a feature matrix.

        Args:
            X_test: Feature matrix to score.
            y_test: Unused; kept for interface compatibility.
            feature_names: Column names of ``X_test``. When omitted, the
                notebook-level ``data_loader.feature_names`` is used.

        Returns:
            dict with ``final_scores``, ``base_scores``,
            ``feature_adjustments``, ``model_confidence``,
            ``confidence_intervals``, ``risk_categories``, ``predictions``
            and ``probabilities``.
        """
        print("Calculating comprehensive behavior scores...")

        if feature_names is None:
            feature_names = data_loader.feature_names

        raw_output = np.asarray(self.best_model.predict(X_test))

        if hasattr(self.best_model, 'predict_proba'):
            predictions = raw_output
            probabilities = self.best_model.predict_proba(X_test)
            model_confidence = np.max(probabilities, axis=1)
        elif raw_output.ndim == 2 and raw_output.shape[1] > 1:
            # Models whose predict() already returns class probabilities
            # (e.g. a Keras softmax network): derive the labels from them
            # instead of using a probability row as a class index.
            probabilities = raw_output
            predictions = np.argmax(probabilities, axis=1)
            model_confidence = np.max(probabilities, axis=1)
        else:
            # Labels only: create pseudo-probabilities with an assumed 80%
            # confidence in the predicted class and the rest split evenly.
            predictions = raw_output.ravel().astype(int)
            n_samples = len(predictions)
            probabilities = np.full((n_samples, 3), (1 - 0.8) / 2)
            probabilities[np.arange(n_samples), predictions] = 0.8
            model_confidence = np.full(n_samples, 0.8)

        # Calculate base scores
        base_scores = self.calculate_base_score(predictions, probabilities)

        # Calculate feature-based adjustments
        feature_adjustments = self.calculate_feature_based_adjustments(
            X_test, feature_names
        )

        # Calculate final scores
        final_scores = np.clip(base_scores + feature_adjustments, 0, 100)

        # Calculate confidence intervals
        confidence_intervals = self.calculate_confidence_intervals(
            final_scores, model_confidence
        )

        return {
            'final_scores': final_scores,
            'base_scores': base_scores,
            'feature_adjustments': feature_adjustments,
            'model_confidence': model_confidence,
            'confidence_intervals': confidence_intervals,
            'risk_categories': self._categorize_risk(final_scores),
            'predictions': predictions,
            'probabilities': probabilities
        }

# Get the best model for scoring
if best_model_name in evaluator.models:
    best_model = evaluator.models[best_model_name]
elif best_model_name == 'Simple MLP':
    best_model = simple_mlp
elif best_model_name == 'Complex MLP':
    best_model = complex_mlp
else:
    best_model = evaluator.models[list(evaluator.models.keys())[0]]  # Fallback

# Initialize behavior score calculator
behavior_calculator = BehaviorScoreCalculator(best_model, best_model_name)

# Calculate behavior scores
behavior_scores = behavior_calculator.calculate_comprehensive_score(
    data_loader.X_test, data_loader.y_test
)

final_scores = behavior_scores['final_scores']
# Plain array so boolean masks and colour mapping do not depend on a pandas index
y_test_array = np.asarray(data_loader.y_test)

print("Behavior score calculation completed!")
print(f"Score statistics:")
print(f"Mean score: {np.mean(final_scores):.2f}")
print(f"Std score: {np.std(final_scores):.2f}")
print(f"Min score: {np.min(final_scores):.2f}")
print(f"Max score: {np.max(final_scores):.2f}")

# Visualize behavior scores
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Score distribution
axes[0, 0].hist(final_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Behavior Score Distribution')
axes[0, 0].set_xlabel('Behavior Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(np.mean(final_scores), color='red', linestyle='--', label='Mean')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Scores by actual class
class_names = ['SLOW', 'NORMAL', 'AGGRESSIVE']
score_by_class = [final_scores[y_test_array == i] for i in range(3)]
axes[0, 1].boxplot(score_by_class)
# Tick labels are set separately: the boxplot `labels` keyword is deprecated
# in recent Matplotlib releases in favour of `tick_labels`.
axes[0, 1].set_xticklabels(class_names)
axes[0, 1].set_title('Behavior Scores by Actual Class')
axes[0, 1].set_ylabel('Behavior Score')
axes[0, 1].grid(True, alpha=0.3)

# Risk category distribution
risk_counts = pd.Series(behavior_scores['risk_categories']).value_counts()
axes[1, 0].pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%')
axes[1, 0].set_title('Risk Category Distribution')

# Score vs Confidence, coloured by actual class
confidence_scatter = axes[1, 1].scatter(
    behavior_scores['model_confidence'], final_scores,
    alpha=0.6, c=y_test_array, cmap='viridis'
)
axes[1, 1].set_title('Behavior Score vs Model Confidence')
axes[1, 1].set_xlabel('Model Confidence')
axes[1, 1].set_ylabel('Behavior Score')
# Attach the colorbar through the figure rather than storing it as an
# ad-hoc attribute on the Axes object.
fig.colorbar(confidence_scatter, ax=axes[1, 1])
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 14. Save Combined DataFrame with Behavior Scores

In [None]:
# Create comprehensive combined DataFrame with all features and behavior scores
print("Creating comprehensive combined DataFrame...")

# Rows are laid out as: all training samples first, then all test samples.
n_train = len(data_loader.X_train)
n_test = len(data_loader.X_test)

# Create combined feature DataFrame
combined_features = pd.DataFrame(
    np.vstack([data_loader.X_train, data_loader.X_test]),
    columns=data_loader.feature_names
)

# Create combined labels
combined_labels = np.hstack([data_loader.y_train, data_loader.y_test])

# Add metadata columns
combined_df = combined_features.copy()
combined_df['actual_class'] = combined_labels
combined_df['actual_class_name'] = [data_loader.label_encoder.classes_[i] for i in combined_labels]

# Calculate behavior scores for training data (for completeness).
# The calculator is reused so train and test rows are scored by exactly the
# same logic (probabilities, adjustments, intervals and risk categories).
print("Calculating behavior scores for training data...")
train_scores = behavior_calculator.calculate_comprehensive_score(
    data_loader.X_train, data_loader.y_train
)

# Same order as the stacked features: train, then test
scores_by_split = [train_scores, behavior_scores]

# Output column -> key in the score dicts
numeric_score_columns = {
    'behavior_score': 'final_scores',
    'base_score': 'base_scores',
    'feature_adjustment': 'feature_adjustments',
    'model_confidence': 'model_confidence',
    'predicted_class': 'predictions',
}
for column, key in numeric_score_columns.items():
    combined_df[column] = np.concatenate(
        [np.asarray(split[key], dtype=float) for split in scores_by_split]
    )

combined_df['predicted_class_name'] = [
    data_loader.label_encoder.classes_[int(pred)]
    for split in scores_by_split
    for pred in split['predictions']
]
combined_df['risk_category'] = [
    category for split in scores_by_split for category in split['risk_categories']
]

# Add confidence intervals
combined_df['confidence_lower'] = [
    interval['lower'] for split in scores_by_split for interval in split['confidence_intervals']
]
combined_df['confidence_upper'] = [
    interval['upper'] for split in scores_by_split for interval in split['confidence_intervals']
]

# Mark train/test split
combined_df['data_split'] = ['train'] * n_train + ['test'] * n_test

# Add model metadata
combined_df['model_used'] = best_model_name
combined_df['timestamp'] = pd.Timestamp.now()

# Display summary statistics
print(f"\nCombined DataFrame Summary:")
print(f"Total samples: {len(combined_df)}")
print(f"Features: {len(data_loader.feature_names)}")
print(f"Training samples: {len(combined_df[combined_df['data_split'] == 'train'])}")
print(f"Test samples: {len(combined_df[combined_df['data_split'] == 'test'])}")
print(f"\nBehavior Score Statistics:")
print(combined_df['behavior_score'].describe())

print(f"\nRisk Category Distribution:")
print(combined_df['risk_category'].value_counts())

print(f"\nClass Distribution:")
print(combined_df['actual_class_name'].value_counts())

# Display sample of the combined DataFrame
print(f"\nSample of Combined DataFrame:")
display_columns = ['actual_class_name', 'predicted_class_name', 'behavior_score', 
                  'risk_category', 'model_confidence', 'data_split']
print(combined_df[display_columns].head(10))

# Save the combined DataFrame
output_file = 'combined_telematics_data_with_behavior_scores.csv'
combined_df.to_csv(output_file, index=False)
print(f"\n✅ Combined DataFrame saved to: {output_file}")

# Save a summary report.
# Test accuracy comes from comparison_df (indexed by model name, covers both
# classical and deep learning models); evaluator.results is a plain dict and
# has no DataFrame-style .loc accessor.
summary_report = f"""
Telematics Behavior Analysis Summary Report
=========================================
Generated: {pd.Timestamp.now()}
Model Used: {best_model_name}

Dataset Statistics:
- Total Samples: {len(combined_df):,}
- Training Samples: {len(combined_df[combined_df['data_split'] == 'train']):,}
- Test Samples: {len(combined_df[combined_df['data_split'] == 'test']):,}
- Features: {len(data_loader.feature_names)}

Behavior Score Statistics:
- Mean Score: {combined_df['behavior_score'].mean():.2f}
- Std Score: {combined_df['behavior_score'].std():.2f}
- Min Score: {combined_df['behavior_score'].min():.2f}
- Max Score: {combined_df['behavior_score'].max():.2f}

Risk Category Distribution:
{combined_df['risk_category'].value_counts().to_string()}

Class Distribution:
{combined_df['actual_class_name'].value_counts().to_string()}

Model Performance:
- Best Model: {best_model_name}
- Test Accuracy: {comparison_df.loc[best_model_name, 'Test Accuracy']:.4f}

Files Generated:
1. combined_telematics_data_with_behavior_scores.csv - Complete dataset with scores
2. telematics_analysis_summary.txt - This summary report
"""

# Explicit encoding so the report is written identically on every platform
with open('telematics_analysis_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(f"📊 Summary report saved to: telematics_analysis_summary.txt")
print("\n Comprehensive telematics analysis completed!")
print("The combined notebook successfully integrates:")
print("✓ Feature engineering from both source notebooks")
print("✓ Multiple machine learning models")
print("✓ Comprehensive behavior scoring system")
print("✓ Risk categorization for insurance applications")
print("✓ Complete dataset with scores for further analysis")