# 3 Model Exploration

## 3.1 Imports and dataset loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, auc
)
import warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, which also hides deprecation and numerical/convergence warnings
# raised by later cells. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set visualization defaults
# The style sheet is applied first and the palette second; keep this order so
# the palette is not overwritten by whatever colour cycle the style sheet may
# define (presumably it defines one — confirm against the matplotlib version in use).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


In [5]:
# Directory holding the CSV read by this notebook (relative to the notebook).
output_dir = '../output/'

# Build the path once, then read the cleaned dataset into `df`.
cleaned_csv_path = output_dir + 'df_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

# Quick sanity report: row count and column count of the loaded frame.
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset loaded: 19701 rows, 41 columns


In [7]:
# Show column names, non-null counts and dtypes for the loaded frame.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19701 entries, 0 to 19700
Data columns (total 41 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Any                        19701 non-null  int64  
 1   zona                       19701 non-null  int64  
 2   nomCom                     19701 non-null  int64  
 3   nomDem                     19701 non-null  int64  
 4   F_MORTS                    19701 non-null  int64  
 5   F_FERITS_GREUS             19701 non-null  int64  
 6   F_FERITS_LLEUS             19701 non-null  int64  
 7   F_VICTIMES                 19701 non-null  int64  
 8   F_UNITATS_IMPLICADES       19701 non-null  int64  
 9   F_VIANANTS_IMPLICADES      19701 non-null  int64  
 10  F_BICICLETES_IMPLICADES    19701 non-null  int64  
 11  F_CICLOMOTORS_IMPLICADES   19701 non-null  int64  
 12  F_MOTOCICLETES_IMPLICADES  19701 non-null  int64  
 13  F_VEH_LLEUGERS_IMPLICADES  19701

In [6]:
# Summarise the binary target: absolute counts and relative shares per class,
# both ordered by class label so the two tables line up.
mortality = df['Mortalitat']
class_counts = mortality.value_counts().sort_index()
class_shares = mortality.value_counts(normalize=True).sort_index()

print("\nTarget variable distribution:")
print(class_counts)
print("\nTarget proportions:")
print(class_shares)



Target variable distribution:
Mortalitat
0    16789
1     2912
Name: count, dtype: int64

Target proportions:
Mortalitat
0    0.85219
1    0.14781
Name: proportion, dtype: float64


## 3.2 Data preparation and splits

### Separate features and target

In [12]:
# Target vector: the 'Mortalitat' column. Feature matrix: every other column.
# NOTE(review): the feature matrix keeps outcome counts such as 'F_MORTS',
# 'F_FERITS_GREUS', 'F_FERITS_LLEUS' and 'F_VICTIMES'. These look like
# post-accident outcomes and may encode the target directly (presumably
# 'Mortalitat' relates to 'F_MORTS') -- confirm they do not leak the label
# before training any model on X.
y = df['Mortalitat']
X = df.drop(columns=['Mortalitat'])

# Report the resulting shapes.
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# List the feature names so the reader can audit what goes into the model.
print(f"\nFeatures ({X.shape[1]} columns):")
print(list(X.columns))

Features shape: (19701, 40)
Target shape: (19701,)

Features (40 columns):
['Any', 'zona', 'nomCom', 'nomDem', 'F_MORTS', 'F_FERITS_GREUS', 'F_FERITS_LLEUS', 'F_VICTIMES', 'F_UNITATS_IMPLICADES', 'F_VIANANTS_IMPLICADES', 'F_BICICLETES_IMPLICADES', 'F_CICLOMOTORS_IMPLICADES', 'F_MOTOCICLETES_IMPLICADES', 'F_VEH_LLEUGERS_IMPLICADES', 'F_VEH_PESANTS_IMPLICADES', 'C_VELOCITAT_VIA', 'D_BOIRA', 'D_CARACT_ENTORN', 'D_CARRIL_ESPECIAL', 'D_CIRCULACIO_MESURES_ESP', 'D_CLIMATOLOGIA', 'D_FUNC_ESP_VIA', 'D_INTER_SECCIO', 'D_LIMIT_VELOCITAT', 'D_LLUMINOSITAT', 'D_REGULACIO_PRIORITAT', 'D_SENTITS_VIA', 'D_SUBTIPUS_ACCIDENT', 'D_SUBTIPUS_TRAM', 'D_SUBZONA', 'D_SUPERFICIE', 'D_TIPUS_VIA', 'D_TITULARITAT_VIA', 'D_TRACAT_ALTIMETRIC', 'D_VENT', 'hor', 'grupHor', 'tipAcc', 'tipDia', 'Mes']


### Stratified Train/Val/Test split

In [13]:
# Stage 1: keep 70% of the rows for training and park the remaining 30% in a
# temporary pool. Stratifying on y keeps the class ratio the same on both sides.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage 2: cut the temporary pool in half -> validation and test sets,
# again stratified (this time on the pool's own labels).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Size of each split, absolute and as a percentage of the full dataset.
print("Dataset splits:")
print(f"Train set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Val set:   {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"Test set:  {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Class proportions per split, to verify that stratification worked.
print("\nClass distribution across splits:")
for split_heading, split_labels in (
    ("\nTrain:", y_train),
    ("\nVal:", y_val),
    ("\nTest:", y_test),
):
    print(split_heading)
    print(split_labels.value_counts(normalize=True).sort_index())


Dataset splits:
Train set: 13790 samples (70.0%)
Val set:   2955 samples (15.0%)
Test set:  2956 samples (15.0%)

Class distribution across splits:

Train:
Mortalitat
0    0.852212
1    0.147788
Name: proportion, dtype: float64

Val:
Mortalitat
0    0.852115
1    0.147885
Name: proportion, dtype: float64

Test:
Mortalitat
0    0.852165
1    0.147835
Name: proportion, dtype: float64


### Feature scaling

In [15]:
# The scaler's statistics are learned from the training split only; the
# validation and test splits are transformed with those same statistics.
# NOTE(review): all 40 columns are scaled, including integer-coded columns
# that look categorical (e.g. 'zona', 'nomCom') -- confirm this is intended.
scaler = RobustScaler()

# Wrap each scaled array straight back into a DataFrame so that the original
# column names and row index are preserved.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val),
    index=X_val.index,
    columns=X_val.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

# Shapes must be unchanged by scaling.
print("Features scaled using RobustScaler")
print(f"Train set shape: {X_train_scaled.shape}")
print(f"Val set shape: {X_val_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")


Features scaled using RobustScaler
Train set shape: (13790, 40)
Val set shape: (2955, 40)
Test set shape: (2956, 40)


### Class weights (for imbalanced data)

In [17]:
# 'balanced' weights are inversely proportional to class frequency in the
# training labels, so the minority class receives the larger weight.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Map each class label to its weight (labels come out of np.unique sorted,
# matching the order of the weights array).
class_weight_dict = {
    label: weight for label, weight in zip(np.unique(y_train), class_weights)
}

print("Class Weights (for handling imbalance):")
print(f"Class 0 (No mortality): {class_weight_dict[0]:.4f}")
print(f"Class 1 (Mortality): {class_weight_dict[1]:.4f}")

Class Weights (for handling imbalance):
Class 0 (No mortality): 0.5867
Class 1 (Mortality): 3.3832
