In [None]:
import pandas as pd
import numpy as np

# 0.1) Load the raw accident records
# The CSV is decoded as Latin-1 and split on ";" (the separator this file uses).
# low_memory=False turns off pandas' chunked parsing of the file.
df = pd.read_csv(
    "Accidentes de tránsito en carreteras-2020-2021-Sutran.csv",
    sep=";",
    encoding="latin1",
    low_memory=False,
)

# 0.2) Normalize column names (no spaces / accents)
def normalize_col(s):
    """Turn a raw column header into a lowercase, accent-free snake_case name.

    Steps: coerce to text, trim and lowercase; drop the ordinal/degree signs
    ("º", "°"); strip diacritics from every letter (not only the Spanish
    vowels); replace each remaining non-alphanumeric character with "_"; and
    finally collapse runs of "_" and trim them from both ends.

    Parameters
    ----------
    s : Any
        The original column label. Non-string labels are converted with str().

    Returns
    -------
    str
        The normalized name (may be empty if nothing alphanumeric remains).
    """
    import unicodedata  # local import so this block stays self-contained

    s = str(s).strip().lower()
    # Remove these outright (e.g. "N°" -> "n") instead of turning them into "_"
    # or, for "º", letting NFKD decompose it into a stray "o".
    s = s.replace("º", "").replace("°", "")
    # NFKD splits accented letters into base letter + combining mark; dropping
    # the marks maps "á" -> "a", "ñ" -> "n", "ü" -> "u", "ã" -> "a", "ç" -> "c", ...
    s = "".join(
        ch for ch in unicodedata.normalize("NFKD", s)
        if not unicodedata.combining(ch)
    )
    s = "".join(ch if ch.isalnum() or ch == "_" else "_" for ch in s)
    # Collapse consecutive underscores and strip leading/trailing ones.
    return "_".join(filter(None, s.split("_")))

# Rename every header through normalize_col, then preview the first rows.
df.columns = list(map(normalize_col, df.columns))
df.head()


Unnamed: 0,fecha_corte,fecha,hora,departamento,codigo_via,kilometro,modalidad,fallecidos,heridos
0,20211222,20200101,05:40,LIMA,PE-1S,24,DESPISTE,0,0
1,20211222,20200101,16:30,CAJAMARCA,PE-3N,74,DESPISTE,0,0
2,20211222,20200101,07:45,PASCO,PE-3N,103,DESPISTE,0,1
3,20211222,20200101,18:30,CAJAMARCA,PE-08,111,DESPISTE,0,2
4,20211222,20200101,18:40,LIMA,PE-1N,174,DESPISTE,0,5


In [None]:
# 1.1) Make sure the casualty counts are numeric (unparseable values -> NaN)
for col in ["fallecidos", "heridos"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# 1.2) Target: fatal = 1 when at least one death was recorded, else 0
# df.get("fallecidos", 0) would hand back a plain int when the column is
# missing and then fail on .fillna, so fall back to an all-zero Series instead.
if "fallecidos" in df.columns:
    fallecidos = df["fallecidos"]
else:
    fallecidos = pd.Series(0, index=df.index)
df["fatal"] = (fallecidos.fillna(0) > 0).astype(int)

# 1.3) FECHA -> year, month, day of week
if "fecha" in df.columns:
    # Work on the text form throughout. Trailing ".0" appears when the column
    # was read as float (e.g. because of missing values) and would otherwise
    # break the yyyymmdd parse.
    fecha_txt = (
        df["fecha"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    )
    # it usually comes as yyyymmdd
    fecha_parsed = pd.to_datetime(fecha_txt, errors="coerce", format="%Y%m%d")
    # Where that failed, try a free-form parse. This must also run on the text:
    # pd.to_datetime on raw integers interprets them as nanoseconds since the
    # epoch, which would silently fill the gaps with 1970 dates.
    fecha_parsed = fecha_parsed.fillna(pd.to_datetime(fecha_txt, errors="coerce"))
else:
    # All-NaT Series so the .dt accessors below still work and yield NaN.
    fecha_parsed = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

df["anio"] = fecha_parsed.dt.year
df["mes"] = fecha_parsed.dt.month
df["dia_semana"] = fecha_parsed.dt.dayofweek  # Monday=0 ... Sunday=6

# 1.4) HORA -> hour of the day
if "hora" in df.columns:
    # it usually comes as HH:MM; anything else becomes NaN
    h = pd.to_datetime(df["hora"].astype(str), errors="coerce", format="%H:%M")
    df["hora_num"] = h.dt.hour
else:
    df["hora_num"] = np.nan

# 1.5) KILOMETRO -> extract a number
def extraer_km(x):
    """Extract a kilometre marker as a float from a free-form value.

    Handled shapes (after replacing "," with "."):
      * "A+B"  -> A + B/1000, keeping only digits and dots on each side
                  (e.g. "174+500" -> 174.5);
      * anything else -> the first run of digits/dots found
                  (e.g. "KM 12" -> 12.0).

    Parameters
    ----------
    x : Any
        Raw cell value; it is converted with str() unless it is missing.

    Returns
    -------
    float
        The parsed kilometre, or NaN when the value is missing or no number
        can be parsed from it.
    """
    if pd.isna(x):
        return np.nan
    x = str(x).replace(",", ".")
    if "+" in x:
        a, b = x.split("+", 1)
        try:
            km = float("".join(ch for ch in a if ch.isdigit() or ch == "."))
            metres = float("".join(ch for ch in b if ch.isdigit() or ch == "."))
            return km + metres / 1000.0
        except ValueError:
            # One side held no usable number; fall through to the generic scan.
            pass
    # Blank out everything that is not a digit or dot, then take the first token.
    nums = "".join(ch if ch.isdigit() or ch == "." else " " for ch in x).split()
    if nums:
        try:
            return float(nums[0])
        except ValueError:
            # e.g. "1.2.3" or a lone "."
            return np.nan
    return np.nan

# Numeric kilometre column (all-NaN when the source column is absent).
if "kilometro" in df.columns:
    df["kilometro_num"] = df["kilometro"].apply(extraer_km)
else:
    df["kilometro_num"] = np.nan

# 1.6) Select the features that are actually available
num_cols_posibles = ["hora_num", "mes", "anio", "dia_semana", "kilometro_num", "heridos"]
cat_cols_posibles = ["departamento", "codigo_via", "modalidad"]

num_cols = [name for name in num_cols_posibles if name in df.columns]
cat_cols = [name for name in cat_cols_posibles if name in df.columns]
feature_cols = num_cols + cat_cols

# Keep only rows where the target and every selected feature are present.
df_model = df.dropna(subset=["fatal", *feature_cols])

X = df_model[feature_cols].copy()
y = df_model["fatal"].astype(int)
X.shape, y.mean()  # size and share of fatal accidents


((8012, 9), np.float64(0.11694957563654518))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# 2.1) Preprocessing
# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, with categories unseen at fit time ignored rather than raising.
# Any column not listed is dropped.
preprocess = ColumnTransformer(
    [
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

# 2.2) Train/Test: 80/20 split, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

len(X_train), len(X_test), y_train.mean(), y_test.mean()


(6409, 1603, np.float64(0.11702293649555313), np.float64(0.11665626949469744))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline pipeline with a Random Forest (its hyperparameters get tuned by the GA)
base_clf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",  # in case the classes are imbalanced
    n_jobs=-1,
    random_state=42,
)

pipe = Pipeline([("prep", preprocess), ("clf", base_clf)])
pipe.fit(X_train, y_train)

# Hard predictions plus the second probability column, used for the AUC.
pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, pred, digits=4))
print("AUC:", roc_auc_score(y_test, proba))


              precision    recall  f1-score   support

           0     0.8969    0.9894    0.9409      1416
           1     0.6341    0.1390    0.2281       187

    accuracy                         0.8902      1603
   macro avg     0.7655    0.5642    0.5845      1603
weighted avg     0.8663    0.8902    0.8577      1603

AUC: 0.6795560288830478


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 4.1) Search space and helpers
rng = np.random.default_rng(42)

# Categorical hyperparameters are encoded as an index into these lists.
max_features_choices = ["sqrt", "log2", 0.5, 1.0]
bootstrap_choices = [True, False]

# (low, high) integer bounds for every gene; both ends are inclusive.
bounds = dict(
    n_estimators=(50, 400),
    max_depth=(3, 30),
    min_samples_split=(2, 20),
    min_samples_leaf=(1, 20),
    max_features_idx=(0, len(max_features_choices) - 1),  # categorical via index
    bootstrap_idx=(0, len(bootstrap_choices) - 1),        # categorical via index
)

# Gene names in a fixed order (dict insertion order).
genes = [*bounds]

def random_individual():
    """Draw one random individual from the search space.

    Returns a dict with one integer per gene, sampled uniformly between that
    gene's bounds (both inclusive). For the categorical genes the integer is
    an index into the corresponding choices list.
    """
    # rng.integers excludes the upper end, hence the +1.
    return {g: rng.integers(bounds[g][0], bounds[g][1] + 1) for g in genes}

def decode_params(ind):
    """Translate an individual's genes into RandomForestClassifier keyword arguments.

    Integer genes are cast to plain ints, the two index genes are looked up in
    their choices lists, and the fixed settings (balanced class weights,
    seed 42, all cores) are appended.
    """
    int_genes = ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")
    params = {name: int(ind[name]) for name in int_genes}
    params["max_features"] = max_features_choices[int(ind["max_features_idx"])]
    params["bootstrap"] = bootstrap_choices[int(ind["bootstrap_idx"])]
    params.update(class_weight="balanced", random_state=42, n_jobs=-1)
    return params

def fitness(ind):
    """Score an individual as the mean F1 over a 3-fold stratified CV on the training split.

    Builds a fresh preprocessing + Random Forest pipeline from the decoded
    genes and evaluates it on X_train / y_train with a fixed fold seed.
    """
    candidate = Pipeline(
        steps=[
            ("prep", preprocess),
            ("clf", RandomForestClassifier(**decode_params(ind))),
        ]
    )
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # uses F1 (class 1 = fatal)
    fold_f1 = cross_val_score(
        candidate, X_train, y_train, cv=folds, scoring="f1", n_jobs=-1
    )
    return float(np.mean(fold_f1))

def tournament_select(pop, k=3):
    """Tournament selection: sample k distinct individuals and return a copy of the fittest.

    Every member of pop must already carry a "fitness" entry. The returned
    dict is a shallow copy, so it still contains that "fitness" key.
    """
    contenders = rng.choice(len(pop), size=k, replace=False)
    winner = max(contenders, key=lambda i: pop[i]["fitness"])
    return pop[winner].copy()

def crossover(p1, p2, pc=0.9):
    """Uniform crossover between two parents.

    With probability pc, each gene is independently swapped between the two
    children with probability 0.5; otherwise the children are plain copies of
    the parents. The parents themselves are never modified.
    """
    c1, c2 = p1.copy(), p2.copy()
    if rng.random() >= pc:
        return c1, c2
    for g in genes:
        # uniform: per gene, decide whether the children exchange values
        if rng.random() < 0.5:
            c1[g] = p2[g]
            c2[g] = p1[g]
    return c1, c2

def mutate(ind, pm=0.2):
    """Mutate an individual in place and return it.

    Each gene mutates independently with probability pm:
      * categorical genes (name contains "idx") jump to a different index
        chosen uniformly among the other options; a gene with a single option
        is left unchanged instead of raising;
      * numeric genes receive an integer perturbation of at most 10% of the
        gene's range (at least 1) in either direction, clipped to the bounds.
        The perturbation can be zero, leaving the value unchanged.
    """
    for g in genes:
        if rng.random() >= pm:
            continue
        lo, hi = bounds[g]
        if "idx" in g:  # categorical -> jump to another index
            others = [i for i in range(lo, hi + 1) if i != int(ind[g])]
            if others:  # nothing to jump to when the gene has one option
                ind[g] = int(rng.choice(others))
        else:
            # numeric integers: small perturbation
            step = max(1, int((hi - lo) * 0.1))
            val = int(ind[g]) + int(rng.integers(-step, step + 1))
            ind[g] = int(np.clip(val, lo, hi))
    return ind

def evaluate(pop):
    """Compute and store "fitness" for every individual that does not have one yet.

    Individuals are updated in place; those already scored are skipped.
    """
    unscored = (ind for ind in pop if "fitness" not in ind)
    for ind in unscored:
        ind["fitness"] = fitness(ind)

# 4.2) GA loop
POP = 24    # population size
GENS = 15   # number of generations to breed
ELIT = 2    # top individuals carried over unchanged each generation
pc = 0.9    # crossover probability
pm = 0.2    # per-gene mutation probability

# init
population = [random_individual() for _ in range(POP)]
evaluate(population)

history = []
for gen in range(GENS):
    # rank the current population and log its best individual
    population.sort(key=lambda d: d["fitness"], reverse=True)
    best = population[0]
    history.append((gen, best["fitness"], decode_params(best)))
    print(f"Gen {gen:02d} | best F1 = {best['fitness']:.4f} | {decode_params(best)}")

    # reproduction
    new_pop = population[:ELIT]  # keep the elite (already scored)
    while len(new_pop) < POP:
        p1 = tournament_select(population, k=3)
        p2 = tournament_select(population, k=3)
        c1, c2 = crossover(p1, p2, pc=pc)
        c1 = mutate(c1, pm=pm)
        c2 = mutate(c2, pm=pm)
        for c in (c1, c2):
            if len(new_pop) < POP:
                # Children inherit a parent's "fitness" through the copies;
                # drop it so evaluate() scores them on their own genes.
                c.pop("fitness", None)
                new_pop.append(c)
    population = new_pop
    evaluate(population)

# Best individual of the final population
population.sort(key=lambda d: d["fitness"], reverse=True)
best_ind = population[0]
best_params = decode_params(best_ind)
best_f1_cv = best_ind["fitness"]

# The loop logs each generation before breeding, so the last population it
# produced would otherwise never appear in the history or the printed log.
history.append((GENS, best_f1_cv, decode_params(best_ind)))
print(f"Gen {GENS:02d} | best F1 = {best_f1_cv:.4f} | {best_params}")

best_params, best_f1_cv


Gen 00 | best F1 = 0.3312 | {'n_estimators': 208, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'max_features': 1.0, 'bootstrap': True, 'class_weight': 'balanced', 'random_state': 42, 'n_jobs': -1}
Gen 01 | best F1 = 0.3312 | {'n_estimators': 208, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'max_features': 1.0, 'bootstrap': True, 'class_weight': 'balanced', 'random_state': 42, 'n_jobs': -1}
Gen 02 | best F1 = 0.3353 | {'n_estimators': 62, 'max_depth': 7, 'min_samples_split': 12, 'min_samples_leaf': 12, 'max_features': 0.5, 'bootstrap': True, 'class_weight': 'balanced', 'random_state': 42, 'n_jobs': -1}
Gen 03 | best F1 = 0.3353 | {'n_estimators': 62, 'max_depth': 7, 'min_samples_split': 12, 'min_samples_leaf': 12, 'max_features': 0.5, 'bootstrap': True, 'class_weight': 'balanced', 'random_state': 42, 'n_jobs': -1}
Gen 04 | best F1 = 0.3356 | {'n_estimators': 206, 'max_depth': 8, 'min_samples_split': 15, 'min_samples_leaf': 12, 'max_features': 1.0, 

({'n_estimators': 61,
  'max_depth': 8,
  'min_samples_split': 3,
  'min_samples_leaf': 12,
  'max_features': 1.0,
  'bootstrap': True,
  'class_weight': 'balanced',
  'random_state': 42,
  'n_jobs': -1},
 0.34084329090923465)

In [None]:
# Refit the best configuration found by the GA on the whole training split
# and report its metrics on the held-out test split.
best_rf = RandomForestClassifier(**best_params)
best_pipe = Pipeline([("prep", preprocess), ("clf", best_rf)])
best_pipe.fit(X_train, y_train)

y_pred = best_pipe.predict(X_test)
y_proba = best_pipe.predict_proba(X_test)[:, 1]

print("=== RESULTADOS EN TEST ===")
print(classification_report(y_test, y_pred, digits=4))
print("AUC:", roc_auc_score(y_test, y_proba))


=== RESULTADOS EN TEST ===
              precision    recall  f1-score   support

           0     0.9197    0.8333    0.8744      1416
           1     0.2625    0.4492    0.3314       187

    accuracy                         0.7885      1603
   macro avg     0.5911    0.6413    0.6029      1603
weighted avg     0.8431    0.7885    0.8110      1603

AUC: 0.7039412066829813
