In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
import kagglehub

# Pega a última versão do dataset
path = kagglehub.dataset_download("deepcontractor/australian-fatal-car-accident-data-19892021")

print("Path to dataset files:", path)

df = pd.read_csv(path + "/Crash_Data.csv", low_memory=False) # -> Lê o arquivo CSV

# Remover colunas irrelevantes
df.drop(columns=[
    'Crash ID', 'Month', 'Year', 'Dayweek', 'Time', 'Age',
    'SA4 Name 2016', 'National LGA Name 2017', 'National Road Type'
], inplace=True)

# Definir variáveis dependentes e independentes
X = df.drop(columns=['Crash Type'])
y = df['Crash Type']

# Identificar colunas categóricas e numéricas
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Dividir o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\vinicius.sousa\.cache\kagglehub\datasets\deepcontractor\australian-fatal-car-accident-data-19892021\versions\2


In [3]:
# Pipeline de pré-processamento para colunas categóricas
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Pipeline de pré-processamento para colunas numéricas
numerical_transformer = StandardScaler()

# Combinar os pré-processadores usando ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [4]:
# Dicionário de modelos e seus respectivos grids de hiperparâmetros
models = {
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'model__C': [0.01, 0.1, 1, 10],
            'model__penalty': ['l2'],
            'model__solver': ['lbfgs']
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf'],
            'model__gamma': ['scale', 'auto']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
    }
}


In [5]:
for model_name, mp in models.items():
    print(f"\n🔍 Treinando e otimizando: {model_name}")
    
    # Criar o pipeline completo
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', mp['model'])
    ])
    
    # Configurar o GridSearchCV
    grid_search = GridSearchCV(pipeline, mp['params'], cv=5, n_jobs=-1, verbose=1)
    
    # Treinar o modelo
    grid_search.fit(X_train, y_train)
    
    # Exibir os melhores hiperparâmetros
    print(f"Melhores hiperparâmetros para {model_name}: {grid_search.best_params_}")
    
    # Avaliar o modelo no conjunto de teste
    y_pred = grid_search.predict(X_test)
    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, y_pred))
    print("Relatório de Classificação:")
    print(classification_report(y_test, y_pred))



🔍 Treinando e otimizando: Logistic Regression
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores hiperparâmetros para Logistic Regression: {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
Matriz de Confusão:
[[2951 1777]
 [1243 4598]]
Relatório de Classificação:
              precision    recall  f1-score   support

    Multiple       0.70      0.62      0.66      4728
      Single       0.72      0.79      0.75      5841

    accuracy                           0.71     10569
   macro avg       0.71      0.71      0.71     10569
weighted avg       0.71      0.71      0.71     10569


🔍 Treinando e otimizando: SVM
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Melhores hiperparâmetros para SVM: {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
Matriz de Confusão:
[[3066 1662]
 [1309 4532]]
Relatório de Classificação:
              precision    recall  f1-score   support

    Multiple       0.70      0.65      0.67      