In [1]:
## Load & imports

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from utils.submission_utils import *
from sklearn.model_selection import train_test_split

# Both CSV files are parsed with the same options: pandas' default NaN
# markers are disabled (keep_default_na=False) and the text is decoded as latin1.
_CSV_READ_OPTIONS = {"keep_default_na": False, "encoding": "latin1"}

traffic_train = pd.read_csv("../../datasets/training_data.csv", **_CSV_READ_OPTIONS)
traffic_test = pd.read_csv("../../datasets/test_data.csv", **_CSV_READ_OPTIONS)

[utils] OUTPUT_DIR=../../submissions


In [2]:
# Preview the first five rows of the raw training frame.
traffic_train.head(n=5)

Unnamed: 0,city_name,record_date,AVERAGE_SPEED_DIFF,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,AVERAGE_PRECIPITATION,AVERAGE_RAIN
0,Porto,2019-08-29 07:00:00,Medium,41.5,11.5,71.4,LIGHT,15.0,1019.0,100.0,3.0,,0.0,
1,Porto,2018-08-10 14:00:00,High,41.7,48.3,87.4,LIGHT,21.0,1021.0,53.0,5.0,céu claro,0.0,
2,Porto,2019-09-01 16:00:00,High,38.6,38.4,85.2,LIGHT,26.0,1014.0,61.0,4.0,,0.0,
3,Porto,2019-02-26 11:00:00,High,37.4,61.0,94.1,LIGHT,18.0,1025.0,48.0,4.0,céu claro,0.0,
4,Porto,2019-06-06 12:00:00,Medium,41.6,50.4,77.0,LIGHT,15.0,1008.0,82.0,10.0,,0.0,


In [4]:
## Section for Data Treatment - VERSÃO RÁPIDA E EFICIENTE

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# ===================================================================
# 1. ESSENTIAL FEATURE ENGINEERING (FAST)
# ===================================================================

print("Processando dados...")

# Ordinal encoding of the luminosity categories (darkest -> brightest).
luminosity_map = {'DARK': 0, 'LOW_LIGHT': 1, 'LIGHT': 2}

# The same transformations are applied to the training and the test frame,
# so every derived column is created in a single pass over both.
for df in [traffic_train, traffic_test]:
    # Basic temporal features derived from the record timestamp.
    df["record_date"] = pd.to_datetime(df["record_date"])
    df["hour"] = df["record_date"].dt.hour
    df["weekday"] = df["record_date"].dt.weekday
    # weekday is 0-6 starting on Monday, so 5 and 6 are Saturday/Sunday.
    df["is_weekend"] = (df["weekday"] >= 5).astype(int)
    # Rush hours are 07-09h and 17-19h, both ends inclusive. `between` is
    # vectorised, unlike a row-wise `apply` with a Python lambda.
    df["is_rush_hour"] = (
        df["hour"].between(7, 9) | df["hour"].between(17, 19)
    ).astype(int)

    # Luminosity to numeric. `replace` leaves already-encoded integers
    # untouched, so re-running the cell is harmless.
    df["LUMINOSITY"] = df["LUMINOSITY"].replace(luminosity_map).astype(int)

    # 2. INTERACTION FEATURES (only the two most relevant ones)
    # The +1 in the denominator avoids a division by zero.
    df['congestion_ratio'] = df['AVERAGE_TIME_DIFF'] / (df['AVERAGE_FREE_FLOW_TIME'] + 1)
    df['speed_time_product'] = df['AVERAGE_FREE_FLOW_SPEED'] * df['AVERAGE_TIME_DIFF']

# ===================================================================
# 3. DATA PREPARATION
# ===================================================================

# Target column of the training frame.
y = traffic_train["AVERAGE_SPEED_DIFF"]

# Columns to keep (simplified version)
features_to_keep = [
    'AVERAGE_FREE_FLOW_SPEED',
    'AVERAGE_TIME_DIFF',
    'AVERAGE_FREE_FLOW_TIME',
    'AVERAGE_TEMPERATURE',
    'AVERAGE_ATMOSP_PRESSURE',
    'AVERAGE_HUMIDITY',
    'AVERAGE_WIND_SPEED',
    'LUMINOSITY',
    'hour',
    'is_weekend',
    'is_rush_hour',
    'congestion_ratio',
    'speed_time_product'
]

# Keep only the features that actually exist in the training frame.
features_to_keep = [col for col in features_to_keep if col in traffic_train.columns]

# The test matrix must expose exactly the same columns, in the same order,
# as the training matrix. Fail fast here instead of after the (slow)
# hyper-parameter search, when predict() would reject the mismatched input.
missing_in_test = [col for col in features_to_keep if col not in traffic_test.columns]
if missing_in_test:
    raise KeyError(f"Test data is missing feature columns: {missing_in_test}")

X = traffic_train[features_to_keep].copy()
X_test = traffic_test[features_to_keep].copy()

# ===================================================================
# 4. PIPELINE TUNED FOR SPEED
# ===================================================================

print("Criando pipeline...")

# Two named steps: standardise the features (SVMs are sensitive to feature
# scale), then fit the classifier. The step names are what the
# "svc__<param>" keys of the hyper-parameter search refer to.
_pipeline_steps = [
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='poly', random_state=42)),
]
pipeline = Pipeline(steps=_pipeline_steps)

# ===================================================================
# 5. RANDOMIZED SEARCH (MUCH FASTER THAN A GRID SEARCH)
# ===================================================================

# Candidate values per SVC hyper-parameter; the "svc__" prefix routes each
# key to the 'svc' step of the pipeline.
param_dist = dict(
    svc__C=[0.5, 1, 5, 10, 15, 20],          # centred on the previous best value (10)
    svc__kernel=['poly', 'rbf'],
    svc__degree=[2, 3],                      # only used by the poly kernel
    svc__gamma=['scale', 'auto', 0.1, 0.01],
    svc__coef0=[0.5, 1.0, 1.5],              # only used by the poly kernel
    svc__class_weight=[None, 'balanced'],
)

print("Executando Randomized Search (mais rápido)...")

# Sample 20 parameter combinations and score each with 3-fold
# cross-validation, using every available core.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Run the search on the full training matrix.
random_search.fit(X, y)

print(f"\n✅ Busca concluída!")
print(f"Melhor score: {random_search.best_score_:.4f}")
print(f"Melhores parâmetros: {random_search.best_params_}")

# ===================================================================
# 6. FINAL MODEL WITH THE BEST PARAMETERS
# ===================================================================

print("\nTreinando modelo final com melhores parâmetros...")

# Best hyper-parameters found by the search (kept for later inspection).
best_params = random_search.best_params_

# With the default refit=True, RandomizedSearchCV has already cloned the
# pipeline, applied the best parameters and refit it on the whole of (X, y).
# Reusing that estimator avoids a second, identical training run and cannot
# drift from the searched configuration the way hand-copied defaults could.
final_model = random_search.best_estimator_

# ===================================================================
# 7. PREDICTION AND SUBMISSION
# ===================================================================

print("Fazendo predições para submission...")

# Predictions for the test set.
y_pred_test = final_model.predict(X_test)

# Accuracy on the training data itself (for reference only).
y_train_pred = final_model.predict(X)
train_accuracy = accuracy_score(y_true=y, y_pred=y_train_pred)
print(f"Acurácia no treino: {train_accuracy:.4f}")

# Assemble the submission table: row ids are the test index shifted by one,
# paired with the predicted class.
_submission_columns = {
    'RowId': traffic_test.index + 1,
    'Speed_Diff': y_pred_test,
}
submission = pd.DataFrame(_submission_columns)

# Write the table to disk without the DataFrame index.
submission_path = '../../submissions/submission_svm_fast.csv'
submission.to_csv(submission_path, index=False)
print(f"✅ Submissão criada: {submission_path}")

Processando dados...
Criando pipeline...
Executando Randomized Search (mais rápido)...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

✅ Busca concluída!
Melhor score: 0.7905
Melhores parâmetros: {'svc__kernel': 'poly', 'svc__gamma': 0.01, 'svc__degree': 2, 'svc__coef0': 1.5, 'svc__class_weight': 'balanced', 'svc__C': 15}

Treinando modelo final com melhores parâmetros...
Fazendo predições para submission...
Acurácia no treino: 0.8052
✅ Submissão criada: ../../submissions/submission_svm_fast.csv


In [9]:
# Hold out 20% of the training rows for validation; the fixed seed keeps
# the split reproducible.
_holdout_parts = train_test_split(X, y, test_size=0.2, random_state=23)
X_train, X_val, y_train, y_val = _holdout_parts

print("Shapes:", *(part.shape for part in _holdout_parts))

Shapes: (5449, 12) (1363, 12) (5449,) (1363,)


In [10]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids, one per kernel, so that kernel-specific parameters
# (gamma values for rbf; degree / coef0 for poly) are only combined with
# the kernel they belong to.
param_grid = [
    {
        "svc__kernel": ["rbf"],
        "svc__C": [0.1, 1, 10, 20, 50, 100],
        "svc__gamma": ["scale", "auto", 0.01, 0.1, 0.2, 0.5, 1],
        "svc__class_weight": [None, "balanced"]
    },
    {
        "svc__kernel": ["poly"],
        "svc__degree": [2, 3],
        "svc__C": [0.1, 1, 10],
        "svc__coef0": [0, 1],
        "svc__class_weight": [None, "balanced"]
    }
]

# Base estimator for the search. It is defined here so the cell does not
# depend on a `clf` left over in the kernel from a cell that no longer
# exists. The step name 'svc' must match the "svc__" prefix of the grid keys.
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(random_state=42)),
])

# Exhaustive search over both sub-grids with 3-fold cross-validation.
grid_search = GridSearchCV(
    clf,
    param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Melhores parâmetros:", grid_search.best_params_)

# Best pipeline, refit by GridSearchCV on X_train / y_train only.
best_clf = grid_search.best_estimator_

# Evaluate on the held-out validation split.
y_val_pred = best_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print("Accuracy SVM:", accuracy)

print(classification_report(y_val, y_val_pred))

# Store the test predictions on the test frame and hand it to the project's
# submission helper; the recorded output shows it returns the written path.
traffic_test['Speed_Diff'] = best_clf.predict(X_test)

create_submission_file(traffic_test, prediction_col='Speed_Diff', filename='submission_svm_test1.csv')

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.1, svc__kernel=rbf; total time=   0.8s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.2, svc__kernel=rbf; total time=   0.8s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.2, svc__kernel=rbf; total time=   0.9s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.01, svc__kernel=rbf; total time=   0.9s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.5, svc__kernel=rbf; total time=   1.0s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=scale, svc__kernel=rbf; total time=   1.2s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=1, svc__kernel=rbf; total time=   1.2s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=auto, svc__kernel=rbf; total time=   1.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=scale, svc__kernel=rbf; total time=   1.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=auto

'../../submissions/submission_svm_test1.csv'