In [1]:
## Load & imports

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

traffic_train = pd.read_csv("../../datasets/training_data.csv", keep_default_na=False, encoding="latin1")
traffic_test  = pd.read_csv("../../datasets/test_data.csv", keep_default_na=False, encoding="latin1")

In [2]:
traffic_train.head()

Unnamed: 0,city_name,record_date,AVERAGE_SPEED_DIFF,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,AVERAGE_PRECIPITATION,AVERAGE_RAIN
0,Porto,2019-08-29 07:00:00,Medium,41.5,11.5,71.4,LIGHT,15.0,1019.0,100.0,3.0,,0.0,
1,Porto,2018-08-10 14:00:00,High,41.7,48.3,87.4,LIGHT,21.0,1021.0,53.0,5.0,céu claro,0.0,
2,Porto,2019-09-01 16:00:00,High,38.6,38.4,85.2,LIGHT,26.0,1014.0,61.0,4.0,,0.0,
3,Porto,2019-02-26 11:00:00,High,37.4,61.0,94.1,LIGHT,18.0,1025.0,48.0,4.0,céu claro,0.0,
4,Porto,2019-06-06 12:00:00,Medium,41.6,50.4,77.0,LIGHT,15.0,1008.0,82.0,10.0,,0.0,


In [3]:
## Section for Data Treatment

traffic_train = traffic_train[traffic_train["AVERAGE_SPEED_DIFF"] != "None"]

for df in [traffic_train, traffic_test]:
    df["record_date"] = pd.to_datetime(df["record_date"])
    df["hour"] = df["record_date"].dt.hour
    df["day"] = df["record_date"].dt.day
    df["month"] = df["record_date"].dt.month
    df["weekday"] = df["record_date"].dt.weekday

y = traffic_train["AVERAGE_SPEED_DIFF"]

drop_cols = ["AVERAGE_SPEED_DIFF", "record_date"]
X = traffic_train.drop(columns=drop_cols)
X_test = traffic_test.drop(columns=["record_date"])

num_cols = [
    "AVERAGE_FREE_FLOW_SPEED",
    "AVERAGE_TIME_DIFF",
    "AVERAGE_FREE_FLOW_TIME",
    "AVERAGE_TEMPERATURE",
    "AVERAGE_ATMOSP_PRESSURE",
    "AVERAGE_HUMIDITY",
    "AVERAGE_WIND_SPEED",
    "AVERAGE_PRECIPITATION"
]

cat_cols = [
    "city_name",
    "LUMINOSITY",
    "AVERAGE_CLOUDINESS",
    "AVERAGE_RAIN"
]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("svc", SVC(kernel="rbf", C=3, gamma="scale"))
])

In [4]:
traffic_train.head()

Unnamed: 0,city_name,record_date,AVERAGE_SPEED_DIFF,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,AVERAGE_PRECIPITATION,AVERAGE_RAIN,hour,day,month,weekday
0,Porto,2019-08-29 07:00:00,Medium,41.5,11.5,71.4,LIGHT,15.0,1019.0,100.0,3.0,,0.0,,7,29,8,3
1,Porto,2018-08-10 14:00:00,High,41.7,48.3,87.4,LIGHT,21.0,1021.0,53.0,5.0,céu claro,0.0,,14,10,8,4
2,Porto,2019-09-01 16:00:00,High,38.6,38.4,85.2,LIGHT,26.0,1014.0,61.0,4.0,,0.0,,16,1,9,6
3,Porto,2019-02-26 11:00:00,High,37.4,61.0,94.1,LIGHT,18.0,1025.0,48.0,4.0,céu claro,0.0,,11,26,2,1
4,Porto,2019-06-06 12:00:00,Medium,41.6,50.4,77.0,LIGHT,15.0,1008.0,82.0,10.0,,0.0,,12,6,6,3


In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf.fit(X_train, y_train)

print("Shapes:", X_train.shape, X_val.shape, y_train.shape, y_val.shape)

Shapes: (3689, 16) (923, 16) (3689,) (923,)


In [6]:
test_pred = clf.predict(X_test)
traffic_test["AVERAGE_SPEED_DIFF"] = test_pred

## traffic_test[["AVERAGE_SPEED_DIFF"]].to_csv("../../submissions/submission_svm.csv", index=False)

In [7]:
## svm = pd.read_csv("../../submissions/submission_svm.csv", keep_default_na=False, encoding="latin1")

## svm.head()

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "svc__C": [0.1, 1, 3, 10],
    "svc__gamma": ["scale", 0.01, 0.1, 1],
    "svc__kernel": ["rbf"] 
}

grid_search = GridSearchCV(
    clf, 
    param_grid, 
    cv=3,             
    scoring="accuracy", 
    n_jobs=-1,         
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Melhores parâmetros:", grid_search.best_params_)

best_clf = grid_search.best_estimator_

y_val_pred = best_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print("Accuracy SVM:", accuracy)

print(classification_report(y_val, y_val_pred))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Melhores parâmetros: {'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Accuracy SVM: 0.7887323943661971
              precision    recall  f1-score   support

        High       0.70      0.69      0.69       192
         Low       0.84      0.91      0.87       304
      Medium       0.76      0.74      0.75       321
   Very_High       0.86      0.77      0.82       106

    accuracy                           0.79       923
   macro avg       0.79      0.78      0.78       923
weighted avg       0.79      0.79      0.79       923

