In [5]:
import sklearn as skl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
#import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import classification_report
%matplotlib inline

traffic_train = pd.read_csv("../../datasets/training_data.csv", keep_default_na=False, encoding='latin1')
traffic_test = pd.read_csv("../../datasets/test_data.csv", keep_default_na=False, encoding='latin1')

**Data Preparation**

Remoção de colunas não necessárias:
- city_name e AVERAGE_PRECIPITATION porque é só um valor.
- AVERAGE_CLOUDINESS por causa dos missing values.
- AVERAGE_RAIN por causa dos missing values.
- AVERAGE_HUMIDITY tem grande correlação.

In [6]:
# Drop unnecessary columns 
for df in [traffic_train, traffic_test]:
    df.drop(columns=['city_name'], inplace=True)
    df.drop(columns=['AVERAGE_PRECIPITATION'], inplace=True)
    df.drop(columns=['AVERAGE_RAIN'], inplace=True)
    df.drop(columns=['AVERAGE_CLOUDINESS'], inplace=True)

Extração de campos do atributo da data.

In [7]:
# Date Treatment
for df in [traffic_train, traffic_test]:
    df['record_date'] = pd.to_datetime(df['record_date'])
    df['hour'] = df['record_date'].dt.hour
    df['day_of_week'] = df['record_date'].dt.dayofweek
    df['month'] = df['record_date'].dt.month
    df['year'] = df['record_date'].dt.year


**Feature Engineering:**

Criação de features para acrescentar valores e atributos ao dataset:
- is_weekend por causa do fim de semana.
- is_friday por causa de ser sexta-feira.
- is_holiday por causa de ser feriado.
- is_rush_hour por causa das horas de ponta.
- season por causa da estação do ano.

In [9]:
# Feature Engineering
pt_holidays = holidays.Portugal()

for df in [traffic_train, traffic_test]:
    df['is_weekend'] = df['record_date'].dt.weekday.isin([5, 6]).astype(int)
    df['is_friday'] = (df['record_date'].dt.weekday == 4).astype(int)
    df['is_holiday'] = df['record_date'].dt.date.map(lambda d: d in pt_holidays).astype(int)
    df['is_rush_hour'] = ((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 17) & (df['hour'] <= 19)).astype(int)
    df['season'] = pd.cut(df['month'], bins=[0, 3, 6, 9, 12], labels=['Winter', 'Spring', 'Summer', 'Fall'])
    df.drop(columns=['record_date'], inplace=True)

**Categorical Encoding**

Fazer enconding de todas as features categóricas para poder trabalhar com os modelos.

In [10]:
# Categorical Encoding

print(traffic_train['AVERAGE_SPEED_DIFF'].unique())
speed_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}
traffic_train['AVERAGE_SPEED_DIFF'] = traffic_train['AVERAGE_SPEED_DIFF'].map(speed_map).astype(int)

luminosity_map = {'DARK': 0, 'LOW_LIGHT': 1, 'LIGHT': 2}
traffic_train['LUMINOSITY'] = traffic_train['LUMINOSITY'].replace(luminosity_map).astype(int)
traffic_test['LUMINOSITY'] = traffic_test['LUMINOSITY'].replace(luminosity_map).astype(int)

rush_hour_map = {'True': 1, 'False': 0}
traffic_train['is_rush_hour'] = traffic_train['is_rush_hour'].replace(rush_hour_map).astype(int)
traffic_test['is_rush_hour'] = traffic_test['is_rush_hour'].replace(rush_hour_map).astype(int)

season_map = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)

['Medium' 'High' 'None' 'Low' 'Very_High']


  traffic_train['LUMINOSITY'] = traffic_train['LUMINOSITY'].replace(luminosity_map).astype(int)
  traffic_test['LUMINOSITY'] = traffic_test['LUMINOSITY'].replace(luminosity_map).astype(int)
  traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
  traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
  traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)
  traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)


**Tratamento de Outliers**
 -> Aqui faz-se uma substituição dos outliers pelos valores mais próximos permitidos em termos de percentis divisão.

**A fazer (testar combinações)**
- Gráficos de caixas de bigodes no dataPreparation para ver os outliers.
- Substituir outliers pela média ou assim e normalizar.
- Fazer só com outliers sem winsorize.
- Quantis puros.

In [11]:
# Outlier Treatment
from scipy.stats.mstats import winsorize

for df in [traffic_train, traffic_test]:
    df['AVERAGE_FREE_FLOW_TIME'] = winsorize(df['AVERAGE_FREE_FLOW_TIME'], limits=[0.01, 0.01])
    df['AVERAGE_FREE_FLOW_SPEED'] = winsorize(df['AVERAGE_FREE_FLOW_SPEED'], limits=[0.05, 0.01])
    df['AVERAGE_TEMPERATURE'] = winsorize(df['AVERAGE_TEMPERATURE'], limits=[0.01, 0.02])
    df['AVERAGE_ATMOSP_PRESSURE'] = winsorize(df['AVERAGE_ATMOSP_PRESSURE'], limits=[0.05, 0.015])
    df['AVERAGE_WIND_SPEED'] = winsorize(df['AVERAGE_WIND_SPEED'], limits=[0.01, 0.03])

In [12]:
def is_outlier(s): 
    lower_limit = s.mean() - s.std()
    upper_limit = s.mean() + s.std()
    return ~s.between(lower_limit, upper_limit)

outliers_train = traffic_train['AVERAGE_TIME_DIFF'].transform(is_outlier)

In [13]:
mask = ~traffic_train['AVERAGE_TIME_DIFF'].transform(is_outlier)

train_com_outliers = traffic_train[mask]
train_clean = traffic_train[~outliers_train]

**Modeling**

Modelação com ambos os datasets com tratamento de outliers.

In [14]:
# Modeling
X = train_clean.drop(['AVERAGE_SPEED_DIFF'], axis=1)
y = train_clean['AVERAGE_SPEED_DIFF'].to_frame()

X_c_outliers = train_com_outliers.drop(['AVERAGE_SPEED_DIFF'], axis=1)
y_c_outliers = train_com_outliers['AVERAGE_SPEED_DIFF'].to_frame()


In [15]:
X_treino_final = X.to_numpy()
y_treino_final = y.to_numpy()

X_c_outliers_final = X_c_outliers.to_numpy()
y_c_outliers_final = y_c_outliers.to_numpy()

teste_final_clean = traffic_test.copy()

## Logistic Regression

In [17]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score
lr_model = LogisticRegression(random_state=2025, max_iter=1000, multi_class='auto')

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=2025)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    cv=inner_cv,
    scoring='accuracy',
    n_jobs=-1,
    refit=True
)
grid_search.fit(X_c_outliers_final,np.ravel(y_c_outliers_final))

print(f"Best params: {grid_search.best_params_}")
print(f"Best cross-validated accuracy: {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_
predictions = best_model.predict(teste_final_clean)
scores = cross_val_score(grid_search, X_c_outliers_final, y_c_outliers_final, cv=outer_cv, scoring='accuracy', n_jobs=-1)

print("Accuracy: %.2f%% (+/- %.2f%%)" % (scores.mean() * 100, scores.std() * 100))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

Best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validated accuracy: 0.7712


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Accuracy: 77.27% (+/- 1.31%)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 