In [13]:
import pandas as pd
from utils.data_utils import load_datasets
from utils.submission_utils import *
import holidays
from sklearn.preprocessing import OneHotEncoder, LabelEncoder 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw training and test CSV files with identical reader options.
# keep_default_na=False turns off pandas' default NA-marker parsing, so text labels
# are read verbatim as strings instead of being converted to NaN.
csv_options = {'keep_default_na': False, 'encoding': 'latin1'}
traffic_train = pd.read_csv("../../datasets/training_data.csv", **csv_options)
traffic_test = pd.read_csv("../../datasets/test_data.csv", **csv_options)

**Data Preparation**

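Remoção de colunas não necessárias:
- city_name e AVERAGE_PRECIPITATION porque têm apenas um valor único (são constantes).
- AVERAGE_CLOUDINESS por causa dos missing values.
- AVERAGE_RAIN por causa dos missing values.
- AVERAGE_HUMIDITY tem grande correlação com outras variáveis; no entanto, não é removida no código abaixo e continua a ser usada como feature.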

In [14]:
# Drop unnecessary columns
# All four columns are removed in place from both the training and the test frame.
columns_to_drop = ['city_name', 'AVERAGE_PRECIPITATION', 'AVERAGE_RAIN', 'AVERAGE_CLOUDINESS']
for df in (traffic_train, traffic_test):
    df.drop(columns=columns_to_drop, inplace=True)

Extração de campos do atributo da data.

In [15]:
# Date Treatment
# Parse record_date once per frame, store the parsed column back, and derive the
# calendar parts (hour, weekday, month, year) from that single parsed series.
for df in (traffic_train, traffic_test):
    timestamps = pd.to_datetime(df['record_date'])
    df['record_date'] = timestamps
    calendar = timestamps.dt
    df['hour'] = calendar.hour
    df['day_of_week'] = calendar.dayofweek
    df['month'] = calendar.month
    df['year'] = calendar.year


**Feature Engineering:**

Criação de features para acrescentar valores e atributos ao dataset:
- is_weekend por causa do fim de semana.
- is_friday por causa de ser sexta-feira.
- is_holiday por causa de ser feriado.
- is_rush_hour por causa das horas de ponta.
- season por causa da estação do ano.

In [16]:
# Feature Engineering
# Adds calendar-based indicator columns to both frames and then drops record_date,
# which is no longer needed once the features below have been derived from it.
pt_holidays = holidays.Portugal()

for df in [traffic_train, traffic_test]:
    # pandas numbers weekdays from Monday=0, so 5/6 are Saturday/Sunday and 4 is Friday.
    weekday = df['record_date'].dt.weekday
    df['is_weekend'] = weekday.isin([5, 6]).astype(int)
    df['is_friday'] = (weekday == 4).astype(int)
    df['is_holiday'] = df['record_date'].dt.date.map(lambda d: d in pt_holidays).astype(int)

    # Rush hour = 07-09h or 17-19h (both ends inclusive). The OR of both windows is
    # built first and cast as a whole; previously .astype(int) was attached only to
    # the evening window, so the cast never applied to the complete expression.
    morning_rush = df['hour'].between(7, 9)
    evening_rush = df['hour'].between(17, 19)
    df['is_rush_hour'] = (morning_rush | evening_rush).astype(int)

    # Right-inclusive bins: months 1-3 -> Winter, 4-6 -> Spring, 7-9 -> Summer, 10-12 -> Fall.
    df['season'] = pd.cut(df['month'], bins=[0, 3, 6, 9, 12], labels=['Winter', 'Spring', 'Summer', 'Fall'])
    df.drop(columns=['record_date'], inplace=True)

**Categorical Encoding**

Fazer encoding de todas as features categóricas para poder trabalhar com os modelos.

In [84]:
# NOTE(review): this cell applies the same AVERAGE_SPEED_DIFF and is_rush_hour encodings
# as the "# Categorical Encoding" cell that follows it; one of the two can likely be removed.
speed_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}
# Series.map sends every value that is not a key of speed_map to NaN, so mapping a
# column that is already integer-encoded would make astype(int) raise. Only encode
# while the column is still non-numeric, which makes this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(traffic_train['AVERAGE_SPEED_DIFF']):
    traffic_train['AVERAGE_SPEED_DIFF'] = traffic_train['AVERAGE_SPEED_DIFF'].map(speed_map).astype(int)

# replace() leaves values that are not keys untouched, so this step is already re-runnable.
rush_hour_map = {'True': 1, 'False': 0}
traffic_train['is_rush_hour'] = traffic_train['is_rush_hour'].replace(rush_hour_map).astype(int)
traffic_test['is_rush_hour'] = traffic_test['is_rush_hour'].replace(rush_hour_map).astype(int)

In [17]:
# Categorical Encoding
# Ordinal-encode the target (training set only) and the categorical features (both sets)
# so that every column handed to the model is an integer.

speed_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}
# The target may already be integer-encoded (the preceding cell applies the same mapping,
# and this cell may be re-run). Series.map would then turn every value into NaN and
# astype(int) would raise, so only encode while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(traffic_train['AVERAGE_SPEED_DIFF']):
    traffic_train['AVERAGE_SPEED_DIFF'] = traffic_train['AVERAGE_SPEED_DIFF'].map(speed_map).astype(int)

luminosity_map = {'DARK': 0, 'LOW_LIGHT': 1, 'LIGHT': 2}
rush_hour_map = {'True': 1, 'False': 0}
season_map = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}

# replace() leaves values that are not keys untouched, so these encodings can be
# applied repeatedly; the same three steps run on the training and the test frame.
for df in [traffic_train, traffic_test]:
    df['LUMINOSITY'] = df['LUMINOSITY'].replace(luminosity_map).astype(int)
    df['is_rush_hour'] = df['is_rush_hour'].replace(rush_hour_map).astype(int)
    df['season'] = df['season'].replace(season_map).astype(int)

  traffic_train['LUMINOSITY'] = traffic_train['LUMINOSITY'].replace(luminosity_map).astype(int)
  traffic_test['LUMINOSITY'] = traffic_test['LUMINOSITY'].replace(luminosity_map).astype(int)
  traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
  traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
  traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)
  traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)


**Tratamento de Outliers**
 -> Aqui substituem-se os outliers pelos valores mais próximos permitidos, definidos pelos limites de percentis (winsorização).

**A fazer (testar combinações)**
- Gráficos de caixas de bigodes no dataPreparation para ver os outliers.
- Substituir outliers pela média ou assim e normalizar.
- Fazer só com outliers sem winsorize.
- Quantis puros.

In [18]:
# Outlier Treatment
from scipy.stats.mstats import winsorize

# Column -> [lower, upper] limits passed to winsorize for that column.
winsor_limits = {
    'AVERAGE_FREE_FLOW_TIME': [0.01, 0.01],
    'AVERAGE_FREE_FLOW_SPEED': [0.05, 0.01],
    'AVERAGE_TEMPERATURE': [0.01, 0.02],
    'AVERAGE_ATMOSP_PRESSURE': [0.05, 0.015],
    'AVERAGE_WIND_SPEED': [0.01, 0.03],
    'AVERAGE_HUMIDITY': [0.03, 0.01],
}

# Each frame is winsorized on its own, column by column, in the order listed above.
for df in (traffic_train, traffic_test):
    for column, limits in winsor_limits.items():
        df[column] = winsorize(df[column], limits=limits)

In [19]:
def is_outlier(s: pd.Series, n_std: float = 1.0) -> pd.Series:
    """Flag the values of ``s`` that lie outside ``mean ± n_std * std``.

    Parameters
    ----------
    s : pd.Series
        Numeric series to inspect.
    n_std : float, default 1.0
        Half-width of the accepted band, in standard deviations. The default
        keeps the original one-standard-deviation band.

    Returns
    -------
    pd.Series
        Boolean series aligned with ``s``; True marks a value outside the
        inclusive band [mean - n_std * std, mean + n_std * std].
    """
    centre = s.mean()
    spread = n_std * s.std()
    lower_limit = centre - spread
    upper_limit = centre + spread
    # between() is inclusive on both ends, so values exactly on a limit are not flagged.
    return ~s.between(lower_limit, upper_limit)

# Boolean series marking the training rows whose AVERAGE_TIME_DIFF is flagged by is_outlier.
time_diff_train = traffic_train['AVERAGE_TIME_DIFF']
outliers_train = time_diff_train.transform(is_outlier)

In [20]:
# Rows of the training set that is_outlier does NOT flag on AVERAGE_TIME_DIFF.
time_diff = traffic_train['AVERAGE_TIME_DIFF']
mask = ~time_diff.transform(is_outlier)

# train_com_outliers is another name for the same traffic_train object (no copy is made);
# train_clean keeps only the rows that were not flagged as outliers.
train_com_outliers = traffic_train
train_clean = traffic_train.loc[~outliers_train]

**Modeling**

Modelação com ambos os datasets com tratamento de outliers.

In [21]:
# Modeling
# Split each training variant into a feature frame and a one-column target frame.
target_col = 'AVERAGE_SPEED_DIFF'

# Variant without the rows flagged as outliers.
X = train_clean.drop(columns=[target_col])
y = train_clean[[target_col]]

# Variant that keeps every training row.
X_c_outliers = train_com_outliers.drop(columns=[target_col])
y_c_outliers = train_com_outliers[[target_col]]

# Feature names, in training column order.
features = X_c_outliers.columns


In [22]:
# NumPy versions of both training variants (outlier-filtered and complete).
X_treino_final, y_treino_final = X.to_numpy(), y.to_numpy()
X_c_outliers_final, y_c_outliers_final = X_c_outliers.to_numpy(), y_c_outliers.to_numpy()

# Independent copy of the test frame, so later additions do not touch traffic_test.
teste_final_clean = traffic_test.copy()

**XGBoost com cross validation**

**A fazer**
- Stratified Kfold

In [23]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score

# Grid-search an XGBoost classifier on the full training set (outliers kept), predict the
# test set with the refitted best model, estimate accuracy with nested cross-validation,
# write the submission file and print the feature importances.
xgb_model = XGBClassifier(
    random_state=42,
)

# NOTE(review): eval_metric is searched like a hyper-parameter; presumably it only matters
# when an eval_set / early stopping is used — TODO confirm whether it can leave the grid.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'eval_metric': ['logloss', 'mlogloss', 'error'],
}

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The target array has shape (n, 1); flatten it once and use the same 1-D vector for
# both the grid search and the outer cross-validation (the latter used the 2-D array).
y_train_1d = np.ravel(y_c_outliers_final)

grid_search = GridSearchCV(xgb_model, param_grid, cv=inner_cv, scoring='accuracy', n_jobs=1, refit=True)

grid_search.fit(X_c_outliers_final, y_train_1d)

print(f"Best params: {grid_search.best_params_}")
print(f"Best cross-validated accuracy: {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_

# Predict on exactly the training feature columns, in training order, as a NumPy array
# like the one used for fitting. This also keeps the cell re-runnable: the Speed_Diff
# column added further down is never fed back into the model.
X_test_final = teste_final_clean[list(features)].to_numpy()
predictions = best_model.predict(X_test_final)

# Nested cross-validation: every outer fold runs the complete grid search on its training part.
scores = cross_val_score(grid_search, X_c_outliers_final, y_train_1d, cv=outer_cv, scoring='accuracy', n_jobs=-1)

print("Accuracy: %.2f%% (+/- %.2f%%)" % (scores.mean() * 100, scores.std() * 100))

# Translate the integer class predictions back into their original labels.
teste_final_clean['Speed_Diff'] = predictions
teste_final_clean['Speed_Diff'] = teste_final_clean['Speed_Diff'].map({0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very_High'})


create_submission_file(teste_final_clean,  prediction_col='Speed_Diff', filename='submission_63.csv')

feature_importances = pd.DataFrame({
    'feature': features,
    'importance': best_model.feature_importances_
}).sort_values(by='importance', ascending=False)

print(feature_importances)

Best params: {'eval_metric': 'logloss', 'max_depth': 3, 'n_estimators': 100}
Best cross-validated accuracy: 0.8067
Accuracy: 80.39% (+/- 1.11%)
Submissão criada: ../../submissions\submission_63.csv
                    feature  importance
1         AVERAGE_TIME_DIFF    0.468327
0   AVERAGE_FREE_FLOW_SPEED    0.089290
15             is_rush_hour    0.061555
8                      hour    0.061295
2    AVERAGE_FREE_FLOW_TIME    0.041677
9               day_of_week    0.041572
10                    month    0.036578
3                LUMINOSITY    0.034586
11                     year    0.031465
4       AVERAGE_TEMPERATURE    0.028366
6          AVERAGE_HUMIDITY    0.024976
13                is_friday    0.023614
7        AVERAGE_WIND_SPEED    0.021983
5   AVERAGE_ATMOSP_PRESSURE    0.019122
14               is_holiday    0.015595
12               is_weekend    0.000000
16                   season    0.000000
