In [1]:
import pandas as pd
from utils.data_utils import load_datasets
from utils.submission_utils import *
import holidays
from sklearn.preprocessing import OneHotEncoder, LabelEncoder 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

traffic_train = pd.read_csv("../../datasets/training_data.csv", keep_default_na=False, encoding='latin1')
traffic_test = pd.read_csv("../../datasets/test_data.csv", keep_default_na=False, encoding='latin1')

[utils] TRAIN_PATH=../../datasets/training_data.csv
[utils] TEST_PATH=../../datasets/test_data.csv
[utils] OUTPUT_DIR=../../submissions


**Data Preparation**

Remoção de colunas não necessárias:
- city_name e AVERAGE_PRECIPITATION porque é só um valor.
- AVERAGE_CLOUDINESS por causa dos missing values.
- AVERAGE_RAIN por causa dos missing values.
- AVERAGE_HUMIDITY tem grande correlação.

In [2]:
# Drop unnecessary columns 
for df in [traffic_train, traffic_test]:
    df.drop(columns=['city_name'], inplace=True)
    df.drop(columns=['AVERAGE_PRECIPITATION'], inplace=True)
    df.drop(columns=['AVERAGE_RAIN'], inplace=True)
    df.drop(columns=['AVERAGE_CLOUDINESS'], inplace=True)

Extração de campos do atributo da data.

In [3]:
# Date Treatment
for df in [traffic_train, traffic_test]:
    df['record_date'] = pd.to_datetime(df['record_date'])
    df['hour'] = df['record_date'].dt.hour
    df['day_of_week'] = df['record_date'].dt.dayofweek
    df['month'] = df['record_date'].dt.month
    df['year'] = df['record_date'].dt.year


**Feature Engineering:**

Criação de features para acrescentar valores e atributos ao dataset:
- is_weekend por causa do fim de semana.
- is_friday por causa de ser sexta-feira.
- is_holiday por causa de ser feriado.
- is_rush_hour por causa das horas de ponta.
- season por causa da estação do ano.

In [4]:
# Feature Engineering
pt_holidays = holidays.Portugal()

for df in [traffic_train, traffic_test]:
    df['is_weekend'] = df['record_date'].dt.weekday.isin([5, 6]).astype(int)
    df['is_friday'] = (df['record_date'].dt.weekday == 4).astype(int)
    df['is_holiday'] = df['record_date'].dt.date.map(lambda d: d in pt_holidays).astype(int)
    df['is_rush_hour'] = ((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 17) & (df['hour'] <= 19)).astype(int)
    df['season'] = pd.cut(df['month'], bins=[0, 3, 6, 9, 12], labels=['Winter', 'Spring', 'Summer', 'Fall'])
    df.drop(columns=['record_date'], inplace=True)

**Categorical Encoding**

Fazer enconding de todas as features categóricas para poder trabalhar com os modelos.

In [5]:
# Categorical Encoding

print(traffic_train['AVERAGE_SPEED_DIFF'].unique())
speed_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}
traffic_train['AVERAGE_SPEED_DIFF'] = traffic_train['AVERAGE_SPEED_DIFF'].map(speed_map).astype(int)

luminosity_map = {'DARK': 0, 'LOW_LIGHT': 1, 'LIGHT': 2}
traffic_train['LUMINOSITY'] = traffic_train['LUMINOSITY'].replace(luminosity_map).astype(int)
traffic_test['LUMINOSITY'] = traffic_test['LUMINOSITY'].replace(luminosity_map).astype(int)

rush_hour_map = {'True': 1, 'False': 0}
traffic_train['is_rush_hour'] = traffic_train['is_rush_hour'].replace(rush_hour_map).astype(int)
traffic_test['is_rush_hour'] = traffic_test['is_rush_hour'].replace(rush_hour_map).astype(int)

season_map = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)

['Medium' 'High' 'None' 'Low' 'Very_High']


  traffic_train['LUMINOSITY'] = traffic_train['LUMINOSITY'].replace(luminosity_map).astype(int)
  traffic_test['LUMINOSITY'] = traffic_test['LUMINOSITY'].replace(luminosity_map).astype(int)
  traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
  traffic_train['season'] = traffic_train['season'].replace(season_map).astype(int)
  traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)
  traffic_test['season'] = traffic_test['season'].replace(season_map).astype(int)


**Tratamento de Outliers**
 -> Aqui faz-se uma substituição dos outliers pelos valores mais próximos permitidos em termos de percentis divisão.

**A fazer (testar combinações)**
- Gráficos de caixas de bigodes no dataPreparation para ver os outliers.
- Substituir outliers pela média ou assim e normalizar.
- Fazer só com outliers sem winsorize.
- Quantis puros.

In [6]:
# Outlier Treatment
from scipy.stats.mstats import winsorize

for df in [traffic_train, traffic_test]:
    df['AVERAGE_FREE_FLOW_TIME'] = winsorize(df['AVERAGE_FREE_FLOW_TIME'], limits=[0.01, 0.01])
    df['AVERAGE_FREE_FLOW_SPEED'] = winsorize(df['AVERAGE_FREE_FLOW_SPEED'], limits=[0.05, 0.01])
    df['AVERAGE_TEMPERATURE'] = winsorize(df['AVERAGE_TEMPERATURE'], limits=[0.01, 0.02])
    df['AVERAGE_ATMOSP_PRESSURE'] = winsorize(df['AVERAGE_ATMOSP_PRESSURE'], limits=[0.05, 0.015])
    df['AVERAGE_WIND_SPEED'] = winsorize(df['AVERAGE_WIND_SPEED'], limits=[0.01, 0.03])

In [7]:
def is_outlier(s): 
    lower_limit = s.mean() - s.std()
    upper_limit = s.mean() + s.std()
    return ~s.between(lower_limit, upper_limit)

outliers_train = traffic_train['AVERAGE_TIME_DIFF'].transform(is_outlier)

In [8]:
mask = ~traffic_train['AVERAGE_TIME_DIFF'].transform(is_outlier)

train_com_outliers = traffic_train
train_clean = traffic_train[~outliers_train]

**Modeling**

Modelação com ambos os datasets com tratamento de outliers.

In [9]:
# Modeling
X = train_clean.drop(['AVERAGE_SPEED_DIFF'], axis=1)
y = train_clean['AVERAGE_SPEED_DIFF'].to_frame()

X_c_outliers = train_com_outliers.drop(['AVERAGE_SPEED_DIFF'], axis=1)
y_c_outliers = train_com_outliers['AVERAGE_SPEED_DIFF'].to_frame()


In [10]:
X_treino_final = X.to_numpy()
y_treino_final = y.to_numpy()

X_c_outliers_final = X_c_outliers.to_numpy()
y_c_outliers_final = y_c_outliers.to_numpy()

teste_final_clean = traffic_test.copy()

**Random Forest com cross validation**

**A fazer**
- Stratified Kfold

In [34]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

ada_model = AdaBoostClassifier(
    random_state=42,
)

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 2, 3]]
}

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(ada_model, param_grid, cv=inner_cv, scoring='accuracy', n_jobs=1, refit=True)

grid_search.fit(X_c_outliers_final, np.ravel(y_c_outliers_final))

print(f"Best params: {grid_search.best_params_}")
print(f"Best cross-validated accuracy: {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_
predictions = best_model.predict(teste_final_clean)
scores = cross_val_score(grid_search, X_c_outliers_final, y_c_outliers_final, cv=outer_cv, scoring='accuracy', n_jobs=-1)

print("Accuracy: %.2f%% (+/- %.2f%%)" % (scores.mean() * 100, scores.std() * 100))

teste_final_clean['Speed_Diff'] = predictions
teste_final_clean['Speed_Diff'] = teste_final_clean['Speed_Diff'].map({0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very_High'})


create_submission_file(teste_final_clean,  prediction_col='Speed_Diff', filename='submission_9.csv')

Best params: {'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 0.1, 'n_estimators': 200}
Best cross-validated accuracy: 0.7832




Accuracy: 78.32% (+/- 1.54%)
Submissão criada: ../../submissions\submission_9.csv


'../../submissions\\submission_9.csv'