In [1]:
import pandas as pd
from utils.data_utils import load_datasets
from utils.submission_utils import *

# Load the train/test data through the project helper in utils.data_utils.
# The helper's return value is unpacked into two objects; they are used below
# as pandas DataFrames (column access, .drop, .dropna).
# NOTE(review): presumably read from the TRAIN_PATH / TEST_PATH the helper
# logs — confirm in utils.data_utils.
traffic_train, traffic_test = load_datasets()

[utils] TRAIN_PATH=../../datasets/training_data.csv
[utils] TEST_PATH=../../datasets/test_data.csv
[utils] OUTPUT_DIR=../../submissions


In [2]:
## Data preparation: date features, column pruning, missing values, and outlier removal

# Derive calendar features from the record timestamp on both frames, in place.
for df in (traffic_train, traffic_test):
    # Parse once, store the parsed column back, then read the parts from it.
    df['record_date'] = pd.to_datetime(df['record_date'])
    record_dt = df['record_date'].dt
    df['hour'] = record_dt.hour
    df['day_of_week'] = record_dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
    df['month'] = record_dt.month

# Columns removed from both frames; none of them appears in the `features`
# list used for modelling.
UNUSED_COLUMNS = ['AVERAGE_RAIN', 'AVERAGE_CLOUDINESS', 'LUMINOSITY']

# errors='ignore' keeps this step idempotent: the drop is in place, so on a
# second run of the cell the columns are already gone and a plain drop() would
# raise KeyError. Trade-off: a misspelled column name is silently skipped too.
for frame in (traffic_train, traffic_test):
    frame.drop(columns=UNUSED_COLUMNS, errors='ignore', inplace=True)

# Model inputs: traffic measurements, weather readings, and the calendar
# features derived from record_date. The column order here is the order the
# model is trained and later queried with.
features = [
    'AVERAGE_FREE_FLOW_SPEED',
    'AVERAGE_TIME_DIFF',
    'AVERAGE_FREE_FLOW_TIME',
    'AVERAGE_TEMPERATURE',
    'AVERAGE_ATMOSP_PRESSURE',
    'AVERAGE_HUMIDITY',
    'AVERAGE_WIND_SPEED',
    'hour',
    'day_of_week',
    'month',
]

# Remove every training row that has a missing value in any column
# (the test frame is left untouched).
traffic_train.dropna(inplace=True)

# Tukey-fence outlier removal, one feature at a time. The quartiles are
# recomputed on the frame as already filtered by the previous features, so the
# result depends on the order of `features`.
for column in features:
    first_quartile = traffic_train[column].quantile(0.25)
    third_quartile = traffic_train[column].quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    # between() is inclusive on both ends, i.e. fence_low <= value <= fence_high.
    inside_fences = traffic_train[column].between(fence_low, fence_high)
    traffic_train = traffic_train[inside_fences]

# Design matrix and target taken from the cleaned training frame.
X, y = traffic_train[features], traffic_train['AVERAGE_SPEED_DIFF']

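# NOTE: the cell above modifies traffic_train and traffic_test in place, so re-running it does not start again from the raw data. When iterating on the model, re-run only the cell below.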

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Hold out 20% of the cleaned training data for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

# 2 x 2 x 2 = 8 candidate configurations for the gradient boosting classifier.
gb_params = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05],
    max_depth=[3, 5],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split only; n_jobs=-1 uses all available cores. fit() returns the
# fitted search object itself.
gb_grid = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=20),
    param_grid=gb_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Report the winning configuration and its accuracy on the held-out split.
y_pred_gb = gb_grid.predict(X_val)
print("Melhor Gradient Boosting:", gb_grid.best_params_)
print("Accuracy GB:", accuracy_score(y_val, y_pred_gb))

# best_estimator_ is the model GridSearchCV refitted with the best parameters;
# it was trained on the data passed to gb_grid.fit (X_train), not on all of X.
best_model = gb_grid.best_estimator_  
# Same feature columns, in the same order, as used for training.
X_test = traffic_test[features]
# Adds the prediction column to traffic_test in place (overwritten on re-run).
traffic_test['Speed_Diff'] = best_model.predict(X_test)

# Write the submission file via the project helper.
# NOTE(review): create_submission_file presumably comes from the star import of
# utils.submission_utils — confirm. As the last expression in the cell, its
# return value is what the notebook displays as the cell result.

create_submission_file(traffic_test, prediction_col='Speed_Diff', filename='submission_gb.csv')



Melhor Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Accuracy GB: 0.7971391417425228
Submissão criada: ../../submissions/submission_gb.csv


'../../submissions/submission_gb.csv'