In [2]:
import pandas as pd
from utils.submission_utils import *

traffic_train = pd.read_csv("../../datasets/training_data.csv", keep_default_na=False, encoding='latin1')
traffic_test = pd.read_csv("../../datasets/test_data.csv", keep_default_na=False, encoding='latin1')

[utils] OUTPUT_DIR=../../submissions


In [3]:
## Section for Data Treatment

for df in [traffic_train, traffic_test]:
    df['record_date'] = pd.to_datetime(df['record_date'])
    df['hour'] = df['record_date'].dt.hour
    df['day_of_week'] = df['record_date'].dt.dayofweek
    df['month'] = df['record_date'].dt.month

traffic_train.drop(['AVERAGE_RAIN','AVERAGE_CLOUDINESS','LUMINOSITY'], axis = 1, inplace=True)
traffic_test.drop(['AVERAGE_RAIN','AVERAGE_CLOUDINESS','LUMINOSITY'], axis = 1, inplace=True)

features = ['AVERAGE_FREE_FLOW_SPEED', 'AVERAGE_TIME_DIFF', 'AVERAGE_FREE_FLOW_TIME',
            'AVERAGE_TEMPERATURE', 'AVERAGE_ATMOSP_PRESSURE',
            'AVERAGE_HUMIDITY', 'AVERAGE_WIND_SPEED', 'hour', 'day_of_week', 'month']

traffic_train.dropna(inplace=True)

for col in features:
    Q1 = traffic_train[col].quantile(0.25)
    Q3 = traffic_train[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    traffic_train = traffic_train[(traffic_train[col] >= lower_bound) & (traffic_train[col] <= upper_bound)]

X = traffic_train[features]
y = traffic_train['AVERAGE_SPEED_DIFF']

# Only re-run the following box :)

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

speed_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}
y = traffic_train['AVERAGE_SPEED_DIFF'].map(speed_map).astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=20)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=2025)

lr_model = LogisticRegression(random_state=2025, max_iter=1000, multi_class='auto')
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}
grid_lr = GridSearchCV(lr_model, param_grid_lr, cv=inner_cv, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train, y_train)
print("Best Logistic Regression params:", grid_lr.best_params_)
print("Best Logistic Regression score:", grid_lr.best_score_)

rf_model = RandomForestClassifier(random_state=2025)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_rf = GridSearchCV(rf_model, param_grid_rf, cv=inner_cv, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)
print("Best Random Forest params:", grid_rf.best_params_)
print("Best Random Forest score:", grid_rf.best_score_)

xgb_model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=2025,
    use_label_encoder=False
)
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
grid_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=inner_cv, scoring='accuracy', n_jobs=-1)
grid_xgb.fit(X_train, y_train)
print("Best XGBoost params:", grid_xgb.best_params_)
print("Best XGBoost score:", grid_xgb.best_score_)

voting_clf = VotingClassifier(
    estimators=[
        ('lr', grid_lr.best_estimator_),
        ('rf', grid_rf.best_estimator_),
        ('xgb', grid_xgb.best_estimator_)
    ],
    voting='soft',  
    n_jobs=-1
)


voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_val)

acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy do VotingClassifier: {acc:.4f}")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

Best Logistic Regression params: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Logistic Regression score: 0.7768651413561899
Best Random Forest params: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best Random Forest score: 0.7944997580700905


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best XGBoost params: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best XGBoost score: 0.8029639870049078


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy do VotingClassifier: 0.7961


## Bagging

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

best_lr = LogisticRegression(
    C=grid_lr.best_params_['C'],
    solver=grid_lr.best_params_['solver'],
    penalty='l2',
    max_iter=1000,
    random_state=2025
)

bg_model = BaggingClassifier(
    estimator=best_lr,
    n_estimators=50,   
    max_samples=0.8,
    bootstrap=True,
    n_jobs=-1,         
    random_state=2025
)

bg_model.fit(X_train, y_train)

y_pred = bg_model.predict(X_val)

acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy do Bagging: {acc:.4f}")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

Validation Accuracy do Bagging: 0.7744


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Stacking

In [7]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ('lr', LogisticRegression(C=grid_lr.best_params_['C'],
                              solver=grid_lr.best_params_['solver'],
                              max_iter=1000,
                              random_state=2025)),
    ('rf', RandomForestClassifier(n_estimators=200,
                                  max_depth=10,
                                  random_state=2025)),
    ('xgb', XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=2025))
]

meta_model = LogisticRegression(max_iter=1000, random_state=2025)

stacking = StackingClassifier(estimators=estimators,final_estimator= meta_model,cv=5)

stacking.fit(X_train,y_train)
y_pred = stacking.predict(X_val)
accuracy = accuracy_score(y_val,y_pred)
print(f"Accuracy do modelo Stacking: {accuracy:.4f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increa

Accuracy do modelo Stacking: 0.7942
