In [1]:
import pandas as pd
import numpy as np
from utils.ModelingUtils import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, make_scorer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Never truncate columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None

In [2]:
# Read the modeling dataset, run it through make_ml_target_classification,
# and keep only the columns listed in MODEL_FEATURES.
data = (
    pd.read_parquet('../data/final_data_to_modeling1105.parquet')
    .pipe(make_ml_target_classification)
    [MODEL_FEATURES]
)

In [3]:
# Keep every non-null cell as-is and replace every null-like cell with np.nan.
data = data.where(data.notna(), other=np.nan)

In [4]:
# Columns that must keep their missing values (they are skipped by the -1 fill below).
exclude_cols = [
    'temp', 'feelslike', 'humidity', 'dew', 'precip', 'precipprob', 'snow',
    'preciptype', 'windgust', 'visibility', 'solarradiation', 'solarenergy',
    'uvindex',
]

# For every other numeric column that contains nulls, use -1 as the sentinel value.
for col in data.columns:
    if col in exclude_cols:
        continue
    series = data[col]
    if not pd.api.types.is_numeric_dtype(series):
        continue
    if series.isnull().any():
        data[col] = series.fillna(-1)

In [5]:
# Separate the features from the 'ML_TARGET' label column.
X = data.drop(columns=['ML_TARGET'])
y = data.loc[:, 'ML_TARGET'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [6]:
# Load the previously saved preprocessing pipeline.
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
PIPELINE_PATH = 'files/pipeline_data_preprocessing.pickle'
pipe = joblib.load(PIPELINE_PATH)

In [7]:
# Apply the already-loaded pipeline (transform only, no fitting) to both splits,
# train first and then test.
X_train_processed, X_test_processed = (
    pipe.transform(split) for split in (X_train, X_test)
)

In [None]:
# Hyper-parameter grid: 3 * 3 * 2 * 2 * 3 = 108 candidate configurations.
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200]
}

# Multi-class XGBoost classifier (4 target classes). `random_state` is the
# seed parameter of the scikit-learn wrapper, so the seed is passed through it
# rather than through the untyped `seed` keyword.
xgb = XGBClassifier(objective='multi:softmax', num_class=4, random_state=123)

# The target is multi-class, so the plain 'precision' scorer (binary averaging)
# is not applicable; 'precision_macro' averages the per-class precision instead.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='precision_macro',
    n_jobs=-1,
)

# Fit on the split that X_train_processed was derived from. Do NOT re-split
# X / y here: a new split would replace y_train (and X_test / y_test) with rows
# that no longer line up with the already-transformed feature matrices.
grid_search.fit(X_train_processed, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [None]:
# Report the tuned hyper-parameters and the scores. GridSearchCV reports
# best_score_ and score() under the metric it was configured with, so the
# labels print that metric's name instead of hard-coding one.
scoring_name = grid_search.scoring
print("Best Parameters: ", grid_search.best_params_)
print(f"Best CV Score ({scoring_name}): ", grid_search.best_score_)
# The estimator was fitted on pipeline-transformed features, so the test set
# must be scored in the same transformed space (X_test_processed, not X_test).
print(f"Test Set Score ({scoring_name}): ", grid_search.score(X_test_processed, y_test))