In [28]:
import pandas as pd
import numpy as np
from utils.ModelingUtils import *

from utils.PreprocessingUtils import get_route_names, fix_polish_chars
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, make_scorer, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
import joblib

pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_parquet('../data/final_data_to_modeling1105.parquet')
data = make_ml_target_regression(data)[MODEL_FEATURES]

In [3]:
data = data.where(pd.notnull(data), np.nan)

In [4]:
exclude_cols = ['temp', 'feelslike', 'humidity', 'dew', 'precip', 'precipprob', 'snow', 'preciptype', 'windgust', 'visibility', 'solarradiation', 'solarenergy', 'uvindex',]

for col in data.columns:
    if pd.api.types.is_numeric_dtype(data[col]) and col not in exclude_cols and data[col].isnull().any():
        data[col] = data[col].fillna(-1)

In [5]:
y = data['ML_TARGET'].values
X = data.drop('ML_TARGET', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

numeric_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(exclude=['number']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ])

categorical_transformer = Pipeline(steps=[
    ('transform_preciptype', CustomTransformer(column='preciptype', function=transform_preciptype)),
    ('transform_conditions', CustomTransformer(column='conditions', function=transform_conditions)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipe = Pipeline([
    ('preprocessor', preprocessor),
])

In [6]:
pipe.fit(X_train)

In [7]:
joblib.dump(pipe, f'files/pipeline_data_preprocessing.pickle')

['files/pipeline_data_preprocessing.pickle']

In [8]:
X_train_processed = pipe.transform(X_train)
X_test_processed = pipe.transform(X_test)

In [9]:
# Get column names after transformation
transformed_numeric_columns = preprocessor.named_transformers_['num'].named_steps['imputer'].get_feature_names_out(input_features=numeric_features)
transformed_categorical_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(input_features=categorical_features)

In [12]:
transformed_columns = list(transformed_numeric_columns) + list(transformed_categorical_columns)

In [13]:
#Create DataFrame with processed data and column names
processed_df_train = pd.DataFrame(X_train_processed, columns=transformed_columns)
processed_df_test = pd.DataFrame(X_test_processed, columns=transformed_columns)

processed_df_train = pd.concat([processed_df_train, pd.Series(y_train).rename('ML_TARGET')], axis=1)
processed_df_test = pd.concat([processed_df_test, pd.Series(y_test).rename('ML_TARGET')], axis=1)

processed_df_train.to_parquet(f'../data/preprocessed_for_modeling/train_data_1205.parquet')
processed_df_test.to_parquet(f'../data/preprocessed_for_modeling/test_data_1205.parquet')

In [25]:
# Define the XGBRegressor model
xgb_model = XGBRegressor()

# Define the parameters grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# Define mean squared error as the evaluation metric
mse_scorer = make_scorer(mean_squared_error)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           scoring=mse_scorer,
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

# Perform grid search cross-validation
grid_search.fit(X_train_processed, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [26]:
# Print the best parameters and the corresponding MSE
print("Best Parameters: ", grid_search.best_params_)
print("Best MSE Score on Training Data: ", grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_processed)
mse_test = mean_squared_error(y_test, y_pred)
print("MSE Score on Test Data: ", mse_test)

Best Parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best MSE Score on Training Data:  92.43317447542685
MSE Score on Test Data:  93.55381303925277


In [29]:


mse_test = mean_absolute_error(y_test, y_pred)
print("MAE Score on Test Data: ", mse_test)

MAE Score on Test Data:  3.9677941420369987


In [31]:
feature_importances = best_model.feature_importances_

array([0.        , 0.05742499, 0.0045023 , 0.        , 0.        ,
       0.14255932, 0.0043502 , 0.01205255, 0.0671249 , 0.08942919,
       0.        , 0.0366019 , 0.        , 0.        , 0.        ,
       0.0651065 , 0.        , 0.        , 0.00915503, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00930441, 0.03871274, 0.05116314, 0.04786247, 0.04922304,
       0.04247646, 0.03690142, 0.        , 0.        , 0.        ,
       0.0363699 , 0.02237812, 0.        , 0.        , 0.04231522,
       0.0280104 , 0.03342962, 0.01646234, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.037073  , 0.02001089,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [35]:
pd.set_option('display.max_rows',None)
pd.DataFrame({
    'name':  transformed_columns,
    'importance': feature_importances
}).sort_values('importance')

Unnamed: 0,name,importance
0,station_count_on_curr_station,0.0
66,preciptype_rain_snow,0.0
65,gestosc_zaludnienia_1km2_powiat,0.0
64,ludnosc_powiat,0.0
63,powierzchnia_km2_powiat,0.0
62,gestosc_zaludnienia_1km2_gmina,0.0
61,ludnosc_gmina,0.0
60,powierzchnia_km2_gmina,0.0
59,switches_odometer_powiat,0.0
67,preciptype_snow,0.0


In [36]:
joblib.dump(best_model, f'models/best_regressor1205.pickle')

['models/best_regressor1205.pickle']