In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import platform 

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import matplotlib.font_manager
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from numpy import array
from sklearn.feature_selection import chi2, RFECV
import optuna
from optuna.samplers import TPESampler
# import lightgbm as lgb

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, ElasticNet, Lars, Lasso, OrthogonalMatchingPursuit
from sklearn.linear_model import ARDRegression, BayesianRidge
from sklearn.linear_model import HuberRegressor, RANSACRegressor, TheilSenRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, 
    HistGradientBoostingRegressor, 
    IsolationForest, 
    ExtraTreesRegressor, 
    AdaBoostRegressor
    )
from sklearn.linear_model import PoissonRegressor, TweedieRegressor, GammaRegressor
import catboost
import xgboost
import lightgbm

# import umap
# from cuml.manifold import UMAP

# if platform.processor() == 'arm':
#     from sklearn.svm import SVR, NuSVR
#     from sklearn.neighbors import KNeighborsClassifier
# else:
#     from cuml.svm import SVR
#     from cuml.neighbors import KNeighborsClassifier
#     import catboost as ctb

import sklearn
import mlflow
import warnings 
warnings.filterwarnings('ignore')

In [3]:
RANDOM_SEED = 2023

In [4]:
data_folder = 'data'
train_data_fname = 'train.csv'
test_data_fname = 'test.csv'
external_data_fname = 'external_data.csv'
sample_submission_fname = 'sample_submission.csv'


train_data_path = os.path.join(data_folder, train_data_fname)
test_data_path = os.path.join(data_folder, test_data_fname)
external_data_path = os.path.join(data_folder, external_data_fname)
sample_data_path = os.path.join(data_folder, sample_submission_fname)

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
external_data = pd.read_csv(external_data_path)
sample_data = pd.read_csv(sample_data_path)

In [5]:
TARGET = 'yield'

In [6]:
train_data.shape, test_data.shape, sample_data.shape, external_data.shape

((15289, 18), (10194, 17), (10194, 2), (777, 18))

In [7]:
train_data = train_data.loc[~train_data.duplicated()]
train_data.shape

(15289, 18)

In [8]:
X, Y = train_data.drop(['id','yield'], axis=1), train_data[TARGET]

In [9]:
mlflow.end_run()

In [10]:
def cross_validate_(
    X,
    Y,
    clf,
    scaler=None,
    n_splits=5,
    shuffle=True,
    run_info=None,
    experiment_id=None,
    verbose=1,
):
    model_name = clf.__str__().split("(")[0]
    if "CatBoost" in model_name:
        model_name = "CatBoost"

    run_timestamp = datetime.now().strftime("%H%M%S%d%m%Y")

    if run_info == None:
        run_name_ = f"{model_name}_{run_timestamp}"
    else:
        run_name_ = f"{model_name}_{run_info}_{run_timestamp}"

    if experiment_id:
        try:
            mlflow.start_run(run_name=run_name_, experiment_id=experiment_id)
        except:
            mlflow.end_run()
            mlflow.start_run(run_name=run_name_, experiment_id=experiment_id)

    else:
        try:
            mlflow.start_run(run_name=run_name_)
        except:
            mlflow.end_run()
            mlflow.start_run(run_name=run_name_)

    cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=RANDOM_SEED)

    if scaler:
        pipeline = Pipeline(steps=[("scaler", scaler), ("clf", clf)])
    else:
        pipeline = Pipeline(steps=[("clf", clf)])

    metrics = cross_validate(
        estimator=pipeline,
        X=X,
        y=Y,
        cv=cv,
        scoring=[
            "neg_mean_absolute_error",
            "neg_mean_absolute_percentage_error",
            # 'neg_mean_squared_error'
        ],
    )

    # Logging
    metrics_dict = {
        "mae":
        np.mean(metrics["test_neg_mean_absolute_error"] * -1),
        "mape":
        np.mean(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "fit_time":
        np.mean(metrics["fit_time"]),
        "inf_time":
        np.mean(metrics["score_time"]),
        "min_mae":
        np.min(metrics["test_neg_mean_absolute_error"] * -1),
        "min_mape":
        np.min(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "max_mae":
        np.max(metrics["test_neg_mean_absolute_error"] * -1),
        "max_mape":
        np.max(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "std_mae":
        np.std(metrics["test_neg_mean_absolute_error"] * -1),
        "std_mape":
        np.std(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "var_mae":
        np.var(metrics["test_neg_mean_absolute_error"] * -1),
        "var_mape":
        np.var(metrics["test_neg_mean_absolute_percentage_error"] * -1),
    }
    model_params = clf.get_params()

    mlflow.log_metrics(metrics=metrics_dict)
    mlflow.log_params(params=model_params)

    mlflow.end_run()

    if verbose:
        print(f"\n {run_name_}")
        print(
            f"\t MAE: \t {metrics_dict['mae']:.4f} \t ± {metrics_dict['std_mae']:.4f}; \t min: {metrics_dict['min_mae']:.4f} \t max: {metrics_dict['max_mae']:.4f}"
        )
        print(
            f"\t MAPE: \t {metrics_dict['mape']:.2%} \t\t ± {metrics_dict['std_mape']:.2%}; \t min: {metrics_dict['min_mape']:.2%} \t max: {metrics_dict['max_mape']:.2%}"
        )
        print(f"\t Time: \t {metrics_dict['fit_time']:.2f}s")

        print(f"{80*'_'}")


#### Initial run

In [11]:
# mlflow.create_experiment('baseline_rerun')
# 972367584723232732

In [12]:
models = [
    LinearRegression(),
    Ridge(),
    SGDRegressor(),
    ElasticNet(),
    Lars(), 
    Lasso(),
    BayesianRidge(),
    ARDRegression(), 
    HuberRegressor(),
    RANSACRegressor(),
    TheilSenRegressor(),
    KNeighborsRegressor(),
    SVR(),
    NuSVR(),
    GaussianProcessRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    HistGradientBoostingRegressor(),
    IsolationForest(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),

    lightgbm.LGBMRegressor(), 
    catboost.CatBoostRegressor(verbose=False),
    xgboost.XGBRegressor(),
    PoissonRegressor(),
    TweedieRegressor(),
    GammaRegressor()
    
]
scalers = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler(),
    PowerTransformer(),
    QuantileTransformer()
]

In [13]:

for model in tqdm(models):
    for scaler in scalers:
        if scaler:
            cross_validate_(
                X = X,
                Y = Y,
                clf = model, 
                scaler = scaler,
                n_splits=10,
                shuffle = True,
                experiment_id = '972367584723232732',
                run_info = scaler.__str__()[:-2],
                verbose=0
            )
        else:
            cross_validate_(
                X = X,
                Y = Y,
                clf = model, 
                n_splits=10,
                scaler = None,
                shuffle= True,
                experiment_id='972367584723232732',
                run_info = 'noscaler',
                verbose=0,
            )

 52%|█████▏    | 14/27 [03:13<05:38, 26.03s/it]

#### Baseline

In [47]:
model = HistGradientBoostingRegressor()
scaler = StandardScaler()
cross_validate_(
                X = X,
                Y = Y,
                clf = model, 
                scaler = scaler,
                n_splits=10,
                run_info = scaler.__str__()[:-2],
                experiment_id='972367584723232732',
                verbose=1
            )


 HistGradientBoostingRegressor_StandardScaler_13534805052023
	 MAE: 	 353.8872 	 ± 12.9959; 	 min: 338.3613 	 max: 383.0388
	 MAPE: 	 6.28% 		 ± 0.25%; 	 min: 5.87% 	 max: 6.77%
	 Time: 	 0.35
________________________________________________________________________________
