In [49]:
import platform 

In [50]:
if platform.processor() != 'arm':
    from sklearnex import patch_sklearn
    patch_sklearn()

In [51]:

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import matplotlib.font_manager
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from numpy import array
from sklearn.feature_selection import chi2, RFECV
import optuna
from optuna.samplers import TPESampler
# import lightgbm as lgb

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, ElasticNet, Lars, Lasso, OrthogonalMatchingPursuit
from sklearn.linear_model import ARDRegression, BayesianRidge
from sklearn.linear_model import HuberRegressor, RANSACRegressor, TheilSenRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, 
    HistGradientBoostingRegressor, 
    IsolationForest, 
    ExtraTreesRegressor, 
    AdaBoostRegressor
    )
from sklearn.linear_model import PoissonRegressor, TweedieRegressor, GammaRegressor
import catboost
import xgboost
import lightgbm

from category_encoders import OneHotEncoder, TargetEncoder

# import umap
# from cuml.manifold import UMAP

# if platform.processor() == 'arm':
#     from sklearn.svm import SVR, NuSVR
#     from sklearn.neighbors import KNeighborsClassifier
# else:
#     from cuml.svm import SVR
#     from cuml.neighbors import KNeighborsClassifier
#     import catboost as ctb

import sklearn
import mlflow
import warnings 
warnings.filterwarnings('ignore')

In [52]:
RANDOM_SEED = 2023

In [53]:
data_folder = 'data'
train_data_fname = 'train.csv'
test_data_fname = 'test.csv'
external_data_fname = 'external_data.csv'
sample_submission_fname = 'sample_submission.csv'


train_data_path = os.path.join(data_folder, train_data_fname)
test_data_path = os.path.join(data_folder, test_data_fname)
external_data_path = os.path.join(data_folder, external_data_fname)
sample_data_path = os.path.join(data_folder, sample_submission_fname)

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
external_data = pd.read_csv(external_data_path)
sample_data = pd.read_csv(sample_data_path)

In [54]:
TARGET = 'yield'

In [55]:
train_data.shape, test_data.shape, sample_data.shape, external_data.shape

((15289, 18), (10194, 17), (10194, 2), (777, 18))

In [56]:
train_data = train_data.loc[~train_data.duplicated()]
train_data.shape

(15289, 18)

In [57]:
low_cardinality_features = (train_data.drop(['yield'], axis = 1).nunique() < 10)
low_cardinality_features = low_cardinality_features[low_cardinality_features]
low_cardinality_features = low_cardinality_features.index.tolist()
low_cardinality_features

['clonesize',
 'honeybee',
 'MaxOfUpperTRange',
 'MinOfUpperTRange',
 'AverageOfUpperTRange',
 'MaxOfLowerTRange',
 'MinOfLowerTRange',
 'AverageOfLowerTRange',
 'RainingDays',
 'AverageRainingDays']

In [58]:
mid_cardinality_features = ((train_data.drop(['yield'] , axis = 1).nunique() > 10) & (train_data.drop(['yield'], axis = 1).nunique() < 20))
mid_cardinality_features = mid_cardinality_features[mid_cardinality_features]
mid_cardinality_features = mid_cardinality_features.index.tolist()
mid_cardinality_features

['bumbles', 'andrena', 'osmia']

In [59]:
continuous_features = (train_data.drop(['yield'], axis = 1).nunique() > 20)
continuous_features = continuous_features[continuous_features]
continuous_features = continuous_features.index.tolist()
continuous_features

['id', 'fruitset', 'fruitmass', 'seeds']

In [60]:
X, Y = train_data.drop(['id','yield'], axis=1), train_data[TARGET]

In [61]:
mlflow.end_run()

In [64]:
def cross_validate_(
    X,
    Y,
    # clf,
    pipeline=None,
    n_splits=5,
    # scaler = None,
    shuffle=True,
    run_info=None,
    experiment_id=None,
    verbose=1,
):
    # model_name = clf.__str__().split("(")[0]
    model_name = pipeline.named_steps['clf'].__str__().split("(")[0]
    if "CatBoost" in model_name:
        model_name = "CatBoost"

    run_timestamp = datetime.now().strftime("%H%M%S%d%m%Y")

    if run_info == None:
        run_name_ = f"{model_name}_{run_timestamp}"
    else:
        run_name_ = f"{model_name}_{run_info}_{run_timestamp}"

    if experiment_id:
        try:
            mlflow.start_run(run_name=run_name_, experiment_id=experiment_id)
        except:
            mlflow.end_run()
            mlflow.start_run(run_name=run_name_, experiment_id=experiment_id)

    else:
        try:
            mlflow.start_run(run_name=run_name_)
        except:
            mlflow.end_run()
            mlflow.start_run(run_name=run_name_)

    cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=RANDOM_SEED)

    # if scaler:
    #     pipeline = Pipeline(steps=[("scaler", scaler), ("clf", clf)])
    # else:
    #     pipeline = Pipeline(steps=[("clf", clf)])

    metrics = cross_validate(
        estimator=pipeline,
        X=X,
        y=Y,
        cv=cv,
        scoring=[
            "neg_mean_absolute_error",
            "neg_mean_absolute_percentage_error",
            # 'neg_mean_squared_error'
        ],
    )

    # Logging
    metrics_dict = {
        "mae":
        np.mean(metrics["test_neg_mean_absolute_error"] * -1),
        "mape":
        np.mean(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "fit_time":
        np.mean(metrics["fit_time"]),
        "inf_time":
        np.mean(metrics["score_time"]),
        "min_mae":
        np.min(metrics["test_neg_mean_absolute_error"] * -1),
        "min_mape":
        np.min(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "max_mae":
        np.max(metrics["test_neg_mean_absolute_error"] * -1),
        "max_mape":
        np.max(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "std_mae":
        np.std(metrics["test_neg_mean_absolute_error"] * -1),
        "std_mape":
        np.std(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "var_mae":
        np.var(metrics["test_neg_mean_absolute_error"] * -1),
        "var_mape":
        np.var(metrics["test_neg_mean_absolute_percentage_error"] * -1),
    }
    # model_params = clf.get_params()
    model_params = pipeline.named_steps['clf'].get_params()

    mlflow.log_metrics(metrics=metrics_dict)
    mlflow.log_params(params=model_params)

    mlflow.end_run()

    if verbose:
        print(f"\n {run_name_}")
        print(
            f"\t MAE: \t {metrics_dict['mae']:.4f} \t ± {metrics_dict['std_mae']:.4f}; \t min: {metrics_dict['min_mae']:.4f} \t max: {metrics_dict['max_mae']:.4f}"
        )
        print(
            f"\t MAPE: \t {metrics_dict['mape']:.2%} \t\t ± {metrics_dict['std_mape']:.2%}; \t min: {metrics_dict['min_mape']:.2%} \t max: {metrics_dict['max_mape']:.2%}"
        )
        print(f"\t Time: \t {metrics_dict['fit_time']:.2f}s")

        print(f"{80*'_'}")


#### Initial run

In [11]:
# mlflow.create_experiment('baseline_rerun')
# 972367584723232732

In [12]:
models = [
    LinearRegression(),
    Ridge(),
    SGDRegressor(),
    ElasticNet(),
    Lars(), 
    Lasso(),
    BayesianRidge(),
    ARDRegression(), 
    HuberRegressor(),
    RANSACRegressor(),
    TheilSenRegressor(),
    KNeighborsRegressor(),
    SVR(),
    NuSVR(),
    GaussianProcessRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    HistGradientBoostingRegressor(),
    IsolationForest(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),

    lightgbm.LGBMRegressor(), 
    catboost.CatBoostRegressor(verbose=False),
    xgboost.XGBRegressor(),
    PoissonRegressor(),
    TweedieRegressor(),
    GammaRegressor()
    
]
scalers = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler(),
    PowerTransformer(),
    QuantileTransformer()
]

In [13]:

for model in tqdm(models):
    for scaler in scalers:
        if scaler:
            cross_validate_(
                X = X,
                Y = Y,
                clf = model, 
                scaler = scaler,
                n_splits=10,
                shuffle = True,
                experiment_id = '972367584723232732',
                run_info = scaler.__str__()[:-2],
                verbose=0
            )
        else:
            cross_validate_(
                X = X,
                Y = Y,
                clf = model, 
                n_splits=10,
                scaler = None,
                shuffle= True,
                experiment_id='972367584723232732',
                run_info = 'noscaler',
                verbose=0,
            )

 52%|█████▏    | 14/27 [03:13<05:38, 26.03s/it]

#### Baseline

In [65]:
pipeline = Pipeline([
    # ('ohe', OneHotEncoder(cols = low_cardinality_features))
    ('clf', LinearRegression())
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = scaler.__str__()[:-2],
                verbose = 1
            )


 LinearRegression_StandardScaler_16513705052023
	 MAE: 	 371.7057 	 ± 12.6379; 	 min: 359.0105 	 max: 401.0413
	 MAPE: 	 6.64% 		 ± 0.24%; 	 min: 6.30% 	 max: 7.10%
	 Time: 	 0.01s
________________________________________________________________________________


In [72]:
pipeline = Pipeline([
    ('ohe', OneHotEncoder(cols = low_cardinality_features)),
    ('scaler', StandardScaler()),
    ('clf', HuberRegressor())
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = scaler.__str__()[:-2],
                verbose = 1
            )


 HuberRegressor_StandardScaler_16570705052023
	 MAE: 	 366.2967 	 ± 12.7244; 	 min: 354.3803 	 max: 398.5282
	 MAPE: 	 6.49% 		 ± 0.24%; 	 min: 6.16% 	 max: 7.02%
	 Time: 	 0.40s
________________________________________________________________________________


In [73]:
pipeline = Pipeline([
    ('ohe', OneHotEncoder(cols = low_cardinality_features)),
    ('scaler', StandardScaler()),
])

In [68]:
pipeline.fit(X, Y)

In [74]:
pipeline.fit_transform(X, Y)

array([[ 9.24302960e-01, -8.85210574e-01, -1.32809808e-01, ...,
        -1.04493821e+00, -7.83259616e-01, -9.18904668e-01],
       [ 9.24302960e-01, -8.85210574e-01, -1.32809808e-01, ...,
        -7.77454130e-01, -6.61588364e-01, -5.72229990e-01],
       [-1.08189635e+00,  1.12967471e+00, -1.32809808e-01, ...,
         6.74658543e-01,  6.56156954e-01,  5.40028603e-01],
       ...,
       [ 9.24302960e-01, -8.85210574e-01, -1.32809808e-01, ...,
        -1.07805142e+00, -8.03776960e-01, -9.59051027e-01],
       [ 9.24302960e-01, -8.85210574e-01, -1.32809808e-01, ...,
         5.30055766e-01, -3.35372413e-01,  1.26345340e-01],
       [ 9.24302960e-01, -8.85210574e-01, -1.32809808e-01, ...,
        -1.43358446e-01,  6.23179116e-04, -2.65499695e-01]])

In [70]:
pipeline.named_steps['ohe'].transform(X)

Unnamed: 0,clonesize_1,clonesize_2,clonesize_3,clonesize_4,clonesize_5,clonesize_6,honeybee_1,honeybee_2,honeybee_3,honeybee_4,...,AverageRainingDays_2,AverageRainingDays_3,AverageRainingDays_4,AverageRainingDays_5,AverageRainingDays_6,AverageRainingDays_7,AverageRainingDays_8,fruitset,fruitmass,seeds
0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.425011,0.417545,32.460887
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.444908,0.422051,33.858317
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.552927,0.470853,38.341781
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.565976,0.478137,39.467561
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.579677,0.494165,40.484512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15284,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0.556302,0.476308,40.546480
15285,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0.354413,0.388145,29.467434
15286,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0.422548,0.416786,32.299059
15287,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.542170,0.434133,36.674243


#### Feature engineering 

In [None]:
from category_encoders import OneHotEncoder, TargetEncoder, 