In [2]:
import platform 

In [3]:
if platform.processor() != 'arm':
    from sklearnex import patch_sklearn
    patch_sklearn()

In [4]:

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import matplotlib.font_manager
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from numpy import array
from sklearn.feature_selection import chi2, RFECV
import optuna
from optuna.samplers import TPESampler
# import lightgbm as lgb

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, ElasticNet, Lars, Lasso, OrthogonalMatchingPursuit
from sklearn.linear_model import ARDRegression, BayesianRidge
from sklearn.linear_model import HuberRegressor, RANSACRegressor, TheilSenRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, 
    HistGradientBoostingRegressor, 
    IsolationForest, 
    ExtraTreesRegressor, 
    AdaBoostRegressor
    )
from sklearn.linear_model import PoissonRegressor, TweedieRegressor, GammaRegressor
import catboost
import xgboost
import lightgbm

from category_encoders import OneHotEncoder, TargetEncoder, LeaveOneOutEncoder

# import umap
# from cuml.manifold import UMAP

# if platform.processor() == 'arm':
#     from sklearn.svm import SVR, NuSVR
#     from sklearn.neighbors import KNeighborsClassifier
# else:
#     from cuml.svm import SVR
#     from cuml.neighbors import KNeighborsClassifier
#     import catboost as ctb

import sklearn
import mlflow
import warnings 
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# test_experiment_id = mlflow.create_experiment('Test')
test_experiment_id = '482311471011279799'

In [6]:
RANDOM_SEED = 2023

In [7]:
data_folder = 'data'
train_data_fname = 'train.csv'
test_data_fname = 'test.csv'
external_data_fname = 'external_data.csv'
sample_submission_fname = 'sample_submission.csv'


train_data_path = os.path.join(data_folder, train_data_fname)
test_data_path = os.path.join(data_folder, test_data_fname)
external_data_path = os.path.join(data_folder, external_data_fname)
sample_data_path = os.path.join(data_folder, sample_submission_fname)

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
external_data = pd.read_csv(external_data_path)
sample_data = pd.read_csv(sample_data_path)

In [8]:
TARGET = 'yield'

In [9]:
train_data.shape, test_data.shape, sample_data.shape, external_data.shape

((15289, 18), (10194, 17), (10194, 2), (777, 18))

In [10]:
train_data.drop('id', axis='columns', inplace=True)
test_data.drop('id', axis=1, inplace = True)

In [11]:
train_data = train_data.loc[~train_data.duplicated()]
train_data.shape

(15282, 17)

In [10]:
low_cardinality_features = (train_data.drop(['yield'], axis = 1).nunique() < 10)
low_cardinality_features = low_cardinality_features[low_cardinality_features]
low_cardinality_features = low_cardinality_features.index.tolist()
low_cardinality_features

['clonesize',
 'honeybee',
 'MaxOfUpperTRange',
 'MinOfUpperTRange',
 'AverageOfUpperTRange',
 'MaxOfLowerTRange',
 'MinOfLowerTRange',
 'AverageOfLowerTRange',
 'RainingDays',
 'AverageRainingDays']

In [11]:
mid_cardinality_features = ((train_data.drop(['yield'] , axis = 1).nunique() > 10) & (train_data.drop(['yield'], axis = 1).nunique() < 20))
mid_cardinality_features = mid_cardinality_features[mid_cardinality_features]
mid_cardinality_features = mid_cardinality_features.index.tolist()
mid_cardinality_features

['bumbles', 'andrena', 'osmia']

In [12]:
continuous_features = (train_data.drop(['yield'], axis = 1).nunique() > 20)
continuous_features = continuous_features[continuous_features]
continuous_features = continuous_features.index.tolist()
continuous_features

['fruitset', 'fruitmass', 'seeds']

#### Define X and Y

In [13]:
train_data.reset_index(drop=True, inplace=True)
train_data.columns = train_data.columns.str.lower()
X, Y = train_data.drop(['yield'], axis=1), train_data[TARGET]

In [None]:
# Just in case a run has failed 
mlflow.end_run()

#### Validation loop

In [15]:
def cross_validate_(
    X,
    Y,
    # clf,
    pipeline=None,
    n_splits=5,
    # scaler = None,
    shuffle=True,
    run_info=None,
    experiment_id=None,
    verbose=1,
):
    # model_name = clf.__str__().split("(")[0]
    model_name = pipeline.named_steps['clf'].__str__().split("(")[0]
    if "CatBoost" in model_name:
        model_name = "CatBoost"

    run_timestamp = datetime.now().strftime("%H%M%S%d%m%Y")

    if run_info == None:
        run_name_ = f"{model_name}_{run_timestamp}"
    else:
        run_name_ = f"{model_name}_{run_info}_{run_timestamp}"

    if experiment_id:
        try:
            mlflow.start_run(run_name=run_name_, experiment_id=experiment_id)
        except:
            mlflow.end_run()
            mlflow.start_run(run_name=run_name_, experiment_id=experiment_id)

    else:
        try:
            mlflow.start_run(run_name=run_name_)
        except:
            mlflow.end_run()
            mlflow.start_run(run_name=run_name_)

    cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=RANDOM_SEED)

    # if scaler:
    #     pipeline = Pipeline(steps=[("scaler", scaler), ("clf", clf)])
    # else:
    #     pipeline = Pipeline(steps=[("clf", clf)])

    metrics = cross_validate(
        estimator=pipeline,
        X=X,
        y=Y,
        cv=cv,
        scoring=[
            "neg_mean_absolute_error",
            "neg_mean_absolute_percentage_error",
            # 'neg_mean_squared_error'
        ],
    )

    # Logging
    metrics_dict = {
        "mae":
        np.mean(metrics["test_neg_mean_absolute_error"] * -1),
        "mape":
        np.mean(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "fit_time":
        np.mean(metrics["fit_time"]),
        "inf_time":
        np.mean(metrics["score_time"]),
        "min_mae":
        np.min(metrics["test_neg_mean_absolute_error"] * -1),
        "min_mape":
        np.min(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "max_mae":
        np.max(metrics["test_neg_mean_absolute_error"] * -1),
        "max_mape":
        np.max(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "std_mae":
        np.std(metrics["test_neg_mean_absolute_error"] * -1),
        "std_mape":
        np.std(metrics["test_neg_mean_absolute_percentage_error"] * -1),
        "var_mae":
        np.var(metrics["test_neg_mean_absolute_error"] * -1),
        "var_mape":
        np.var(metrics["test_neg_mean_absolute_percentage_error"] * -1),
    }
    # model_params = clf.get_params()
    model_params = pipeline.named_steps['clf'].get_params()

    mlflow.log_metrics(metrics=metrics_dict)
    mlflow.log_params(params=model_params)

    mlflow.end_run()

    if verbose:
        print(f"\n {run_name_}")
        print(
            f"\t MAE: \t {metrics_dict['mae']:.4f} \t ± {metrics_dict['std_mae']:.4f}; \t min: {metrics_dict['min_mae']:.4f} \t max: {metrics_dict['max_mae']:.4f}"
        )
        print(
            f"\t MAPE: \t {metrics_dict['mape']:.2%} \t\t ± {metrics_dict['std_mape']:.2%}; \t min: {metrics_dict['min_mape']:.2%} \t max: {metrics_dict['max_mape']:.2%}"
        )
        print(f"\t Time: \t {metrics_dict['fit_time']:.2f}s")

        print(f"{80*'_'}")


#### Initial run

In [28]:
# mlflow.create_experiment('baseline_rerun')
# 972367584723232732

In [29]:
models = [
    LinearRegression(),
    Ridge(),
    SGDRegressor(),
    ElasticNet(),
    Lars(), 
    Lasso(),
    BayesianRidge(),
    ARDRegression(), 
    HuberRegressor(),
    RANSACRegressor(),
    TheilSenRegressor(),
    KNeighborsRegressor(),
    SVR(),
    NuSVR(),
    GaussianProcessRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    HistGradientBoostingRegressor(),
    IsolationForest(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
    lightgbm.LGBMRegressor(), 
    catboost.CatBoostRegressor(verbose=False),
    xgboost.XGBRegressor(),
    PoissonRegressor(),
    TweedieRegressor(),
    GammaRegressor()
    
]
scalers = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler(),
    PowerTransformer(),
    QuantileTransformer()
]

In [30]:

# for model in tqdm(models):
#     for scaler in scalers:
#         if scaler:
#             cross_validate_(
#                 X = X,
#                 Y = Y,
#                 clf = model, 
#                 scaler = scaler,
#                 n_splits=10,
#                 shuffle = True,
#                 experiment_id = '972367584723232732',
#                 run_info = scaler.__str__()[:-2],
#                 verbose=0
#             )
#         else:
#             cross_validate_(
#                 X = X,
#                 Y = Y,
#                 clf = model, 
#                 n_splits=10,
#                 scaler = None,
#                 shuffle= True,
#                 experiment_id='972367584723232732',
#                 run_info = 'noscaler',
#                 verbose=0,
#             )

#### Baseline

In [31]:
pipeline = Pipeline([
    # ('ohe', OneHotEncoder(cols = low_cardinality_features))
    ('clf', lightgbm.LGBMRegressor())
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = None,
                verbose = 1
            )


 LGBMRegressor_21133405052023
	 MAE: 	 354.1594 	 ± 11.9917; 	 min: 340.3411 	 max: 377.4922
	 MAPE: 	 6.29% 		 ± 0.24%; 	 min: 5.93% 	 max: 6.68%
	 Time: 	 0.26s
________________________________________________________________________________


In [32]:
pipeline = Pipeline([
    # ('ohe', OneHotEncoder(cols = low_cardinality_features))
    ('clf', HistGradientBoostingRegressor())
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = None,
                verbose = 1
            )


 HistGradientBoostingRegressor_21142405052023
	 MAE: 	 353.2118 	 ± 12.9764; 	 min: 340.8332 	 max: 382.2670
	 MAPE: 	 6.27% 		 ± 0.25%; 	 min: 5.92% 	 max: 6.74%
	 Time: 	 0.55s
________________________________________________________________________________


In [72]:
pipeline = Pipeline([
    ('ohe', OneHotEncoder(cols = low_cardinality_features)),
    ('scaler', StandardScaler()),
    ('clf', HuberRegressor())
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = scaler.__str__()[:-2],
                verbose = 1
            )


 HuberRegressor_StandardScaler_16570705052023
	 MAE: 	 366.2967 	 ± 12.7244; 	 min: 354.3803 	 max: 398.5282
	 MAPE: 	 6.49% 		 ± 0.24%; 	 min: 6.16% 	 max: 7.02%
	 Time: 	 0.40s
________________________________________________________________________________


### Feature enineering

In [None]:
models = [
    LinearRegression(),
    Ridge(),
    SGDRegressor(),
    ElasticNet(),
    Lars(), 
    Lasso(),
    BayesianRidge(),
    ARDRegression(), 
    HuberRegressor(),
    RANSACRegressor(),
    TheilSenRegressor(),
    KNeighborsRegressor(),
    SVR(),
    NuSVR(),
    GaussianProcessRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    HistGradientBoostingRegressor(),
    IsolationForest(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
    lightgbm.LGBMRegressor(), 
    catboost.CatBoostRegressor(verbose=False),
    xgboost.XGBRegressor(),
    PoissonRegressor(),
    TweedieRegressor(),
    GammaRegressor()
]

#### FE_1

In [61]:
mlflow.create_experiment(name = 'FE_1')

'387500007015104142'

In [58]:

columns_of_interest = [*mid_cardinality_features, *continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_of_interest)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', OneHotEncoder(cols = low_cardinality_features)),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_1',
                verbose = 1
            )


 HuberRegressor_FE_1_22323605052023
	 MAE: 	 366.4569 	 ± 12.2825; 	 min: 348.0209 	 max: 389.7244
	 MAPE: 	 6.50% 		 ± 0.27%; 	 min: 6.12% 	 max: 6.99%
	 Time: 	 0.33s
________________________________________________________________________________


In [63]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', OneHotEncoder(cols = low_cardinality_features)),
        ('scaling', ct), 
        ('clf', model)
    ])
    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_1',
                experiment_id = '387500007015104142',
                verbose = 0
            )


100%|██████████| 27/27 [06:27<00:00, 14.36s/it]


#### FE_2

In [64]:
experiment_id = mlflow.create_experiment(name = 'FE_2')

In [65]:
columns_to_be_scaled = [*continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', OneHotEncoder(cols = [*low_cardinality_features, *mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_2',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_2_23514005052023
	 MAE: 	 366.5738 	 ± 11.9880; 	 min: 347.9648 	 max: 389.2351
	 MAPE: 	 6.50% 		 ± 0.26%; 	 min: 6.13% 	 max: 6.98%
	 Time: 	 0.42s
________________________________________________________________________________


In [67]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', OneHotEncoder(cols = [*low_cardinality_features, *mid_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_2',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [09:10<00:00, 20.40s/it]


#### FE_3

In [68]:
experiment_id = mlflow.create_experiment(name = 'FE_3')

In [70]:
columns_to_be_scaled = [*continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', TargetEncoder(cols = [*low_cardinality_features, *mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_3',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_3_00023606052023
	 MAE: 	 802.8663 	 ± 129.1879; 	 min: 419.9551 	 max: 885.3595
	 MAPE: 	 15.31% 		 ± 2.63%; 	 min: 7.61% 	 max: 17.20%
	 Time: 	 0.19s
________________________________________________________________________________


In [71]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', TargetEncoder(cols = [*low_cardinality_features, *mid_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_3',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [06:03<00:00, 13.47s/it]


#### FE_4

In [72]:
experiment_id = mlflow.create_experiment(name = 'FE_4')

In [77]:
columns_to_be_scaled = [*continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', LeaveOneOutEncoder(cols = [*low_cardinality_features, *mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_4',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_4_00113006052023
	 MAE: 	 728.3016 	 ± 163.8783; 	 min: 468.0907 	 max: 885.5168
	 MAPE: 	 13.80% 		 ± 3.33%; 	 min: 8.42% 	 max: 17.15%
	 Time: 	 0.14s
________________________________________________________________________________


In [78]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', LeaveOneOutEncoder(cols = [*low_cardinality_features, *mid_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_4',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [06:08<00:00, 13.66s/it]


#### FE_5|

In [79]:
experiment_id = mlflow.create_experiment(name = 'FE_5')

In [80]:
columns_to_be_scaled = [*continuous_features, *mid_cardinality_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', TargetEncoder(cols = [*low_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_5',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_5_00482506052023
	 MAE: 	 443.0820 	 ± 13.1775; 	 min: 423.8685 	 max: 470.2143
	 MAPE: 	 7.90% 		 ± 0.27%; 	 min: 7.49% 	 max: 8.31%
	 Time: 	 0.19s
________________________________________________________________________________


In [81]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', TargetEncoder(cols = [*low_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_5',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [06:10<00:00, 13.73s/it]


#### FE_6

In [82]:
experiment_id = mlflow.create_experiment(name = 'FE_6')

In [83]:
columns_to_be_scaled = [*continuous_features, *mid_cardinality_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', LeaveOneOutEncoder(cols = [*low_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_6',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_6_00545506052023
	 MAE: 	 441.0685 	 ± 13.1213; 	 min: 423.1919 	 max: 463.5341
	 MAPE: 	 7.86% 		 ± 0.30%; 	 min: 7.42% 	 max: 8.39%
	 Time: 	 0.16s
________________________________________________________________________________


In [84]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', LeaveOneOutEncoder(cols = [*low_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_6',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [06:34<00:00, 14.61s/it]


#### FE_7

In [85]:
experiment_id = mlflow.create_experiment(name = 'FE_7')

In [91]:
columns_to_be_scaled = [*continuous_features, *low_cardinality_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', TargetEncoder(cols = [*mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_7',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_7_01185306052023
	 MAE: 	 423.4191 	 ± 13.8015; 	 min: 403.0938 	 max: 448.0353
	 MAPE: 	 7.59% 		 ± 0.32%; 	 min: 7.13% 	 max: 8.12%
	 Time: 	 0.14s
________________________________________________________________________________


In [92]:
pipeline = Pipeline([
    ('ohe', TargetEncoder(cols = [*mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', lightgbm.LGBMRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_7',
                experiment_id=experiment_id,
                verbose = 1
            )


 LGBMRegressor_FE_7_01193206052023
	 MAE: 	 353.9356 	 ± 10.8769; 	 min: 335.5217 	 max: 377.2859
	 MAPE: 	 6.29% 		 ± 0.24%; 	 min: 5.89% 	 max: 6.82%
	 Time: 	 0.13s
________________________________________________________________________________


In [87]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', TargetEncoder(cols = [*mid_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_7',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [05:51<00:00, 13.04s/it]


#### FE_8

In [88]:
experiment_id = mlflow.create_experiment(name = 'FE_8')

In [89]:
columns_to_be_scaled = [*continuous_features, *low_cardinality_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', LeaveOneOutEncoder(cols = [*mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', HuberRegressor())
])

# Validate pipeline
cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_8',
                experiment_id=experiment_id,
                verbose = 1
            )


 HuberRegressor_FE_8_01072606052023
	 MAE: 	 423.7994 	 ± 14.2340; 	 min: 401.2343 	 max: 448.7224
	 MAPE: 	 7.60% 		 ± 0.31%; 	 min: 7.05% 	 max: 8.14%
	 Time: 	 0.12s
________________________________________________________________________________


In [90]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('ohe', LeaveOneOutEncoder(cols = [*mid_cardinality_features])),
        ('scaling', ct), 
        ('clf', model)
    ])


    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_8',
                experiment_id = experiment_id,
                verbose = 0
            )


100%|██████████| 27/27 [05:58<00:00, 13.27s/it]


#### FE_9
Removing collinear features

In [None]:
models = [
    LinearRegression(),
    Ridge(),
    SGDRegressor(loss = 'huber', penalty='l1'),
    ElasticNet(l1_ratio=.9),
    Lars(), 
    Lasso(warm_start=True),
    BayesianRidge(),
    ARDRegression(), 
    HuberRegressor(warm_start=True),
    RANSACRegressor(),
    TheilSenRegressor(),
    KNeighborsRegressor(),
    SVR(),
    NuSVR(),
    GaussianProcessRegressor(),
    DecisionTreeRegressor(criterion='absolute_error'),
    RandomForestRegressor(criterion='absolute_error'),
    HistGradientBoostingRegressor(loss = 'absolute_error'),
    IsolationForest(warm_start=True),
    ExtraTreesRegressor(criterion='absolute_error'),
    AdaBoostRegressor(loss = 'linear'),
    lightgbm.LGBMRegressor(objective = 'mae'), 
    catboost.CatBoostRegressor(loss_function = 'MAE', verbose=False),
    xgboost.XGBRegressor(objective = 'reg:squarederror'),
    PoissonRegressor(solver = 'newton-cholesky'),
    TweedieRegressor(solver = 'newton-cholesky'),
    GammaRegressor(solver = 'newton-cholesky' )
]

In [None]:
experiment_id = mlflow.create_experiment(name = 'FE_9')

In [29]:
train_data.reset_index(drop=True, inplace=True)
train_data.columns = train_data.columns.str.lower()
test_data.columns = test_data.columns.str.lower()


In [30]:
_, rainingdays_bins = pd.cut(train_data.rainingdays, bins = train_data.rainingdays.nunique() + 1, retbins=True, right = True)
rainingdays_bins = [0, 1, *rainingdays_bins[1:]]
rainingdays_bins

[0,
 1,
 5.714285714285714,
 10.428571428571429,
 15.142857142857142,
 19.857142857142858,
 24.571428571428573,
 29.285714285714285,
 34.0]

In [31]:
train_data['binned_rainingdays'] = pd.cut(train_data.rainingdays, bins = rainingdays_bins, retbins=False, right = True).cat.codes
test_data['binned_rainingdays'] = pd.cut(test_data.rainingdays, bins = rainingdays_bins, retbins=False, right = True).cat.codes

In [32]:
train_data

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,maxofuppertrange,minofuppertrange,averageofuppertrange,maxoflowertrange,minoflowertrange,averageoflowertrange,rainingdays,averagerainingdays,fruitset,fruitmass,seeds,yield,binned_rainingdays
0,25.0,0.50,0.25,0.75,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146,5
1,25.0,0.50,0.25,0.50,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201,5
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.77760,5
3,12.5,0.25,0.25,0.63,0.50,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.77590,5
4,25.0,0.50,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15277,12.5,0.25,0.25,0.38,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.556302,0.476308,40.546480,7667.83619,4
15278,12.5,0.25,0.25,0.25,0.50,86.0,52.0,71.9,62.0,30.0,50.8,34.0,0.56,0.354413,0.388145,29.467434,3680.56025,7
15279,25.0,0.50,0.25,0.38,0.75,77.4,46.8,64.7,55.8,27.0,45.8,34.0,0.56,0.422548,0.416786,32.299059,4696.44394,7
15280,25.0,0.50,0.25,0.63,0.63,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.542170,0.434133,36.674243,6772.93347,5


In [33]:
columns_to_be_dropped = [
        'rainingdays', 
        'averagerainingdays',
        'maxofuppertrange',
        'minofuppertrange',
        'averageofuppertrange',
        'maxoflowertrange',
        'minoflowertrange',
        'averageoflowertrange',
        'fruitset',
        'fruitmass']

In [34]:
train_subset = train_data.drop(columns_to_be_dropped, axis = 1)
test_subset = test_data.drop(columns_to_be_dropped, axis = 1)

In [38]:
train_subset.columns

Index(['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'seeds',
       'yield', 'binned_rainingdays'],
      dtype='object')

In [37]:
X, Y = train_subset.drop(['yield'], axis=1), train_data[TARGET]

In [40]:
columns_of_interest = ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'seeds']

In [39]:
columns_to_be_scaled = [*columns_of_interest]
ct = ColumnTransformer(
        transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder = 'passthrough')



NameError: name 'columns_of_interest' is not defined

In [None]:
for model in tqdm(models):
    pipeline = Pipeline([
        ('scaling', ct), 
        ('clf', model)
    ])

    cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_9',
                experiment_id = experiment_id,
                verbose = 0
            )



### Hyperparameter tuning

In [146]:
# FE 7
columns_to_be_scaled = [*continuous_features, *low_cardinality_features]
ct = ColumnTransformer(
        transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder = 'passthrough')

In [147]:

def objective(trial):
    
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    pipeline = Pipeline([
        ('target_encoder', TargetEncoder(cols = [*mid_cardinality_features])),
        ('scaling', ct), 
    ])

    cv = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
    cv_scores = np.empty(10)
    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, Y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[test_idx]
        Y_train, Y_valid = Y[train_idx], Y[test_idx]

        X_train = pipeline.fit_transform(X_train, Y_train)
        X_valid = pipeline.transform(X_valid)

        model = lightgbm.LGBMRegressor(objective="mae", **params)
        
        model.fit(
            X_train,
            Y_train,
            eval_set=[(X_valid, Y_valid)],
            eval_metric="mean_absolute_error",
            early_stopping_rounds=100,
            verbose=0
        )

        Y_pred = model.predict(X_valid)
        metric = mean_absolute_error(Y_valid, Y_pred)
        cv_scores[idx] = metric

    return np.mean(cv_scores)


In [148]:
study = optuna.create_study(
    direction='minimize')

[32m[I 2023-05-06 03:03:27,187][0m A new study created in memory with name: no-name-8c1f4be8-851b-480d-8913-36f2d0307595[0m


In [None]:
study.optimize(objective, n_trials=1000)

In [19]:
# best_params_lgbm = study.best_params
best_params_lgbm = {
    'n_estimators': 10000,
    'learning_rate': 0.05062637076615562,
    'num_leaves': 2460,
    'max_depth': 6,
    'min_data_in_leaf': 200,
    'lambda_l1': 0,
    'lambda_l2': 5,
    'min_gain_to_split': 0.4683462591354549,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'feature_fraction': 0.7
}

#### Submissions

#### LGBM + FE 7

In [20]:
columns_to_be_scaled = [*continuous_features, *low_cardinality_features]
ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', TargetEncoder(cols = [*mid_cardinality_features])),
    ('scaling', ct), 
    ('clf', lightgbm.LGBMRegressor(objective = 'mae', **best_params_lgbm))
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'FE_7_test',
                experiment_id=test_experiment_id,
                verbose = 1
            )


 LGBMRegressor_FE_7_test_13182106052023
	 MAE: 	 348.7927 	 ± 10.2941; 	 min: 330.7703 	 max: 370.1816
	 MAPE: 	 6.20% 		 ± 0.23%; 	 min: 5.80% 	 max: 6.68%
	 Time: 	 7.09s
________________________________________________________________________________


In [22]:
columns_to_be_scaled = [*continuous_features, *low_cardinality_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('ohe', TargetEncoder(cols = [*mid_cardinality_features])),
    ('scaling', ct), 
])

model = lightgbm.LGBMRegressor(objective='mae', **best_params_lgbm)

X_train = pipeline.fit_transform(X, Y)
X_test = pipeline.transform(test_data)

model.fit(X_train, Y)
predictions = model.predict(X_test)

sample_data['yield'] = predictions
sample_data.to_csv('predictions/fe_7_lgbm.csv', index=False)



In [24]:
sample_data

Unnamed: 0,id,yield
0,15289,4275.540612
1,15290,5925.288404
2,15291,7335.517747
3,15292,4579.411151
4,15293,3798.513373
...,...,...
10189,25478,5390.993074
10190,25479,5593.679409
10191,25480,6501.064780
10192,25481,4387.668311


Base boosting machines

In [29]:
columns_to_be_scaled = [*low_cardinality_features, *mid_cardinality_features,  *continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('scaling', ct), 
    ('clf', lightgbm.LGBMRegressor(objective = 'mae'))
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'standardscaler',
                experiment_id=test_experiment_id,
                verbose = 1
            )


 LGBMRegressor_standardscaler_14252606052023
	 MAE: 	 343.2072 	 ± 11.1412; 	 min: 325.0423 	 max: 365.2969
	 MAPE: 	 6.05% 		 ± 0.25%; 	 min: 5.65% 	 max: 6.55%
	 Time: 	 0.37s
________________________________________________________________________________


In [25]:
columns_to_be_scaled = [*low_cardinality_features, *mid_cardinality_features,  *continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('scaling', ct), 
    ('clf', HistGradientBoostingRegressor(loss = 'absolute_error'))
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'standardscaler',
                experiment_id=test_experiment_id,
                verbose = 1
            )


 HistGradientBoostingRegressor_standardscaler_13583906052023
	 MAE: 	 344.5012 	 ± 11.1876; 	 min: 324.7196 	 max: 366.3490
	 MAPE: 	 6.07% 		 ± 0.25%; 	 min: 5.64% 	 max: 6.56%
	 Time: 	 0.69s
________________________________________________________________________________


In [28]:
columns_to_be_scaled = [*low_cardinality_features, *mid_cardinality_features,  *continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('scaling', ct), 
    ('clf', catboost.CatBoostRegressor(loss_function='MAE'))
])

cross_validate_(
                X = X,
                Y = Y,
                pipeline = pipeline,
                n_splits = 10,
                run_info = 'standardscaler',
                experiment_id=test_experiment_id,
                verbose = 1
            )

0:	learn: 1066.7657996	total: 52.4ms	remaining: 52.3s
1:	learn: 1043.3965324	total: 56.7ms	remaining: 28.3s
2:	learn: 1020.5357121	total: 64.8ms	remaining: 21.6s
3:	learn: 998.3631159	total: 67.6ms	remaining: 16.8s
4:	learn: 974.9976719	total: 69.4ms	remaining: 13.8s
5:	learn: 955.4410335	total: 71ms	remaining: 11.8s
6:	learn: 936.4021644	total: 73.5ms	remaining: 10.4s
7:	learn: 916.8083763	total: 74.8ms	remaining: 9.27s
8:	learn: 897.3669265	total: 77.1ms	remaining: 8.49s
9:	learn: 878.3966874	total: 78.8ms	remaining: 7.8s
10:	learn: 859.7729289	total: 80.7ms	remaining: 7.25s
11:	learn: 843.0981458	total: 82.4ms	remaining: 6.79s
12:	learn: 828.4112382	total: 84ms	remaining: 6.38s
13:	learn: 811.6345095	total: 85ms	remaining: 5.99s
14:	learn: 796.7094232	total: 86.8ms	remaining: 5.7s
15:	learn: 780.8514558	total: 88.2ms	remaining: 5.42s
16:	learn: 765.9746099	total: 89.4ms	remaining: 5.17s
17:	learn: 751.2978768	total: 90.7ms	remaining: 4.95s
18:	learn: 739.0422745	total: 92.1ms	remain

In [26]:
columns_to_be_scaled = [*low_cardinality_features, *mid_cardinality_features,  *continuous_features]

ct = ColumnTransformer(
    transformers = [
        ('scaler', StandardScaler(), columns_to_be_scaled)
    ], remainder= 'passthrough'
)

pipeline = Pipeline([
    ('scaling', ct), 
])

model = HistGradientBoostingRegressor(loss = 'absolute_error')

X_train = pipeline.fit_transform(X, Y)
X_test = pipeline.transform(test_data)

model.fit(X_train, Y)
predictions = model.predict(X_test)

sample_data['yield'] = predictions
sample_data.to_csv('predictions/hgbr_base.csv', index=False)