## Setup

In [1]:
import numpy as np 
import pandas as pd 
import os, gc
import lightgbm as lgb
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler,RobustScaler
import joblib
from sklearn.metrics import log_loss
from lightgbm import LGBMRegressor,log_evaluation
from sklearn.preprocessing import StandardScaler
import optuna
from sklearn.ensemble import BaggingRegressor
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor

train_df = pd.read_csv('/kaggle/input/widsdatathon2023/train_data.csv')
test_df = pd.read_csv('/kaggle/input/widsdatathon2023/test_data.csv')
submission = pd.read_csv('/kaggle/input/widsdatathon2023/sample_solution.csv')

def startdate_info(train, test):
    train_df.dropna(inplace=True)
    train_df['startdate'] = pd.to_datetime(train_df["startdate"])
    train_df['year'] = train_df['startdate'].dt.year
    train_df['month'] =train_df['startdate'].dt.month
    train_df['day_of_year'] = train_df['startdate'].dt.dayofyear

    test_df['startdate'] = pd.to_datetime(test_df["startdate"])
    test_df['year'] = test_df['startdate'].dt.year
    test_df['month'] =test_df['startdate'].dt.month
    test_df['day_of_year'] = test_df['startdate'].dt.dayofyear
    return train, test

def categorical_encode(train, test):
    le = LabelEncoder()
    train['climateregions__climateregion'] = le.fit_transform(train['climateregions__climateregion'])
    test['climateregions__climateregion'] = le.transform(test['climateregions__climateregion'])
    return train, test

train_df,test_df= startdate_info(train_df,test_df)
train_df,test_df=categorical_encode(train_df,test_df)

ycol ='contest-tmp2m-14d__tmp2m'
features =list(filter(lambda x: x not in [ycol, 'index','startdate'], train_df.columns))
target=train_df['contest-tmp2m-14d__tmp2m']

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
def run_model(train_df, test_df):
    # Split the training set into two parts, for use in ensembling
    train_df1, train_df2 = train_test_split(train_df, test_size=0.5, random_state=42)
    T1=train_df1['contest-tmp2m-14d__tmp2m']
    T2=train_df2['contest-tmp2m-14d__tmp2m']
    
    scaler = StandardScaler()
    train_df1 = train_df1[features]
    train_df2 = train_df2[features]
    test_df = test_df[features]
    train_df1[features] = scaler.fit_transform(train_df1[features])
    train_df2[features] = scaler.transform(train_df2[features])
    test_df[features] = scaler.transform(test_df[features])
    
    # Define the search spaces for the different CatBoostRegressor models
    search_spaces = [    
        {        
            'learning_rate': Real(0.001, 0.1, 'log-uniform'),        
            'depth': Integer(3, 6),        
            'l2_leaf_reg': Real(1e-3, 1e-1, 'log-uniform'),        
            'random_strength': Real(10, 50, 'log-uniform'),        
            'grow_policy': Categorical(['Lossguide', 'Depthwise']),
            'max_bin': Integer(100, 300),
            'min_data_in_leaf': Integer(10, 50),
            'bootstrap_type': Categorical(['Bayesian']),
        },
        {
            'learning_rate': Real(0.01, 0.3, 'log-uniform'),
            'depth': Integer(6, 10),
            'l2_leaf_reg': Real(1e-5, 1e-3, 'log-uniform'),
            'random_strength': Real(5, 20, 'log-uniform'),
            'grow_policy': Categorical(['Lossguide']),
            'max_bin': Integer(200, 400),
            'min_data_in_leaf': Integer(5, 20),
            'bootstrap_type': Categorical(['Bayesian', 'Bernoulli']),
        },
        {
            'learning_rate': Real(0.01, 0.1, 'log-uniform'),
            'depth': Integer(3, 6),
            'l2_leaf_reg': Real(1e-3, 1e-1, 'log-uniform'),
            'random_strength': Real(10, 50, 'log-uniform'),
            'grow_policy': Categorical(['Lossguide', 'Depthwise']),
            'max_bin': Integer(100, 300),
            'min_data_in_leaf': Integer(10, 50),
            'bootstrap_type': Categorical(['Bayesian']),
        }
    ]

    # Train multiple CatBoostRegressor models with different hyperparameters
    models = []
    for i, search_space in enumerate(search_spaces):
        opt = BayesSearchCV(
            estimator=CatBoostRegressor(verbose=500,task_type="GPU", devices='0:1'),
            search_spaces=search_space,
            scoring='neg_mean_squared_error',
            cv=KFold(n_splits=5, shuffle=True, random_state=2022),
            n_jobs=1,
            n_iter=20,
            verbose=1,
            random_state=42
        )
        opt.fit(train_df1, T1)
        best_params = opt.best_params_
        model = CatBoostRegressor(verbose=500, **best_params,task_type="GPU", devices='0:1')
        model.fit(train_df1[features], T1)
        models.append(model)
    
    # Combine the predictions from the individual models using a StackingRegressor
    stacker = StackingRegressor(
        estimators=[('model%d' % i, model) for i, model in enumerate(models)],
        final_estimator=CatBoostRegressor(verbose=500,task_type="GPU", devices='0:1'),
        cv=KFold(n_splits=5, shuffle=True, random_state=2022),
        passthrough=True
    )
    
    
    
    
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    rmse = 0
    for fold, (train_index, val_index) in enumerate(kf.split(train_df2)):
        print("Fold {}".format(fold))
        X_train, X_val = train_df2.iloc[train_index], train_df2.iloc[val_index]
        y_train, y_val = T2.iloc[train_index], T2.iloc[val_index]
        stacker.fit(X_train, y_train)
        fold_predictions =stacker.predict(X_val)
        rmse += np.sqrt(mean_squared_error(y_val, fold_predictions)) / kf.n_splits
        print("CV RMSE:", rmse)
        oof[val_index] = fold_predictions
        predictions += stacker.predict(test_df) / kf.n_splits

    return predictions


In [3]:
pred= run_model(train_df, test_df)
submission[ycol] = pred
submission.to_csv('submission.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
0:	learn: 9.7311944	total: 123ms	remaining: 2m 2s
500:	learn: 0.9471450	total: 50.9s	remaining: 50.7s
999:	learn: 0.7496999	total: 1m 39s	remaining: 0us
0:	learn: 9.7487273	total: 85.7ms	remaining: 1m 25s
500:	learn: 0.9476800	total: 49.7s	remaining: 49.5s
999:	learn: 0.7497913	total: 1m 36s	remaining: 0us
0:	learn: 9.7249062	total: 143ms	remaining: 2m 22s
500:	learn: 0.9469696	total: 49.3s	remaining: 49.1s
999:	learn: 0.7483871	total: 1m 37s	remaining: 0us
0:	learn: 9.7356132	total: 101ms	remaining: 1m 40s
500:	learn: 0.9485159	total: 50.2s	remaining: 50s
999:	learn: 0.7487685	total: 1m 38s	remaining: 0us
0:	learn: 9.7428429	total: 79ms	remaining: 1m 18s
500:	learn: 0.9404855	total: 50.6s	remaining: 50.4s
999:	learn: 0.7461619	total: 1m 38s	remaining: 0us
Fitting 5 folds for each of 1 candidates, totalling 5 fits
0:	learn: 9.4319333	total: 63.9ms	remaining: 1m 3s
500:	learn: 0.5627659	total: 28s	remaining: 27.9s
999:	learn: 0.

## Preprocessing

In [4]:
"""
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot

from joblib import Parallel, delayed
import numpy as np

model = CatBoostRegressor(verbose=1000,task_type='GPU', devices='0:1')

k_range = range(5, len(features)+1)
param_grid = dict(n_features_to_select=k_range)
grid = GridSearchCV(RFE(model), param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(train_df[features], train_df[ycol])

mask = grid.best_estimator_.support_
df = train_df[feature_names]
feature_names = df.columns[:][mask]
k = grid.best_params_['n_features_to_select']

X_new = train_df[feature_names]

def parallel_cv_scores(model, X, y, cv):
    return np.average(Parallel(n_jobs=-1)(delayed(cross_val_score)(model, X, y, cv=cv_fold, scoring='neg_mean_squared_error') for cv_fold in cv))

cv = 5
scores = parallel_cv_scores(model, X_new, train_df[ycol], cv)

print("Best number of features:", k)
print("Selected features:", feature_names)
print("Cross-validation mean squared error: %0.2f" % (-scores))
"""

'\nfrom sklearn.datasets import make_regression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_selection import mutual_info_regression\nfrom sklearn.feature_selection import RFE\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom matplotlib import pyplot\n\nfrom joblib import Parallel, delayed\nimport numpy as np\n\nmodel = CatBoostRegressor(verbose=1000,task_type=\'GPU\', devices=\'0:1\')\n\nk_range = range(5, len(features)+1)\nparam_grid = dict(n_features_to_select=k_range)\ngrid = GridSearchCV(RFE(model), param_grid, cv=5, scoring=\'neg_mean_squared_error\', n_jobs=-1)\ngrid.fit(train_df[features], train_df[ycol])\n\nmask = grid.best_estimator_.support_\ndf = train_df[feature_names]\nfeature_names = df.columns[:][mask]\nk = grid.best_params_[\'n_features_to_select\']\n\nX_new = train_df[feature_names]\n\ndef parallel_cv_scores(model, X, y, cv):\n  

In [5]:
#train_df=train_df[features]
#test_df=test_df[features]

In [6]:
submission

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,28.978540,375734
1,28.879029,375735
2,29.080993,375736
3,29.630383,375737
4,29.631249,375738
...,...,...
31349,3.498991,407083
31350,2.895121,407084
31351,2.455746,407085
31352,2.230769,407086
