In [None]:
import os
import gc
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
# import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# Reading the data

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the sample submission that shows the expected output layout.
train_data, test_data, sub_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
        '/kaggle/input/playground-series-s4e5/sample_submission.csv',
    )
)

## train data

In [None]:
# Preview the first rows of the training set (features plus the target column).
train_data.head()

In [None]:
# Column dtypes and non-null counts; used to check for object columns and missing values.
train_data.info()

- No object-dtype columns.
- No missing values.

In [None]:
# Number of fully duplicated rows in the training set.
train_data.duplicated().sum()

## test data

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Column dtypes and non-null counts of the test set.
test_data.info()

## submission data

In [None]:
# Preview the sample submission to see the expected output columns.
sub_data.head()

# Build the LightGBM model

In [None]:
# Target to predict, and the feature matrix with the row identifier and the
# target itself removed.
target_df = train_data.loc[:, 'FloodProbability']
train_df = train_data.drop(columns=['id', 'FloodProbability'])

In [None]:
# Keep the feature column labels; they are reused to rebuild a DataFrame
# after scaling, which returns a plain array.
cols = train_df.columns

In [None]:
# One box per feature column, to eyeball the spread and outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_df, ax=ax)
ax.set_title('Boxplot of Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
# Column names are long, so print them vertically.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
def replace_outliers_with_boundary(df, n_std=3):
    """Clip every column to ``mean ± n_std * std`` of that column.

    Values below the lower bound are replaced by the lower bound and values
    above the upper bound by the upper bound; everything else is unchanged.
    The bounds are computed from ``df`` itself, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to clip. The frame is NOT modified; a new frame is
        returned.
    n_std : float, default 3
        Half-width of the accepted band, in sample standard deviations.

    Returns
    -------
    pandas.DataFrame
        A clipped copy of ``df`` with the same index and columns.
    """
    column_mean = df.mean()
    column_std = df.std()
    lower_bound = column_mean - n_std * column_std
    upper_bound = column_mean + n_std * column_std

    # axis=1 aligns the per-column bounds with the frame's columns; clip()
    # returns a new frame, so the caller's data is left untouched.
    return df.clip(lower=lower_bound, upper=upper_bound, axis=1)

# Replace outliers with the boundary values.
# NOTE: train_df is overwritten with its clipped version, so re-running this
# cell clips again using the mean/std of the already-clipped data.
train_df = replace_outliers_with_boundary(train_df)

In [None]:
# Same per-column boxplot as before, now drawn from the clipped features.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_df, ax=ax)
ax.set(title='Boxplot of Each Column', xlabel='Columns', ylabel='Values')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Min-max scale the features. fit_transform returns a bare array, so the
# column labels saved in `cols` are re-attached afterwards. The fitted
# `scaler` is kept so the same scaling can be applied to other data.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train_df)
train_df = pd.DataFrame(data=scaled_values, columns=cols)

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R² of a LightGBM regressor.

    Every hyperparameter is sampled under the exact keyword that
    ``LGBMRegressor`` accepts, so ``study.best_params`` can be unpacked
    directly into ``LGBMRegressor(**study.best_params)`` and reproduces the
    configuration that was actually scored here. (Previously the trial names
    and the model keywords differed, an unused ``gamma`` was sampled, and a
    classification-only ``class_weight`` was applied during tuning only.)

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold R² scores on the module-level ``train_df`` /
        ``target_df``.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 50000),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 15),
        # NOTE(review): LightGBM only applies row subsampling when
        # subsample_freq > 0, which is not set here — confirm whether tuning
        # `subsample` is intended to have an effect.
        'subsample': trial.suggest_float('subsample', 0, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        # NOTE(review): 0.5-1.5 is an unusually high learning-rate range for
        # gradient boosting — confirm it is intentional.
        'learning_rate': trial.suggest_float('learning_rate', 0.5, 1.5),
    }

    regressor = LGBMRegressor(**params, verbosity=-1)

    cv_results = cross_validate(regressor, train_df, target_df, cv=5, scoring='r2')

    return np.mean(cv_results['test_score'])

In [None]:
# Search for the hyperparameters that give the highest objective value
# (direction="maximize"), using 5 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

In [None]:
# Table of all trials, ranked from the highest objective value to the lowest.
df_study = study.trials_dataframe().sort_values(by='value', ascending=False)

df_study.head()

In [None]:
# Hyperparameters of the best trial, keyed by the names passed to
# trial.suggest_* in the objective.
best_params_LGBM = study.best_params

print(best_params_LGBM)

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averaging ensemble over a list of already-fitted regressors."""

    def __init__(self, estimators):
        """Store the estimators whose predictions will be averaged.

        Parameters
        ----------
        estimators : list
            Objects exposing ``predict(X)``. They are used as given; this
            class never fits them.
        """
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped estimators are used as-is."""
        return self

    def predict(self, X):
        """Return the element-wise mean of every estimator's prediction for ``X``."""
        per_model_predictions = []
        for fitted_estimator in self.estimators:
            per_model_predictions.append(fitted_estimator.predict(X))
        return np.mean(per_model_predictions, axis=0)

In [None]:
# Train one LightGBM regressor per fold with the tuned hyperparameters,
# early-stopping on that fold's validation split, and keep every fitted model
# so their predictions can be averaged afterwards.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fitted_models = []
cv_scores = []

for idx_train, idx_valid in kf.split(train_df):
    X_train, y_train = train_df.iloc[idx_train], target_df.iloc[idx_train]
    X_valid, y_valid = train_df.iloc[idx_valid], target_df.iloc[idx_valid]

    # A separate name for the per-fold model keeps `model` reserved for the
    # final ensemble built after the loop.
    fold_model = lgb.LGBMRegressor(**best_params_LGBM)
    fold_model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(50), lgb.early_stopping(50)]
    )
    fitted_models.append(fold_model)

    # Out-of-fold R² for this split.
    cv_scores.append(r2_score(y_valid, fold_model.predict(X_valid)))

# Ensemble that averages the predictions of the five fold models.
model = VotingModel(fitted_models)
# The scores are R² values (the old label said "AUC").
print("CV R2 scores: ", cv_scores)
print("Mean CV R2: ", np.mean(cv_scores))

# Hold-out baseline: a single LightGBM regressor with hand-picked
# hyperparameters, scored on a 20% split. It lives under its own name
# (`baseline_model`) so that it does not overwrite the cross-validated
# ensemble stored in `model`.

# separate the data
X_train, X_test, y_train, y_test = train_test_split(train_df, target_df, test_size=0.2, random_state=42)

# set hyperparameters
params = {
    'objective': 'regression',
    # 'r2' is not one of LightGBM's built-in metrics; R² is computed below
    # with scikit-learn instead.
    'metric': 'rmse',
    'learning_rate': 0.1,
    # NOTE(review): 50000 trees with no early stopping is a very long fit —
    # confirm this is intended.
    'n_estimators': 50000,
    'max_depth': 10,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'num_leaves': 64
}

# instantiation
baseline_model = lgb.LGBMRegressor(**params)

# fit & predict
baseline_model.fit(X_train, y_train)
y_pred = baseline_model.predict(X_test)

# calc r2 score
# Stored under a new name: assigning to `r2_score` would shadow the imported
# function and break every later call to it (including cell re-runs).
holdout_r2 = r2_score(y_test, y_pred)
print('---------------------------')
print("r2 score:", holdout_r2)

# plot feature importance
# plot_importance is a module-level LightGBM function, not an estimator method.
lgb.plot_importance(baseline_model)

# submission

In [None]:
# The models were fitted on features that were first clipped to
# mean ± 3·std and then min-max scaled, so the test features have to go
# through the same two steps before predicting. The clipping bounds are
# recomputed from the raw training features (train_df itself has been
# overwritten by its transformed version), and the already-fitted `scaler`
# is reused.
raw_train_features = train_data[cols]
feature_mean = raw_train_features.mean()
feature_std = raw_train_features.std()

# Selecting by `cols` also guarantees the same column order as in training.
test_df = test_data[cols].clip(
    lower=feature_mean - 3 * feature_std,
    upper=feature_mean + 3 * feature_std,
    axis=1,
)
test_df = pd.DataFrame(scaler.transform(test_df), columns=cols, index=test_data.index)
y_pred_sub = model.predict(test_df)

# submission df
sub = pd.DataFrame({
    'id': test_data['id'],
    'FloodProbability': y_pred_sub,
})

sub.head()

In [None]:
# Write the submission into the current working directory.
sub.to_csv('submission.csv', index=False)

# The extra copy is only written when its target directory exists: pandas
# raises OSError for a missing directory, and nothing in this notebook
# creates '/kaggle/output'.
if os.path.isdir('/kaggle/output'):
    sub.to_csv('/kaggle/output/submission.csv', index=False)