In [None]:
import os
import gc
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# Reading the data

In [None]:
# All competition files live in one Kaggle input directory; keep it in a single
# place so the notebook can be pointed at another location by editing one line.
DATA_DIR = Path('/kaggle/input/playground-series-s4e5')

train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')
sub_data = pd.read_csv(DATA_DIR / 'sample_submission.csv')

## train data

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train_data.info()

- No object-dtype columns
- No missing values

In [None]:
# Count rows that exactly duplicate an earlier row. `id` is still a column of
# `train_data` here, so rows that differ only by id are not counted.
train_data.duplicated().sum()

## test data

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Column dtypes and non-null counts for the test frame.
test_data.info()

## submission data

In [None]:
# Preview the sample submission file that was loaded alongside train/test.
sub_data.head()

# Build the LightGBM model

In [None]:
# Split the raw training frame into the regression target and the predictors
# (everything except the row key `id` and the target itself).
target_df = train_data['FloodProbability']
train_df = train_data.drop(columns=['id', 'FloodProbability'])

In [None]:
# Raw feature names. NOTE(review): `cols` is reassigned after feature engineering
# and is not read anywhere before that, so this early value is effectively unused.
cols = train_df.columns

In [None]:
# One box per feature column of `train_df`, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_df, ax=ax)
ax.set_title('Boxplot of Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
ax.tick_params(axis='x', labelrotation=90)  # long column names: rotate to keep them legible
plt.show()

In [None]:
def replace_outliers_with_boundary(df):
    """Clip every column to its own mean +/- 3 standard deviations.

    Values below ``mean - 3 * std`` are replaced by that lower bound and values
    above ``mean + 3 * std`` by that upper bound. The bounds are computed per
    column from the data passed in, before any clipping happens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index, all columns float64.
        The input frame is left untouched.
    """
    clipped = df.copy()  # do not mutate the caller's frame
    for column in clipped.columns:
        # Cast first so every column comes back as float64 regardless of
        # whether any value actually had to be clipped.
        values = clipped[column].astype('float64')
        mean = values.mean()
        std_dev = values.std()
        lower_bound = mean - 3 * std_dev
        upper_bound = mean + 3 * std_dev

        # NaN bounds (e.g. a single-row frame, where std is NaN) leave the
        # column unclipped.
        clipped[column] = values.clip(lower=lower_bound, upper=upper_bound)

    return clipped

# Clip the training features; the bounds come from the training columns themselves.
train_df = replace_outliers_with_boundary(train_df)

In [None]:
# Same boxplot as before, now showing `train_df` after outlier clipping.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_df, ax=ax)
ax.set_title('Boxplot of Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
ax.tick_params(axis='x', labelrotation=90)  # long column names: rotate to keep them legible
plt.show()

In [None]:
def add_describe(df):
    """Return a copy of ``df`` with interaction and row-wise summary features added.

    Four interaction columns are built first from named input columns. Every
    row-wise statistic is then computed over *all* columns present at that
    point, i.e. the original columns plus the four interaction columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame containing at least the columns referenced below
        (``MonsoonIntensity``, ``ClimateChange``, ``Deforestation``, ...).

    Returns
    -------
    pandas.DataFrame
        New frame: the input columns, the 4 interaction columns, then
        ``sum, std, mean, max, min, mode, median, skew, kurt``, the deciles
        ``10th`` ... ``90th``, and ``harmonic, geometric, zscore, cv,
        Skewness_75, Skewness_25, 2ndMoment, 3rdMoment, entropy``.
        The input frame is not modified.
    """
    df = df.copy()  # work on a copy so the caller's frame is never mutated

    df['ClimateAnthropogenicInteraction'] = (
        (df['MonsoonIntensity'] + df['ClimateChange'])
        * (df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments'])
    )
    df['InfrastructurePreventionInteraction'] = (
        (df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure'])
        * (df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning'])
    )
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']

    # Snapshot of the columns the statistics are computed over. `values` is an
    # independent frame, so columns added to `df` below never leak into it.
    num_cols = list(df.columns)
    values = df[num_cols]
    n_features = len(num_cols)

    row_mean = values.mean(axis=1)
    row_std = values.std(axis=1)

    df['sum'] = values.sum(axis=1)
    df['std'] = row_std
    df['mean'] = row_mean
    df['max'] = values.max(axis=1)
    df['min'] = values.min(axis=1)
    df['mode'] = values.mode(axis=1)[0]  # first mode when a row has several
    df['median'] = values.median(axis=1)
    df['skew'] = values.skew(axis=1)
    df['kurt'] = values.kurt(axis=1)

    for i in range(10, 100, 10):
        df[f'{i}th'] = values.quantile(i / 100, axis=1)

    # The remaining features are computed with vectorised column arithmetic
    # instead of a Python-level lambda per row.
    # Zeros produce inf / -inf / NaN intermediates below; that is expected, so
    # silence the corresponding NumPy warnings for this section only.
    with np.errstate(divide='ignore', invalid='ignore'):
        # True harmonic mean: n / sum(1/x) == 1 / mean(1/x). A zero in the row
        # makes mean(1/x) infinite and the harmonic mean 0.
        df['harmonic'] = 1 / (1 / values).mean(axis=1)
        df['geometric'] = values.prod(axis=1) ** (1 / n_features)
        # Mean of the row's z-scores; ~0 by construction (floating-point noise).
        df['zscore'] = values.sub(row_mean, axis=0).div(row_std, axis=0).mean(axis=1)
        df['cv'] = row_std / row_mean
        df['Skewness_75'] = (values.quantile(0.75, axis=1) - row_mean) / row_std
        df['Skewness_25'] = (values.quantile(0.25, axis=1) - row_mean) / row_std
        df['2ndMoment'] = (values ** 2).mean(axis=1)
        df['3rdMoment'] = (values ** 3).mean(axis=1)
        # 0 * log(0) evaluates to NaN and is skipped by sum(), i.e. counted as 0.
        df['entropy'] = -(values * np.log(values)).sum(axis=1)
    return df

In [None]:
# Add the interaction and row-wise summary features to the training frame.
train_df = add_describe(train_df)

In [None]:
# Column names after feature engineering; used to label the scaler outputs
# for both the training and the test frames.
cols = train_df.columns

In [None]:
# Robust-scaled copy of the engineered features. The fitted scaler is kept
# because the submission cell reuses it to transform the test features.
scaler1 = RobustScaler().fit(train_df)
df1 = pd.DataFrame(scaler1.transform(train_df), columns=cols)
df1

In [None]:
# Min-max-scaled copy of the engineered features. The fitted scaler is kept
# because the submission cell reuses it to transform the test features.
scaler2 = MinMaxScaler().fit(train_df)
df2 = pd.DataFrame(scaler2.transform(train_df), columns=cols)
df2

In [None]:
# Put both scaled copies side by side; every feature appears twice, told apart by suffix.
# NOTE(review): the suffixes are swapped relative to the scalers -- df1 comes from
# RobustScaler but receives '_minmax', df2 comes from MinMaxScaler but receives '_robust'.
# The submission cell builds the test frame with the same suffixes, so the two joins
# must be changed together if the labels are ever corrected.
train_df = df1.join(df2,how='outer',lsuffix='_minmax', rsuffix='_robust')

In [None]:
# Both scaled frames have been merged into `train_df`; drop the separate
# copies and ask the garbage collector to reclaim them.
del df1, df2
gc.collect()

In [None]:
# Append the first `n` principal components of the scaled feature matrix as
# extra columns named feature_1 ... feature_n. `pca` and `columns` are reused
# later to transform the test set.
n = 20
# Seeded so that a randomized SVD solver, if scikit-learn selects one for this
# data size, gives the same components on every run (42 matches the KFold seed).
pca = PCA(n_components=n, random_state=42)
columns = [f'feature_{j+1}' for j in range(n)]
tmp_df = pd.DataFrame(pca.fit_transform(train_df), columns=columns)
train_df = pd.concat([train_df, tmp_df], axis=1)

In [None]:
# The PCA columns are now part of `train_df`; release the temporary frame.
del tmp_df

gc.collect()

In [None]:
# Store the final training matrix as float32 (presumably to reduce memory).
# Frames passed to the fitted models later should be cast the same way.
train_df = train_df.astype('float32')

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of an LGBMRegressor.

    The trial parameter names are exactly the ``LGBMRegressor`` keyword names,
    so ``study.best_params`` can be unpacked straight into ``LGBMRegressor``
    and reproduces the tuned configuration (apart from ``device``/``verbosity``).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean R^2 over the 5 validation folds of ``train_df`` / ``target_df``.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 9, 13),
        'n_estimators': trial.suggest_int('n_estimators', 11000, 14000),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        # Sampled as an integer leaf-size constraint, so it is named after the
        # LightGBM parameter it actually sets (min_child_samples).
        'min_child_samples': trial.suggest_int('min_child_samples', 12, 18),
        # LightGBM requires both fractions to be strictly positive, hence the
        # lower bound of 0.1 instead of 0.
        # NOTE(review): `subsample` only takes effect when `subsample_freq` > 0,
        # which is not set here -- confirm whether bagging was intended.
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        # NOTE(review): 0.3-1.2 is a very aggressive range for >10k trees.
        'learning_rate': trial.suggest_float('learning_rate', 0.3, 1.2),
    }

    # `class_weight` is a classification option and is not passed to the regressor.
    clf = LGBMRegressor(**params, device='gpu', verbosity=-1)

    cv_results = cross_validate(clf, train_df, target_df, cv=5, scoring='r2')

    return float(np.mean(cv_results['test_score']))

In [None]:
# Maximise the cross-validated R^2. The sampler is seeded so the sequence of
# suggested hyperparameters is the same on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=4)

In [None]:
# All trials as a table, best objective value first.
df_study = study.trials_dataframe().sort_values(by='value', ascending=False)
df_study.head()

In [None]:
# Hyperparameters of the best trial, keyed by the names used in `objective`.
best_params_LGBM = study.best_params

print(best_params_LGBM)

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble that averages the predictions of already-fitted estimators."""

    def __init__(self, estimators):
        """Store the list of fitted estimators to average over."""
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped estimators are not refitted."""
        return self

    def predict(self, X):
        """Return the element-wise mean of every estimator's ``predict(X)``."""
        per_model = []
        for fitted in self.estimators:
            per_model.append(fitted.predict(X))
        return np.mean(per_model, axis=0)

In [None]:
# Train one LightGBM model per fold with the tuned hyperparameters and keep
# every fitted model for the averaging ensemble.
kf = KFold(n_splits=4, shuffle=True, random_state=42)
fitted_models = []
cv_scores = []

# During tuning the value sampled under the name 'min_child_weight' was applied
# as LightGBM's `min_child_samples`. Map it back so the final models use the
# configuration that was actually evaluated, instead of silently setting the
# unrelated `min_child_weight` parameter.
final_params = dict(best_params_LGBM)
if 'min_child_weight' in final_params:
    final_params['min_child_samples'] = final_params.pop('min_child_weight')

for idx_train, idx_valid in kf.split(train_df):
    X_train, y_train = train_df.iloc[idx_train], target_df.iloc[idx_train]
    X_valid, y_valid = train_df.iloc[idx_valid], target_df.iloc[idx_valid]

    model = lgb.LGBMRegressor(**final_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        # Log every 50 rounds; stop after 50 rounds without improvement.
        callbacks=[lgb.log_evaluation(50), lgb.early_stopping(50)]
    )

    fitted_models.append(model)

    # The same fold drives early stopping and scoring, so these scores are
    # slightly optimistic.
    y_pred_valid = model.predict(X_valid)
    r2 = r2_score(y_valid, y_pred_valid)
    cv_scores.append(r2)

model = VotingModel(fitted_models)
# The metric is R^2 (r2_score), not AUC.
print("CV R2 scores: ", cv_scores)

# submission

In [None]:
# Rebuild the test features with the same pipeline that produced the training matrix.
test_df = test_data.drop(['id'], axis=1)

# Training features were clipped to mean +/- 3 std of the raw training columns.
# Apply the same *training* bounds here so train and test features are comparable.
raw_train = train_data.drop(['id', 'FloodProbability'], axis=1)
lower_bounds = raw_train.mean() - 3 * raw_train.std()
upper_bounds = raw_train.mean() + 3 * raw_train.std()
test_df = test_df.astype('float64').clip(lower=lower_bounds, upper=upper_bounds, axis=1)
del raw_train

test_df = add_describe(test_df)

# Reuse the scalers and PCA fitted on the training data (transform only).
df1 = pd.DataFrame(scaler1.transform(test_df), columns=cols)
df2 = pd.DataFrame(scaler2.transform(test_df), columns=cols)
# The suffixes must stay identical to the ones used for the training join.
test_df = df1.join(df2,how='outer',lsuffix='_minmax', rsuffix='_robust')
tmp_df = pd.DataFrame(pca.transform(test_df), columns=columns)
test_df = pd.concat([test_df, tmp_df], axis=1)
# The models were trained on float32 features; use the same dtype for inference.
test_df = test_df.astype('float32')

y_pred_sub = model.predict(test_df)

# submission df
sub = pd.DataFrame({
    'id': test_data['id'],
    'FloodProbability': y_pred_sub,
})

sub.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('./submission.csv', index=False)