## **Importing Libraries**

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import warnings
import scipy
from sklearn.compose import TransformedTargetRegressor
from sklearn import set_config
from colorama import Style, Fore
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import StratifiedKFold, KFold
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, LinearRegression
from lightgbm import LGBMRegressor
from category_encoders import TargetEncoder, OneHotEncoder, MEstimateEncoder, OrdinalEncoder
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, mean_squared_log_error, r2_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, LabelBinarizer, MinMaxScaler, PolynomialFeatures, SplineTransformer
from sklearn.compose import ColumnTransformer
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from catboost import CatBoostRegressor

from great_tables import GT, style ,exibble, from_column, loc
from colorama import Style, Fore

sns.set_theme(style = 'white', palette = 'colorblind')
pal = sns.color_palette('colorblind')

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [44]:
palette = ["d9ed92","b5e48c","99d98c","76c893","52b69a","34a0a4","168aad","1a759f","1e6091","184e77"]

config = {
    'SEED' : 42,
    'N_SPLITS': 5,
    'SUBMIT' : True,
    'USE_ORIGINAL': False
    
}

In [18]:
train = pd.read_csv('./data/train.csv')

In [19]:
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [20]:
train.shape

(1117957, 22)

In [21]:
train.isnull().sum()

id                                 0
MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64

In [22]:
train.describe()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
count,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,...,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0
mean,558978.0,4.92145,4.926671,4.955322,4.94224,4.942517,4.934093,4.955878,4.927791,4.942619,...,4.946893,4.953999,4.931376,4.929032,4.925907,4.92752,4.950859,4.940587,4.939004,0.5044803
std,322726.5,2.056387,2.093879,2.072186,2.051689,2.083391,2.057742,2.083063,2.065992,2.068545,...,2.072333,2.088899,2.078287,2.082395,2.064813,2.074176,2.068696,2.081123,2.09035,0.0510261
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285
25%,279489.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,...,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,0.47
50%,558978.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.505
75%,838467.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.54
max,1117956.0,16.0,18.0,16.0,17.0,17.0,17.0,16.0,16.0,16.0,...,17.0,17.0,16.0,16.0,17.0,18.0,19.0,16.0,16.0,0.725


In [23]:
test = pd.read_csv('./data/test.csv')

In [24]:
test.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5


In [25]:
test.shape

(745305, 21)

In [26]:
test.isnull().sum()

id                                 0
MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
dtype: int64

In [27]:
test.describe()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
count,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,...,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0,745305.0
mean,1490609.0,4.91561,4.930288,4.960027,4.946084,4.938424,4.933524,4.958468,4.927651,4.945308,...,4.947436,4.944003,4.957209,4.92762,4.93072,4.926062,4.926957,4.948424,4.940204,4.943918
std,215151.2,2.056295,2.094117,2.071722,2.052602,2.081816,2.059243,2.089312,2.06811,2.073404,...,2.081322,2.072335,2.088787,2.079006,2.083348,2.065638,2.073692,2.065891,2.079128,2.087387
min,1117957.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1304283.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,...,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0
50%,1490609.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
75%,1676935.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
max,1863261.0,16.0,17.0,16.0,17.0,17.0,17.0,16.0,16.0,16.0,...,16.0,17.0,17.0,16.0,16.0,17.0,19.0,22.0,16.0,16.0


In [29]:
original = pd.read_csv('./data/original.csv')

In [30]:
original.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,3,8,6,6,4,4,6,2,3,2,...,10,7,4,2,3,4,3,2,6,0.45
1,8,4,5,7,7,9,1,5,5,4,...,9,2,6,2,1,1,9,1,3,0.475
2,3,10,4,1,7,5,4,7,4,9,...,7,4,4,8,6,1,8,3,6,0.515
3,4,4,2,7,3,4,1,4,6,4,...,4,2,6,6,8,8,6,6,10,0.52
4,3,7,5,2,5,8,5,2,7,5,...,7,6,5,3,3,4,4,3,4,0.475


In [31]:
original.shape

(50000, 21)

In [32]:
original.isnull().sum()

MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64

## **FUNCTIONS**

In [38]:
def printInfo(df):
    print(f'{Style.BRIGHT}{Fore.YELLOW}SHAPE{Style.RESET_ALL}')
    print(f'{Style.BRIGHT}{Fore.GREEN} train: {train.shape}')
    print(f'{Style.BRIGHT}{Fore.GREEN} test:  {test.shape}')
    print(f'{Style.BRIGHT}{Fore.GREEN} original:  {df.shape}')
    print(f'{Style.BRIGHT}{Fore.YELLOW}\nNULL VALUES{Style.RESET_ALL}')
    print(f'{Style.BRIGHT}{Fore.GREEN} train: {train.isnull().any().any()}')
    print(f'{Style.BRIGHT}{Fore.GREEN} train: {test.isnull().any().any()}')
    print(f'{Style.BRIGHT}{Fore.GREEN} original: {df.isnull().any().any()}')    
    print(f'{Style.BRIGHT}{Fore.YELLOW}\nDUPLICATES{Style.RESET_ALL}')
    print(f'{Style.BRIGHT}{Fore.GREEN} train: {train.duplicated().any().any()}')
    print(f'{Style.BRIGHT}{Fore.GREEN} train: {test.duplicated().any().any()}')
    print(f'{Style.BRIGHT}{Fore.GREEN} original: {df.duplicated().any().any()}')

In [40]:
printInfo(original)

[1m[33mSHAPE[0m
[1m[32m train: (1117957, 22)
[1m[32m test:  (745305, 21)
[1m[32m original:  (50000, 21)
[1m[33m
NULL VALUES[0m
[1m[32m train: False
[1m[32m train: False
[1m[32m original: False
[1m[33m
DUPLICATES[0m
[1m[32m train: False
[1m[32m train: False
[1m[32m original: False


In [41]:
def Statistic(df: pd.DataFrame(), categoric = False):
    num_cols = list(df._get_numeric_data())
    cat_cols = list(df.drop(num_cols,axis=1))
    if categoric:
        desc = pd.DataFrame(index = list(df[cat_cols]))
        df = df[cat_cols]
    else:
        desc = pd.DataFrame(index = list(df[num_cols]))
        df = df[num_cols]
        desc['skew'] = df[num_cols].skew()
        
    desc['type'] = df.dtypes
    desc['count'] = df.count()
    desc['nunique'] = df.nunique()
    desc['%unique'] = desc['nunique'] /len(df) * 100 
    desc['null'] = df.isnull().sum()
    desc['%null'] = desc['null'] / len(df) * 100
    desc = pd.concat([desc,df.describe().T.drop('count',axis=1)],axis=1)    

    desc = desc.round(2)
    return desc.reset_index().rename(columns={'index':'Column'}).sort_values(by=['type'])

In [42]:
Statistic(original)

Unnamed: 0,Column,skew,type,count,nunique,%unique,null,%null,mean,std,min,25%,50%,75%,max
0,MonsoonIntensity,0.44,int64,50000,17,0.03,0,0.0,4.99,2.24,0.0,3.0,5.0,6.0,16.0
18,InadequatePlanning,0.46,int64,50000,17,0.03,0,0.0,4.99,2.23,0.0,3.0,5.0,6.0,16.0
17,WetlandLoss,0.44,int64,50000,19,0.04,0,0.0,5.01,2.23,0.0,3.0,5.0,6.0,22.0
16,PopulationScore,0.46,int64,50000,18,0.04,0,0.0,4.98,2.24,0.0,3.0,5.0,6.0,19.0
15,DeterioratingInfrastructure,0.45,int64,50000,18,0.04,0,0.0,4.99,2.23,0.0,3.0,5.0,6.0,17.0
14,Watersheds,0.47,int64,50000,17,0.03,0,0.0,4.98,2.23,0.0,3.0,5.0,6.0,16.0
13,Landslides,0.43,int64,50000,17,0.03,0,0.0,4.98,2.23,0.0,3.0,5.0,6.0,16.0
12,CoastalVulnerability,0.46,int64,50000,18,0.04,0,0.0,5.0,2.25,0.0,3.0,5.0,6.0,17.0
11,DrainageSystems,0.46,int64,50000,18,0.04,0,0.0,5.01,2.24,0.0,3.0,5.0,6.0,17.0
19,PoliticalFactors,0.45,int64,50000,17,0.03,0,0.0,4.99,2.25,0.0,3.0,5.0,6.0,16.0


In [46]:
import pandas as pd

def min_max_unique(data_train, data_test):
    df = pd.DataFrame(index=data_train.columns)
    summary = {}
    
    for col in data_train.columns:
        if col in data_train and col in data_test:  # Check if column exists in both dataframes
            if pd.api.types.is_numeric_dtype(data_train[col]):  
                min_train = min(data_train[col])
                min_test = min(data_test[col])
                max_train = max(data_train[col])
                max_test = max(data_test[col])
                unique_train = len(data_train[col].unique())
                unique_test = len(data_test[col].unique())
                top5_train = sorted(data_train[col])[:5]
                top5_test = sorted(data_test[col])[:5]
            else:  
                min_train = min_test = max_train = max_test = None
                unique_train = len(data_train[col].unique())
                unique_test = len(data_test[col].unique())
                top5_train = top5_test = None
            summary[col] = [min_train, min_test, max_train, max_test, 
                            unique_train, unique_test]
        else:
            print(f"Column '{col}' not found in both data_train and data_test.")

    df = pd.DataFrame.from_dict(summary, orient='index', columns=['min_train', 'min_test', 'max_train', 'max_test', 
                                                                  'unique_train', 'unique_test'])\
        .reset_index().rename(columns={'index': 'columns'})
    return df


In [48]:
TARGET = 'FloodProbability'


In [56]:
def Number_of_columns(df):
    NUMERIC_COLS = [f for f in df._get_numeric_data() if f not in TARGET]
    CAT_COLS = list(df.drop(NUMERIC_COLS,axis=1))
    print(f'Numerical cols: {len(NUMERIC_COLS)}')
    print(f'Categorical cols: {len(CAT_COLS)}')

In [57]:
Number_of_columns(original)
Number_of_columns(train)
Number_of_columns(test)

Numerical cols: 20
Categorical cols: 1
Numerical cols: 21
Categorical cols: 1
Numerical cols: 21
Categorical cols: 0


In [58]:
min_max_unique(data_train=train, data_test=test)

Column 'FloodProbability' not found in both data_train and data_test.


Unnamed: 0,columns,min_train,min_test,max_train,max_test,unique_train,unique_test
0,id,0,1117957,1117956,1863261,1117957,745305
1,MonsoonIntensity,0,0,16,16,17,17
2,TopographyDrainage,0,0,18,17,19,18
3,RiverManagement,0,0,16,16,17,17
4,Deforestation,0,0,17,17,18,18
5,Urbanization,0,0,17,17,18,18
6,ClimateChange,0,0,17,17,18,18
7,DamsQuality,0,0,16,16,17,17
8,Siltation,0,0,16,16,17,17
9,AgriculturalPractices,0,0,16,16,17,17


In [59]:
min_max_unique(train.drop(TARGET,axis=1),test)

Unnamed: 0,columns,min_train,min_test,max_train,max_test,unique_train,unique_test
0,id,0,1117957,1117956,1863261,1117957,745305
1,MonsoonIntensity,0,0,16,16,17,17
2,TopographyDrainage,0,0,18,17,19,18
3,RiverManagement,0,0,16,16,17,17
4,Deforestation,0,0,17,17,18,18
5,Urbanization,0,0,17,17,18,18
6,ClimateChange,0,0,17,17,18,18
7,DamsQuality,0,0,16,16,17,17
8,Siltation,0,0,16,16,17,17
9,AgriculturalPractices,0,0,16,16,17,17
