In [5]:
import datetime
from sklearn.base import clone
from sklearn.model_selection import KFold,train_test_split
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score
from colorama import Fore, Style
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, SplineTransformer, OneHotEncoder
from sklearn.linear_model import Ridge, LinearRegression
import matplotlib.pyplot as plt
import copy
from catboost import CatBoostClassifier,CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
from sklearn.linear_model import Ridge, LinearRegression

In [21]:
train = pd.read_csv('dataset/train.csv', index_col='id')
test = pd.read_csv('dataset/test.csv', index_col='id')
#train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv', index_col='id')
#test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv', index_col='id')

initial_features = list(train.columns)[:-1]

y = train["FloodProbability"].copy().astype(float)
train = train.drop(columns=["FloodProbability"])

for df in [train, test]:
    df['fsum'] = df[BASE_FEATURES].sum(axis=1) # for tree models
    df['special1'] = df['fsum'].isin(np.arange(72, 76)) # for linear models
    # df['total'] = df[BASE_FEATURES].sum(axis=1)
    df['mean'] = df[BASE_FEATURES].mean(axis=1)
    df['std'] = df[BASE_FEATURES].std(axis=1)
    df['max'] = df[BASE_FEATURES].max(axis=1)
    df['min'] = df[BASE_FEATURES].min(axis=1)
    df['median'] = df[BASE_FEATURES].median(axis=1)
    df['ptp'] = df[BASE_FEATURES].values.ptp(axis=1)
    df['q25'] = df[BASE_FEATURES].quantile(0.25, axis=1)
    df['q75'] = df[BASE_FEATURES].quantile(0.75, axis=1)
    
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']


    df['FloodVulnerabilityIndex'] = (df['AnthropogenicPressure'] + df['InfrastructureQuality'] +
                                     df['CoastalVulnerabilityTotal'] + df['PreventiveMeasuresEfficiency']) / 4
    
    df['PopulationDensityImpact'] = df['PopulationScore'] * (df['Urbanization'] + df['Encroachments'])
    
    #df['DeforestationUrbanizationRatio'] = df['Deforestation'] / df['Urbanization']
    
    df['AgriculturalEncroachmentImpact'] = df['AgriculturalPractices'] * df['Encroachments']
    
    df['DamDrainageInteraction'] = df['DamsQuality'] * df['DrainageSystems']
    
    df['LandslideSiltationInteraction'] = df['Landslides'] * df['Siltation']
    
    #df['WatershedWetlandRatio'] = df['Watersheds'] / df['WetlandLoss']
    
    df['PoliticalPreparednessInteraction'] = df['PoliticalFactors'] * df['IneffectiveDisasterPreparedness']
    
    
    df['TopographyDrainageSiltation'] = df['TopographyDrainage'] + df['Siltation']
    
    df['ClimateAnthropogenicInteraction'] = df['ClimateImpact'] * df['AnthropogenicPressure']
    
    df['InfrastructurePreventionInteraction'] = df['InfrastructureQuality'] * df['PreventiveMeasuresEfficiency']
    
    df['CoastalEcosystemInteraction'] = df['CoastalVulnerabilityTotal'] * df['EcosystemImpact']



sorted_features = [f"sort_{i}" for i in np.arange(len(initial_features))]
for df in [train, test]:
    df[sorted_features] = np.sort(df[initial_features], axis=1)

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, shuffle=True, random_state=42)

In [22]:
def get_summ_info(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values / len(df) * 100
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
#     summ['first value'] = df.loc[0].values
#     summ['second value'] = df.loc[1].values
#     summ['third value'] = df.loc[2].values
    return summ.style.background_gradient(cmap='Blues')

In [23]:
model=CatBoostRegressor()
model.fit(X_train,y_train)
sigma1 = model.predict(X_test)
print('r2_score',r2_score(y_test, sigma1))

Learning rate set to 0.119817
0:	learn: 0.0460336	total: 59ms	remaining: 58.9s
1:	learn: 0.0416460	total: 114ms	remaining: 56.9s
2:	learn: 0.0379109	total: 179ms	remaining: 59.6s
3:	learn: 0.0347011	total: 240ms	remaining: 59.7s
4:	learn: 0.0319459	total: 301ms	remaining: 60s
5:	learn: 0.0296447	total: 360ms	remaining: 59.6s
6:	learn: 0.0277026	total: 419ms	remaining: 59.5s
7:	learn: 0.0260728	total: 472ms	remaining: 58.5s
8:	learn: 0.0247176	total: 527ms	remaining: 58s
9:	learn: 0.0235930	total: 581ms	remaining: 57.5s
10:	learn: 0.0226765	total: 637ms	remaining: 57.3s
11:	learn: 0.0219322	total: 693ms	remaining: 57.1s
12:	learn: 0.0213285	total: 747ms	remaining: 56.7s
13:	learn: 0.0208317	total: 802ms	remaining: 56.5s
14:	learn: 0.0204299	total: 867ms	remaining: 57s
15:	learn: 0.0201057	total: 925ms	remaining: 56.9s
16:	learn: 0.0198483	total: 976ms	remaining: 56.4s
17:	learn: 0.0196423	total: 1.03s	remaining: 56.1s
18:	learn: 0.0194679	total: 1.08s	remaining: 55.8s
19:	learn: 0.01933

In [24]:
model=LGBMRegressor()
model.fit(X_train,y_train)
sigma2 = model.predict(X_test)
print('r2_score',r2_score(y_test, sigma2))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2399
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 67
[LightGBM] [Info] Start training from score 0.504480
r2_score 0.8686602420870676


In [25]:
model = XGBRFRegressor()
model.fit(X_train,y_train)
sigma3 = model.predict(X_test)
print('r2_score',r2_score(y_test, sigma3))

r2_score 0.8664075030293124


In [26]:
sigma = sigma1 * 0.75 + sigma3 * 0.15 + sigma3 * 0.1
print('r2_score',r2_score(y_test, sigma))

r2_score 0.8685858736946249


In [27]:
model=CatBoostRegressor()
model.fit(train,y)
sigma1 = model.predict(X_test)

model=LGBMRegressor()
model.fit(train,y)
sigma2 = model.predict(X_test)

model=XGBRFRegressor()
model.fit(train,y)
sigma3 = model.predict(X_test)

sigma = sigma1 * 0.8 + sigma3 * 0.1 + sigma3 * 0.1
print('r2_score',r2_score(y_test, sigma))

Learning rate set to 0.124117
0:	learn: 0.0458006	total: 77.2ms	remaining: 1m 17s
1:	learn: 0.0413317	total: 156ms	remaining: 1m 17s
2:	learn: 0.0374967	total: 228ms	remaining: 1m 15s
3:	learn: 0.0342218	total: 295ms	remaining: 1m 13s
4:	learn: 0.0314707	total: 366ms	remaining: 1m 12s
5:	learn: 0.0291484	total: 440ms	remaining: 1m 12s
6:	learn: 0.0272042	total: 514ms	remaining: 1m 12s
7:	learn: 0.0255964	total: 586ms	remaining: 1m 12s
8:	learn: 0.0242854	total: 654ms	remaining: 1m 12s
9:	learn: 0.0232070	total: 733ms	remaining: 1m 12s
10:	learn: 0.0223205	total: 798ms	remaining: 1m 11s
11:	learn: 0.0216220	total: 863ms	remaining: 1m 11s
12:	learn: 0.0210532	total: 930ms	remaining: 1m 10s
13:	learn: 0.0205889	total: 999ms	remaining: 1m 10s
14:	learn: 0.0202201	total: 1.07s	remaining: 1m 10s
15:	learn: 0.0199320	total: 1.14s	remaining: 1m 10s
16:	learn: 0.0197047	total: 1.2s	remaining: 1m 9s
17:	learn: 0.0195153	total: 1.27s	remaining: 1m 9s
18:	learn: 0.0193612	total: 1.33s	remaining: 1

In [28]:
sigma = model.predict(test)
sub = pd.Series(sigma.flatten(), index=test.index, name='FloodProbability')
filename = 'submission.csv'
sub.to_csv(filename)