# Regression with a Flood Prediction Dataset

## Exploring datasets

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [2]:
!kaggle competitions download -c playground-series-s4e5

Downloading playground-series-s4e5.zip to f:\KaggleCompetition




  0%|          | 0.00/28.0M [00:00<?, ?B/s]
  4%|▎         | 1.00M/28.0M [00:00<00:25, 1.10MB/s]
  7%|▋         | 2.00M/28.0M [00:01<00:16, 1.70MB/s]
 11%|█         | 3.00M/28.0M [00:01<00:10, 2.43MB/s]
 14%|█▍        | 4.00M/28.0M [00:01<00:10, 2.49MB/s]
 18%|█▊        | 5.00M/28.0M [00:02<00:09, 2.59MB/s]
 21%|██▏       | 6.00M/28.0M [00:02<00:08, 2.64MB/s]
 25%|██▌       | 7.00M/28.0M [00:03<00:07, 2.80MB/s]
 29%|██▊       | 8.00M/28.0M [00:03<00:07, 2.85MB/s]
 32%|███▏      | 9.00M/28.0M [00:03<00:06, 2.88MB/s]
 36%|███▌      | 10.0M/28.0M [00:04<00:06, 2.78MB/s]
 39%|███▉      | 11.0M/28.0M [00:04<00:06, 2.84MB/s]
 43%|████▎     | 12.0M/28.0M [00:04<00:06, 2.65MB/s]
 47%|████▋     | 13.0M/28.0M [00:05<00:06, 2.38MB/s]
 50%|█████     | 14.0M/28.0M [00:06<00:07, 1.93MB/s]
 54%|█████▎    | 15.0M/28.0M [00:06<00:07, 1.92MB/s]
 57%|█████▋    | 16.0M/28.0M [00:07<00:06, 1.93MB/s]
 61%|██████    | 17.0M/28.0M [00:07<00:05, 1.93MB/s]
 64%|██████▍   | 18.0M/28.0M [00:08<00:05, 1.95MB/s]
 

In [3]:
from zipfile import ZipFile
with ZipFile('playground-series-s4e5.zip') as f:
    f.extractall('data')

In [4]:
import pandas as pd

In [5]:
raw_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sub_df = pd.read_csv('data/sample_submission.csv')

In [6]:
raw_df.drop(columns='id', inplace=True)

In [7]:
raw_df

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,5,8,5,8,6,4,4,3,3,4,...,5,3,3,5,4,7,5,7,3,0.445
1,6,7,4,4,8,8,3,5,4,6,...,7,2,0,3,5,3,3,4,3,0.450
2,6,5,6,7,3,7,1,5,4,5,...,7,3,7,5,6,8,2,3,3,0.530
3,3,4,6,5,4,8,4,7,6,8,...,2,4,7,4,4,6,5,7,5,0.535
4,5,3,2,6,4,4,3,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117952,3,3,4,10,4,5,5,7,10,4,...,7,8,7,2,2,1,4,6,4,0.495
1117953,2,2,4,3,9,5,8,1,3,5,...,9,4,4,3,7,4,9,4,5,0.480
1117954,7,3,9,4,6,5,9,1,3,4,...,5,5,5,5,6,5,5,2,4,0.485
1117955,7,3,3,7,5,2,3,4,6,4,...,6,8,5,3,4,6,7,6,4,0.495


In [8]:
raw_df.columns.tolist()

['MonsoonIntensity',
 'TopographyDrainage',
 'RiverManagement',
 'Deforestation',
 'Urbanization',
 'ClimateChange',
 'DamsQuality',
 'Siltation',
 'AgriculturalPractices',
 'Encroachments',
 'IneffectiveDisasterPreparedness',
 'DrainageSystems',
 'CoastalVulnerability',
 'Landslides',
 'Watersheds',
 'DeterioratingInfrastructure',
 'PopulationScore',
 'WetlandLoss',
 'InadequatePlanning',
 'PoliticalFactors',
 'FloodProbability']

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [10]:
def draw_numeric_features(data, x=None, y=None, overlap=None):
    plt.figure(figsize=(30,10))
    plt.grid()
    sns.lineplot(data, x=x, y=y)
    if overlap is not None:
        for plot in overlap:
            sns.lineplot(data=plot[0], x=plot[1], y=plot[2])

In [11]:
def view_avg_of_each(cat_features: str):
    d = dict()
    cols = raw_df[cat_features].unique()
    for x in cols:
        d[x] = raw_df[raw_df[cat_features]==x]['FloodProbability'].mean()
    return pd.Series(d, index=cols)

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [13]:
original_cols = raw_df.columns.tolist()[:-1]
len(original_cols)

20

In [15]:
for col in original_cols:
    print(col, raw_df['FloodProbability'].corr(raw_df[col]))

MonsoonIntensity 0.18909829231500538
TopographyDrainage 0.18763504566957082
RiverManagement 0.18713064507266175
Deforestation 0.18400091625151702
Urbanization 0.1808605073024371
ClimateChange 0.18476149035819409
DamsQuality 0.187996149143944
Siltation 0.18678927218265232
AgriculturalPractices 0.1833656526594231
Encroachments 0.17884076840019714
IneffectiveDisasterPreparedness 0.18310869818609768
DrainageSystems 0.17930491217347258
CoastalVulnerability 0.17777438840981968
Landslides 0.185346063879017
Watersheds 0.18190650150828644
DeterioratingInfrastructure 0.19000695837002968
PopulationScore 0.1858896982732984
WetlandLoss 0.18339641617755228
InadequatePlanning 0.18096767964551685
PoliticalFactors 0.1824169447328339


In [16]:
import numpy as np

In [17]:
for df in [raw_df, test_df]:
    df['full_sum'] = df[original_cols].sum(axis=1)

In [18]:
raw_df['FloodProbability'].corr(raw_df['full_sum'])

0.91920481123786

In [20]:
full_sum_mean = raw_df.groupby('full_sum')['FloodProbability'].mean()
full_sum_mean

full_sum
63     0.320000
64     0.319167
65     0.318958
66     0.322000
67     0.324889
         ...   
140    0.712222
141    0.712500
142    0.708333
143    0.723125
144    0.705000
Name: FloodProbability, Length: 82, dtype: float64

In [21]:
for df in [raw_df, test_df]:
    df['full_sum_mean'] = df['full_sum'].apply(lambda x: full_sum_mean.get(x))

In [22]:
raw_df['FloodProbability'].corr(raw_df['full_sum_mean'])

0.9304820103253921

In [23]:
full_sum_std = raw_df.groupby('full_sum')['FloodProbability'].std()
full_sum_std

full_sum
63          NaN
64     0.001946
65     0.002052
66     0.004830
67     0.003452
         ...   
140    0.009052
141    0.011650
142    0.007638
143    0.005303
144    0.007071
Name: FloodProbability, Length: 82, dtype: float64

In [26]:
full_sum_std[63] = 0
full_sum_std

full_sum
63     0.000000
64     0.001946
65     0.002052
66     0.004830
67     0.003452
         ...   
140    0.009052
141    0.011650
142    0.007638
143    0.005303
144    0.007071
Name: FloodProbability, Length: 82, dtype: float64

In [27]:
for df in [raw_df, test_df]:
    df['full_sum_std'] = df['full_sum'].apply(lambda x: full_sum_std.get(x))

In [28]:
raw_df['FloodProbability'].corr(raw_df['full_sum_std'])

-0.23607287351267003

In [29]:
for df in [raw_df, test_df]:
    df['full_max'] = raw_df[original_cols].max(axis=1)
raw_df['FloodProbability'].corr(raw_df['full_max'])

0.314863684049087

In [30]:
full_max_mean = raw_df.groupby('full_max')['FloodProbability'].mean()
for df in [raw_df, test_df]:
    df['full_max_mean'] = df['full_max'].apply(lambda x: full_max_mean.get(x))
raw_df['FloodProbability'].corr(raw_df['full_max_mean'])

0.37499908814329386

In [32]:
sorted_features = [f'sort_{i}' for i in np.arange(len(original_cols))]
for df in [raw_df, test_df]:
    df[sorted_features] = np.sort(df[original_cols], axis=1)

In [33]:
for f in sorted_features:
    print(f, raw_df['FloodProbability'].corr(raw_df[f]))

sort_0 0.34461680103634
sort_1 0.43350913520694995
sort_2 0.4901834680146479
sort_3 0.5310618254056313
sort_4 0.5542028636268266
sort_5 0.5684107939406092
sort_6 0.5912959086233475
sort_7 0.6219988983466883
sort_8 0.632270767511943
sort_9 0.6252090644864967
sort_10 0.6285458133617731
sort_11 0.6410575032526965
sort_12 0.636575479952956
sort_13 0.6183837779318923
sort_14 0.6016824843193823
sort_15 0.5823510303063687
sort_16 0.5504919506917932
sort_17 0.5081958001832132
sort_18 0.44986269715987964
sort_19 0.314863684049087


## Pipeline Preprocessing

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

In [62]:
numeric_cols = sorted_features + ['full_sum','full_sum_mean','full_sum_std','full_max','full_max_mean']
target_col = 'FloodProbability'

In [70]:
raw_df[numeric_cols]

Unnamed: 0,sort_0,sort_1,sort_2,sort_3,sort_4,sort_5,sort_6,sort_7,sort_8,sort_9,...,sort_15,sort_16,sort_17,sort_18,sort_19,full_sum,full_sum_mean,full_sum_std,full_max,full_max_mean
0,2,3,3,3,3,3,4,4,4,4,...,6,7,7,8,8,94,0.475482,0.028500,8,0.488569
1,0,2,3,3,3,3,3,4,4,4,...,7,7,8,8,9,94,0.475482,0.028500,9,0.506598
2,1,2,3,3,3,3,4,5,5,5,...,7,7,7,7,8,99,0.507852,0.014107,8,0.488569
3,2,3,4,4,4,4,4,4,5,5,...,7,7,7,8,8,104,0.536978,0.015852,8,0.488569
4,1,2,2,2,2,3,3,3,3,3,...,5,5,6,6,6,72,0.409094,0.019381,6,0.396608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117952,1,2,2,3,3,3,4,4,4,4,...,7,7,8,10,10,99,0.507852,0.014107,10,0.516994
1117953,1,2,2,3,3,3,4,4,4,4,...,7,8,9,9,9,96,0.484220,0.014223,9,0.506598
1117954,1,2,3,3,4,4,4,5,5,5,...,6,6,7,9,9,98,0.494022,0.016838,9,0.506598
1117955,2,3,3,3,3,4,4,4,4,5,...,6,7,7,7,8,99,0.507852,0.014107,8,0.488569


In [63]:
len(numeric_cols)

25

In [53]:
num_pre_process_pl = Pipeline(
    steps=[
        ('scaler', MinMaxScaler())
    ]
)

In [71]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_pre_process_pl, numeric_cols)
    ]
)

In [49]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(raw_df, test_size=0.25, random_state=42)

In [64]:
train_inputs = train_df[numeric_cols]
val_inputs = val_df[numeric_cols]
test_inputs = test_df[numeric_cols]

train_targets = train_df[target_col]
val_targets = val_df[target_col]

## Complete Pipeline with Model

In [59]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [56]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [65]:
regressor = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestClassifier': RandomForestRegressor(n_jobs=-1),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor(n_jobs=-1),
    'LGBMRegressor': LGBMRegressor(n_jobs=-1)
}
result = {'Reg': [], 'R2': []}

In [72]:
for reg_name, reg in regressor.items():
    pl = make_pipeline(preprocessor, reg)
    pl.fit(train_inputs, train_targets)
    val_preds = pl.predict(val_inputs)
    r2 = r2_score(val_targets, val_preds)
    result['Reg'].append(reg_name)
    result['R2'].append(r2)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,Reg,R2
0,LinearRegression,0.866138
1,Ridge,0.866136
2,DecisionTreeRegressor,0.80389
3,RandomForestClassifier,0.847527
4,GradientBoostingRegressor,0.867643
5,XGBRegressor,0.869039
6,LGBMRegressor,0.869018


In [73]:
def test_params(**params):
    pl = make_pipeline(preprocessor, XGBRegressor(**params))
    pl.fit(train_inputs, train_targets)
    train_preds = pl.predict(train_inputs)
    val_preds = pl.predict(val_inputs)
    train_r2 = r2_score(train_targets, train_preds)
    val_r2 = r2_score(val_targets, val_preds)
    return train_r2, val_r2

In [74]:
test_params(n_jobs=-1)

(0.8707252510172125, 0.8690385596704092)

In [75]:
for i in [None,3,6,9,12,14]:
    print(test_params(n_jobs=-1, max_depth=i))

(0.8707252510172125, 0.8690385596704092)
(0.8687542464409878, 0.8686172364757945)
(0.8707252510172125, 0.8690385596704092)
(0.8756524037962613, 0.8679844578224826)
(0.8885782948172484, 0.8640758611331377)
(0.9017835473430026, 0.8587516305975357)


In [77]:
for i in [50,70,90,100]:
    print(test_params(n_jobs=-1, n_estimators=i))

(0.8701103379321116, 0.8690881287373866)
(0.8703769382641575, 0.8690600674450489)
(0.8706285511550456, 0.8690550134651867)
(0.8707252510172125, 0.8690385596704092)


In [78]:
for i in [0.05,0.1,0.15,0.2,0.3]:
    print(test_params(n_jobs=-1, n_estimators=60,
                      learning_rate=i))

(0.8664912861619115, 0.8663804683008098)
(0.8691568532961609, 0.8688144554225492)
(0.8696994594646364, 0.8690625228046429)
(0.8698963497968445, 0.8690978844065467)
(0.8702399791966462, 0.8690629375541232)


In [79]:
best_xgb_pl = make_pipeline(
    preprocessor, 
    XGBRegressor(n_jobs=-1, n_estimators=60, learning_rate=0.15)
)

In [81]:
best_xgb_pl.fit(train_inputs, train_targets)

In [82]:
val_preds = best_xgb_pl.predict(val_inputs)
r2_score(val_targets, val_preds)

0.8690625228046429

In [83]:
test_preds = best_xgb_pl.predict(test_inputs)
test_preds

array([0.57803506, 0.45390484, 0.45012468, ..., 0.62117004, 0.5488737 ,
       0.5303294 ], dtype=float32)

In [84]:
sub_df['FloodProbability'] = test_preds
sub_df.to_csv('data/sub1.csv', index=None)