In [28]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
import pickle
import csv
from tqdm import tqdm

import gc

gc.collect()


2200

In [29]:
test_pd = pd.read_csv("data/test.csv")
train_pd = pd.read_csv("data/train.csv")

train_pd = train_pd.drop(columns=["id"])

train_factors = train_pd.drop(columns=["FloodProbability"])
train_flood_prob = train_pd["FloodProbability"]

In [30]:
var_coors = train_factors.corr()

mask = np.zeros_like(var_coors, dtype=bool)
mask[np.triu_indices_from(mask)] = True

df_corr_viz = var_coors.mask(mask)
fig = px.imshow(df_corr_viz, text_auto=True)

fig.update_layout(height=1000, width=1800, title_text="Heatmap of variable correlations")
fig.show()

In [31]:
train_pd.corr()["FloodProbability"].sort_values()

CoastalVulnerability               0.177774
Encroachments                      0.178841
DrainageSystems                    0.179305
Urbanization                       0.180861
InadequatePlanning                 0.180968
Watersheds                         0.181907
PoliticalFactors                   0.182417
IneffectiveDisasterPreparedness    0.183109
AgriculturalPractices              0.183366
WetlandLoss                        0.183396
Deforestation                      0.184001
ClimateChange                      0.184761
Landslides                         0.185346
PopulationScore                    0.185890
Siltation                          0.186789
RiverManagement                    0.187131
TopographyDrainage                 0.187635
DamsQuality                        0.187996
MonsoonIntensity                   0.189098
DeterioratingInfrastructure        0.190007
FloodProbability                   1.000000
Name: FloodProbability, dtype: float64

In [32]:
num_clusters = np.arange(10, 20, 1)

def kmeans_models(num_clusters, x_vals):
    kms_models = []

    for num in tqdm(num_clusters):
        kms = KMeans(n_clusters=num, n_init="auto", random_state=0, verbose=True)
        kms.fit(x_vals)
        kms_models.append(kms)

    return kms_models

kms_models = kmeans_models(num_clusters, train_factors)

  0%|          | 0/10 [00:00<?, ?it/s]

Initialization complete
Iteration 0, inertia 106607807.0.
Iteration 1, inertia 83565361.15076095.
Iteration 2, inertia 83226356.33686036.
Iteration 3, inertia 82994098.98064014.
Iteration 4, inertia 82789344.13869834.
Iteration 5, inertia 82603623.58193538.
Iteration 6, inertia 82436287.97304167.
Iteration 7, inertia 82283387.25131871.
Iteration 8, inertia 82140841.60998447.
Iteration 9, inertia 82003203.92820272.
Iteration 10, inertia 81869737.61602487.
Iteration 11, inertia 81742648.115749.
Iteration 12, inertia 81623486.93169972.
Iteration 13, inertia 81516005.78382362.
Iteration 14, inertia 81420390.17424251.
Iteration 15, inertia 81337194.68011585.
Iteration 16, inertia 81265700.2311503.
Iteration 17, inertia 81205727.60369273.
Iteration 18, inertia 81156786.42014466.
Iteration 19, inertia 81118639.82688114.
Iteration 20, inertia 81091969.91980386.
Iteration 21, inertia 81074744.43237863.
Iteration 22, inertia 81063040.0026153.
Iteration 23, inertia 81054799.94726726.
Iteration 24

 10%|█         | 1/10 [00:01<00:17,  1.98s/it]

Initialization complete
Iteration 0, inertia 103011814.0.
Iteration 1, inertia 82993065.90920784.
Iteration 2, inertia 82535533.33016083.
Iteration 3, inertia 82272816.0446698.
Iteration 4, inertia 82069601.92522492.
Iteration 5, inertia 81894500.36410598.
Iteration 6, inertia 81732779.4396768.
Iteration 7, inertia 81576333.5024443.
Iteration 8, inertia 81419602.99558163.
Iteration 9, inertia 81260257.80106023.
Iteration 10, inertia 81099951.87903036.
Iteration 11, inertia 80941011.9298828.
Iteration 12, inertia 80790839.57929187.
Iteration 13, inertia 80656180.48355748.
Iteration 14, inertia 80540239.1588392.
Iteration 15, inertia 80444548.24315174.
Iteration 16, inertia 80368487.55781701.
Iteration 17, inertia 80312494.86621433.
Iteration 18, inertia 80272186.68330352.
Iteration 19, inertia 80241605.23571664.
Iteration 20, inertia 80216959.34158832.
Iteration 21, inertia 80195450.80128208.
Iteration 22, inertia 80176377.85307312.
Iteration 23, inertia 80159678.89660464.
Iteration 24,

 20%|██        | 2/10 [00:03<00:15,  1.98s/it]

Initialization complete
Iteration 0, inertia 101592695.0.
Iteration 1, inertia 82287994.86457363.
Iteration 2, inertia 81809500.56159817.
Iteration 3, inertia 81502714.80458987.
Iteration 4, inertia 81263082.52416977.
Iteration 5, inertia 81062955.29413095.
Iteration 6, inertia 80889753.9224478.
Iteration 7, inertia 80731917.500254.
Iteration 8, inertia 80579265.10492367.
Iteration 9, inertia 80427052.92213847.
Iteration 10, inertia 80273201.05828637.
Iteration 11, inertia 80125981.13903932.
Iteration 12, inertia 79992543.75255182.
Iteration 13, inertia 79876963.94606178.
Iteration 14, inertia 79780298.61814307.
Iteration 15, inertia 79701422.52867797.
Iteration 16, inertia 79636324.63813075.
Iteration 17, inertia 79581922.81485231.
Iteration 18, inertia 79535668.66439606.
Iteration 19, inertia 79496913.31587516.
Iteration 20, inertia 79464944.33446772.
Iteration 21, inertia 79437052.34486951.
Iteration 22, inertia 79413329.08018842.
Iteration 23, inertia 79394113.9099024.
Iteration 24

 30%|███       | 3/10 [00:06<00:14,  2.06s/it]

Iteration 63, inertia 79180522.57310301.
Converged at iteration 63: center shift 0.00042286728060318354 within tolerance 0.000430479748744507.
Initialization complete
Iteration 0, inertia 100658730.0.
Iteration 1, inertia 81597990.81626752.
Iteration 2, inertia 81066158.19814284.
Iteration 3, inertia 80728322.91577446.
Iteration 4, inertia 80469494.07122704.
Iteration 5, inertia 80260779.9967217.
Iteration 6, inertia 80087215.44055554.
Iteration 7, inertia 79936195.70399004.
Iteration 8, inertia 79794835.30067325.
Iteration 9, inertia 79657171.92152715.
Iteration 10, inertia 79521195.21929197.
Iteration 11, inertia 79385361.72308785.
Iteration 12, inertia 79250843.66071403.
Iteration 13, inertia 79116327.08360082.
Iteration 14, inertia 78979544.68795383.
Iteration 15, inertia 78845231.26570551.
Iteration 16, inertia 78724192.26893434.
Iteration 17, inertia 78624341.00518616.
Iteration 18, inertia 78549311.68049996.
Iteration 19, inertia 78497908.35523933.
Iteration 20, inertia 78463734

 40%|████      | 4/10 [00:08<00:12,  2.08s/it]

Iteration 51, inertia 78406090.98771355.
Iteration 52, inertia 78405988.77998367.
Iteration 53, inertia 78405899.13615045.
Iteration 54, inertia 78405825.2671353.
Converged at iteration 54: center shift 0.00040226780317925416 within tolerance 0.000430479748744507.
Initialization complete
Iteration 0, inertia 99786318.0.
Iteration 1, inertia 81076403.86342658.
Iteration 2, inertia 80483502.32746905.
Iteration 3, inertia 80096555.93747146.
Iteration 4, inertia 79796987.9862129.
Iteration 5, inertia 79557855.80076668.
Iteration 6, inertia 79365191.48233458.
Iteration 7, inertia 79203495.30670866.
Iteration 8, inertia 79059636.35349905.
Iteration 9, inertia 78924340.12858918.
Iteration 10, inertia 78795893.48233235.
Iteration 11, inertia 78675405.4699726.
Iteration 12, inertia 78561947.67867763.
Iteration 13, inertia 78454074.23087807.
Iteration 14, inertia 78347812.36805612.
Iteration 15, inertia 78241351.16740298.
Iteration 16, inertia 78133687.70333643.
Iteration 17, inertia 78026360.45

 50%|█████     | 5/10 [00:10<00:10,  2.19s/it]

Initialization complete
Iteration 0, inertia 98924591.0.
Iteration 1, inertia 80608952.88028055.
Iteration 2, inertia 80017536.38422425.
Iteration 3, inertia 79639643.93226318.
Iteration 4, inertia 79349722.38509995.
Iteration 5, inertia 79117563.19566433.
Iteration 6, inertia 78929129.75001934.
Iteration 7, inertia 78772307.40555985.
Iteration 8, inertia 78636298.28825483.
Iteration 9, inertia 78512049.53983247.
Iteration 10, inertia 78392113.52061139.
Iteration 11, inertia 78266793.61864844.
Iteration 12, inertia 78129369.5485476.
Iteration 13, inertia 77978625.61978099.
Iteration 14, inertia 77825032.80847028.
Iteration 15, inertia 77684788.90771377.
Iteration 16, inertia 77571171.97631273.
Iteration 17, inertia 77485428.71566671.
Iteration 18, inertia 77424020.2521514.
Iteration 19, inertia 77380617.67075641.
Iteration 20, inertia 77345962.7892378.
Iteration 21, inertia 77308527.53009665.
Iteration 22, inertia 77261026.96406838.
Iteration 23, inertia 77201409.32705441.
Iteration 24

 60%|██████    | 6/10 [00:13<00:09,  2.42s/it]

Iteration 61, inertia 76960709.98495059.
Iteration 62, inertia 76960641.34953128.
Converged at iteration 62: center shift 0.0004224360710946537 within tolerance 0.000430479748744507.
Initialization complete
Iteration 0, inertia 98369399.0.
Iteration 1, inertia 80197058.28835374.
Iteration 2, inertia 79570636.86365676.
Iteration 3, inertia 79163342.8397269.
Iteration 4, inertia 78848718.02681872.
Iteration 5, inertia 78597903.41498812.
Iteration 6, inertia 78389552.71735615.
Iteration 7, inertia 78209892.96788548.
Iteration 8, inertia 78044932.55935575.
Iteration 9, inertia 77885800.87156038.
Iteration 10, inertia 77728648.91650358.
Iteration 11, inertia 77573102.32741022.
Iteration 12, inertia 77420292.23269631.
Iteration 13, inertia 77274331.19550155.
Iteration 14, inertia 77141336.25333793.
Iteration 15, inertia 77028259.56096198.
Iteration 16, inertia 76939899.66010463.
Iteration 17, inertia 76876692.25445172.
Iteration 18, inertia 76836330.03418486.
Iteration 19, inertia 76810498.5

 70%|███████   | 7/10 [00:16<00:07,  2.61s/it]

Iteration 69, inertia 76275508.33124769.
Iteration 70, inertia 76275449.17048188.
Iteration 71, inertia 76275386.81521939.
Iteration 72, inertia 76275327.37472034.
Converged at iteration 72: center shift 0.00042732243928272093 within tolerance 0.000430479748744507.
Initialization complete
Iteration 0, inertia 97346893.0.
Iteration 1, inertia 79723980.91481237.
Iteration 2, inertia 79086266.73050533.
Iteration 3, inertia 78650763.8676265.
Iteration 4, inertia 78297935.22139534.
Iteration 5, inertia 78009975.92617911.
Iteration 6, inertia 77777020.54828438.
Iteration 7, inertia 77589577.52583106.
Iteration 8, inertia 77435243.36822376.
Iteration 9, inertia 77298901.50609645.
Iteration 10, inertia 77169913.27776237.
Iteration 11, inertia 77036068.53505874.
Iteration 12, inertia 76891897.70796205.
Iteration 13, inertia 76739268.61991274.
Iteration 14, inertia 76583734.04146457.
Iteration 15, inertia 76433681.81623511.
Iteration 16, inertia 76298719.27139011.
Iteration 17, inertia 76184090.

 80%|████████  | 8/10 [00:19<00:05,  2.65s/it]

Initialization complete
Iteration 0, inertia 95877678.0.
Iteration 1, inertia 79361180.89061473.
Iteration 2, inertia 78688124.55128185.
Iteration 3, inertia 78212487.3747187.
Iteration 4, inertia 77812514.04255952.
Iteration 5, inertia 77474011.05473635.
Iteration 6, inertia 77189808.51170059.
Iteration 7, inertia 76950283.24647921.
Iteration 8, inertia 76741158.8971562.
Iteration 9, inertia 76548222.75787413.
Iteration 10, inertia 76361926.53742072.
Iteration 11, inertia 76176457.68067585.
Iteration 12, inertia 75986495.76761937.
Iteration 13, inertia 75793165.6958108.
Iteration 14, inertia 75604153.56086044.
Iteration 15, inertia 75430647.47396186.
Iteration 16, inertia 75288550.49332975.
Iteration 17, inertia 75192969.00356197.
Iteration 18, inertia 75142117.04741341.
Iteration 19, inertia 75120864.41138974.
Iteration 20, inertia 75112550.72711153.
Iteration 21, inertia 75108431.52308536.
Iteration 22, inertia 75105678.04575835.
Iteration 23, inertia 75103567.1823863.
Iteration 24,

 90%|█████████ | 9/10 [00:21<00:02,  2.61s/it]

Initialization complete
Iteration 0, inertia 95578391.0.
Iteration 1, inertia 78991674.23007008.
Iteration 2, inertia 78298456.21589334.
Iteration 3, inertia 77798896.47096857.
Iteration 4, inertia 77359434.27794899.
Iteration 5, inertia 76978621.43456964.
Iteration 6, inertia 76657382.29085873.
Iteration 7, inertia 76390001.95215394.
Iteration 8, inertia 76169904.00862499.
Iteration 9, inertia 75986325.66539796.
Iteration 10, inertia 75831978.28310168.
Iteration 11, inertia 75687586.24892844.
Iteration 12, inertia 75537680.65556885.
Iteration 13, inertia 75371467.47345339.
Iteration 14, inertia 75194612.54486404.
Iteration 15, inertia 75019302.9190147.
Iteration 16, inertia 74859409.54226732.
Iteration 17, inertia 74729949.0948384.
Iteration 18, inertia 74640126.25851943.
Iteration 19, inertia 74590319.05325969.
Iteration 20, inertia 74569513.76781821.
Iteration 21, inertia 74561419.8799971.
Iteration 22, inertia 74557561.85345732.
Iteration 23, inertia 74555163.67479183.
Iteration 24

100%|██████████| 10/10 [00:24<00:00,  2.44s/it]


In [33]:
for i, kms_model in enumerate(kms_models):
    train_factors[f"kms_{i+1}"] = kms_model.labels_

In [34]:
X_train, X_test, y_train, y_test = train_test_split(train_factors, train_flood_prob, test_size=0.2)

In [35]:
model = ElasticNetCV(
    cv=10,
    n_jobs=-1,
    verbose=True
)
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [36]:
train_preds = model.predict(X_train)
train_rmse = metrics.mean_squared_error(y_train, train_preds, squared=False)
print(f"Train RMSE: {train_rmse}")

Train RMSE: 0.02007822320486821


In [37]:
val_preds = model.predict(X_test)
val_rmse = metrics.mean_squared_error(y_test, val_preds, squared=False)
print(f"Validation RMSE: {val_rmse}")

Validation RMSE: 0.020145879460214476


In [38]:
out_pd = test_pd.copy(deep=True)
for i, kms_model in enumerate(kms_models):
    out_pd[f"kms_{i+1}"] = kms_model.predict(test_pd.drop(columns=["id"]))

out_pd = out_pd.join(
    pd.DataFrame({"FloodProbability": model.predict(out_pd.drop(columns=["id"]))})
)
out_pd.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,kms_2,kms_3,kms_4,kms_5,kms_6,kms_7,kms_8,kms_9,kms_10,FloodProbability
0,1117957,4,6,3,5,6,7,8,7,8,...,3,3,5,7,7,3,7,7,7,0.573442
1,1117958,4,4,2,9,5,5,4,7,5,...,8,8,8,8,8,8,8,8,8,0.455271
2,1117959,1,3,6,5,7,2,4,6,4,...,10,4,3,3,6,6,6,6,6,0.454781
3,1117960,2,4,4,6,4,5,4,3,4,...,3,3,8,8,6,6,6,6,6,0.466211
4,1117961,6,3,2,4,6,4,5,5,3,...,6,11,11,11,11,11,11,11,11,0.466144


In [39]:
out_pd.to_csv("simple_elasticnet_with_kmeans_v1.csv", columns=["id", "FloodProbability"], index=False)

# Random Forest

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [45]:
%%time 

rf_model = RandomForestRegressor(random_state=0)

# cv_params = {
#     "n_estimators" : np.arange(10, 11, 1),
#     "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
#     "max_depth" : np.append(np.arange(1, 11, 2), [None]),
#     # "min_samples_split" : np.append(np.arange(1,10), np.arange(0.1, 0.6, 0.1)),
#     # "min_samples_leaf" : np.append(np.arange(1,10), np.arange(0.1, 0.6, 0.1)),
#     "max_features" : ["sqrt", "log2", None],
# }

# scoring = ["neg_mean_squared_error", "neg_root_mean_squared_error", "explained_variance", "r2"]

# rf_cv = GridSearchCV(rf_model, cv_params, scoring=scoring, refit='neg_root_mean_squared_error', verbose=1, n_jobs=-1, cv=5).fit(X_train, y_train)
rf_model.fit(X_train, y_train)

CPU times: total: 21min 56s
Wall time: 22min 9s


In [47]:
out_pd = out_pd.join(
    pd.DataFrame({"FloodProbability_RF": rf_model.predict(out_pd.drop(columns=["id", "FloodProbability"]))})
)
out_pd.to_csv("rf_with_kmeans_v1.csv", columns=["id", "FloodProbability_RF"], index=False)