# Urban Flood Risk

**Problem Statement:**  
Urban flooding poses significant risks to infrastructure, public safety, and economic stability. This project focuses on identifying flood-prone areas within cities, analyzing the underlying causes, and providing actionable insights to support urban planning and disaster management efforts.

**Description:**  
This project utilizes a synthetic dataset cataloging micro-areas (“segments”) across global cities to assess urban pluvial (rainfall-driven) flood risk. Each record represents a spatial segment with geographic coordinates, hydrologic context, drainage infrastructure characteristics, rainfall sources and intensities, and qualitative risk labels. By integrating global elevation and land datasets, local/remote rainfall sources, and infrastructure proximity metrics, the project supports hotspot detection, risk scoring, model training, and operational monitoring for effective flood risk management.


In [None]:
# Install additional required libraries
%pip install seaborn scikit-learn
%pip install pandas
%pip install matplotlib

# Import all necessary libraries for comprehensive analysis
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and copy
# warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Confirmation message printed once all of the setup above has run
print("Libraries imported successfully!")


In [3]:
# Read the flood-risk CSV into the notebook-wide DataFrame `df`, then print
# its first five rows as a quick sanity check of the parsed columns.
df = pd.read_csv(
    "D:/intern_dataset/archive/urban_pluvial_flood_risk_dataset.csv"
)
print(df.head(n=5))

  segment_id             city_name    admin_ward   latitude   longitude  \
0  SEG-00001    Colombo, Sri Lanka  Borough East   6.920633   79.912600   
1  SEG-00002        Chennai, India        Ward D  13.076487   80.281774   
2  SEG-00003      Ahmedabad, India     Sector 12  23.019473   72.638578   
3  SEG-00004      Hong Kong, China     Sector 14  22.302602  114.078673   
4  SEG-00005  Durban, South Africa      Sector 5 -29.887602   30.911008   

  catchment_id  elevation_m            dem_source       land_use soil_group  \
0      CAT-136          NaN  Copernicus_EEA-10_v5  Institutional        NaN   
1      CAT-049        -2.19  Copernicus_EEA-10_v5    Residential          D   
2      CAT-023        30.88             SRTM_3arc     Industrial          B   
3      CAT-168        24.28             SRTM_3arc    Residential          B   
4      CAT-171        35.70             SRTM_3arc     Industrial          C   

   drainage_density_km_per_km2  storm_drain_proximity_m storm_drain_type  

In [None]:
# Preview the first 10 rows of df (rendered as this cell's output).
df.head(10)

In [None]:

# Exploratory data analysis, part 1: print the dataset's shape, its column
# schema, and the first/last rows.
print("URBAN FLOOD RISK DATASET - EXPLORATORY DATA ANALYSIS")
# Load the dataset
df = pd.read_csv("D:/intern_dataset/archive/urban_pluvial_flood_risk_dataset.csv")

print("\n1. DATASET OVERVIEW:")
print(f"   • Dataset Shape: {df.shape}")
print(f"   • Number of Rows: {df.shape[0]:,}")
print(f"   • Number of Columns: {df.shape[1]}")

print("\n2. COLUMN INFORMATION:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

print("\n3. FIRST 5 ROWS:")
print(df.head())

print("\n4. LAST 5 ROWS:")
print(df.tail())


In [None]:


# Missing-value report: per-column null counts and percentages, followed by
# the totals across the whole DataFrame.
print("\n MISSING VALUES ANALYSIS:")

# Per-column null counts and their share of all rows
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# Tabulate, keep only the columns that actually have gaps, worst first
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percentage': missing_percentage.values,
})
has_gaps = missing_df['Missing_Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing_Percentage', ascending=False)

if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Columns with missing values:")
    print(missing_df.to_string(index=False))

# Overall totals: null cells versus rows x columns
total_missing = df.isnull().sum().sum()
total_cells = df.shape[0] * df.shape[1]
print(f"\nTotal missing values: {total_missing}")
print(f"Percentage of missing data: {(total_missing / total_cells) * 100:.2f}%")


In [None]:


# Summary statistics, reported separately for numerical and categorical
# (object-dtype) columns.
print("\n STATISTICS:")

# Numerical columns summary
numerical_cols = df.select_dtypes(include=[np.number]).columns
print(f"Numerical columns ({len(numerical_cols)}): {list(numerical_cols)}")

if len(numerical_cols):
    print("\nDescriptive Statistics for Numerical Columns:")
    print(df[numerical_cols].describe().round(2))

# Categorical columns summary
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"\nCategorical columns ({len(categorical_cols)}): {list(categorical_cols)}")

if len(categorical_cols):
    print("\nCategorical Columns Summary:")
    for col in categorical_cols:
        series = df[col]
        n_unique = series.nunique()
        modes = series.mode()
        # mode() is empty when the column has no non-null values
        most_frequent = 'N/A' if modes.empty else modes.iloc[0]

        print(f"\n{col}:")
        print(f"  • Unique values: {n_unique}")
        print(f"  • Most frequent: {most_frequent}")
        # Frequency table only for columns with at most 10 distinct values
        if n_unique <= 10:
            print("  • Value counts:")
            print(series.value_counts().head())


In [None]:
# Statistics: summary table produced by DataFrame.describe(), rendered as
# this cell's output.
df.describe()

In [None]:
# Bar chart of how many rows fall under each storm_drain_type value.
ax = sns.countplot(data=df, x='storm_drain_type')
ax.set_title('storm drain type')
plt.show()

In [None]:
import math

# Histogram (with KDE overlay) for each selected numeric column, laid out on
# a grid that is n_cols wide.
num_cols = ["latitude","longitude","elevation_m","return_period_years"]
n_cols = 2
n_rows = math.ceil(len(num_cols) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 6))
axes = axes.ravel()

for idx, ax in enumerate(axes):
    if idx >= len(num_cols):
        # grid slot with no column left to plot: drop the empty axes
        ax.remove()
        continue
    col = num_cols[idx]
    # nulls are dropped before plotting
    sns.histplot(df[col].dropna(), ax=ax, kde=True, bins=30)
    ax.set_title(f"{col} distribution")

plt.tight_layout()
plt.show()

In [None]:
# Scatter + trend: elevation against historical rainfall intensity
sns.regplot(
    data=df,
    x="elevation_m",
    y="historical_rainfall_intensity_mm_hr",
    scatter_kws={"alpha": 0.5},
)
plt.title("Elevation vs Rainfall Intensity")
plt.tight_layout()
plt.show()

# Joint distribution of latitude and elevation as a hexbin plot
sns.jointplot(data=df, x="latitude", y="elevation_m", kind="hex", height=5)
plt.show()

# Correlation (subset): annotated heatmap with the colour scale fixed to [-1, 1]
cols = ["latitude","longitude","elevation_m","return_period_years"]
corr_matrix = df[cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.tight_layout()
plt.show()


In [None]:
# Pairplot: pairwise grid over the num_cols columns, coloured by
# storm_drain_type.
sns.pairplot(data=df, vars=num_cols, hue='storm_drain_type')
plt.show()

In [None]:
# Show the column index of df (rendered as this cell's output).
df.columns


In [None]:
# Data preprocessing: label-encode the string-valued (categorical) columns
# of df in place.
categorical_cols = [
    'city_name', 'admin_ward', 'catchment_id', 'land_use', 'soil_group',
    'drainage_density_km_per_km2', 'storm_drain_proximity_m',
    'storm_drain_type', 'rainfall_source',
    'historical_rainfall_intensity_mm_hr', 'return_period_years',
]

# One fitted encoder per column, so each column's integer codes can be mapped
# back to labels with label_encoders[col].inverse_transform(...). A single
# shared encoder would only remember the last column it was fitted on.
label_encoders = {}
for col in categorical_cols:
    # Leave columns that are already numeric untouched: label-encoding a
    # measurement replaces its values with arbitrary category codes, and
    # return_period_years is later standardised as a numeric feature.
    # This also makes the cell safe to re-run after the columns are encoded.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    le = LabelEncoder()
    # Missing entries get an explicit "Missing" label, so the encoder sees only
    # strings and never has to sort str against float NaN.
    # NOTE(review): this makes "Missing" a class of its own, including in
    # storm_drain_type — confirm that is the desired treatment.
    df[col] = le.fit_transform(df[col].fillna("Missing").astype(str))
    label_encoders[col] = le

In [None]:
# Convert the textual is_urban flag to a nullable integer column:
# trim whitespace, lowercase, map "true"/"false" to 1/0, cast to Int64.
_flag_codes = {"true": 1, "false": 0}
_flag_text = df["is_urban"].str.strip().str.lower()
df["is_urban"] = _flag_text.map(_flag_codes).astype("Int64")

In [None]:
# Feature and target selection: every column except storm_drain_type is a
# feature; storm_drain_type is the target.
# The feature matrix is bound to uppercase `X` because the train/test split
# reads `X`; with only lowercase `x` defined that call raises NameError.
X = df.drop(columns='storm_drain_type')
y = df['storm_drain_type']
x = X  # lowercase alias kept so any code still referring to `x` keeps working

In [None]:
# Split data into training and test sets: 20% of the rows are held out for
# testing, with random_state fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Shape of the training feature set (rendered as this cell's output).
X_train.shape

In [None]:
# Shape of the test feature set (rendered as this cell's output).
X_test.shape

In [None]:
# Scale the numeric features (the columns listed in num_cols).
scaler = StandardScaler()

# train_test_split may hand back slices of X; work on explicit copies so the
# column assignments below never write into (or warn about) a view.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the mean/std from the training rows only, then apply those same
# statistics to the test rows. Refitting on the test set would scale the two
# sets differently and leak test-set statistics into the evaluation.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# Display the training feature set (rendered as this cell's output).
X_train