In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load the raw samples and report the class balance we start from.
df = pd.read_csv('combined_data.csv')
print("Original dataset shape:", df.shape)
print("\nClass distribution before SMOTE:", df['is_flood'].value_counts(), sep="\n")

# The two index columns are the features; `is_flood` is the label to balance.
X = df[['NDVI', 'NDWI']]
y = df['is_flood']

# A fixed random_state keeps the synthetic rows reproducible between runs.
smote = SMOTE(random_state=42, sampling_strategy='all')
X_resampled, y_resampled = smote.fit_resample(X, y)

# Rebuild one table holding the resampled features next to their labels.
df_resampled = pd.DataFrame(X_resampled, columns=list(X.columns)).assign(
    is_flood=y_resampled
)

print("\nResampled dataset shape:", df_resampled.shape)
print("\nClass distribution after SMOTE:", df_resampled['is_flood'].value_counts(), sep="\n")

# Persist the balanced table (row index omitted from the CSV).
df_resampled.to_csv('combined_data_smote.csv', index=False)
print("\nSaved resampled dataset to 'combined_data_smote.csv'")

Original dataset shape: (2071, 3)

Class distribution before SMOTE:
False    1570
True      501
Name: is_flood, dtype: int64

Resampled dataset shape: (3140, 3)

Class distribution after SMOTE:
True     1570
False    1570
Name: is_flood, dtype: int64

Saved resampled dataset to 'combined_data_smote.csv'


In [2]:
!pip uninstall scikit-learn --yes
!pip uninstall imblearn --yes


Found existing installation: scikit-learn 1.3.0
Uninstalling scikit-learn-1.3.0:
  Successfully uninstalled scikit-learn-1.3.0




In [3]:
!pip install scikit-learn==1.2.2


Collecting scikit-learn==1.2.2
  Obtaining dependency information for scikit-learn==1.2.2 from https://files.pythonhosted.org/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (8.3 MB)
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   - -------------------------------------- 0.4/8.3 MB 8.3 MB/s eta 0:00:01
   --- ------------------------------------ 0.7/8.3 MB 8.9 MB/s eta 0:00:01
   ----- ---------------------------------- 1.2/8.3 MB 8.4 MB/s eta 0:00:01
   ------- -------------------------------- 1.5/8.3 MB 8.1 MB/s eta 0:00:01
   ----------- ---------------------------- 2.3/8.3 MB 9.7 MB/s eta 0:00:01
   -------------- ------------------------- 3.0/8.3 MB 10.6 MB/s eta 0:00:01
   ------------------ --------------------- 3.9/8.3 MB 11.7 MB/s eta 0:00:01

In [4]:
!pip install imblearn


Collecting imblearn
  Obtaining dependency information for imblearn from https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl.metadata
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the raw samples and report the class balance we start from.
df = pd.read_csv('combined_data.csv')
print("Original dataset shape:", df.shape)
print("\nClass distribution before SMOTE:", df['is_flood'].value_counts(), sep="\n")

# The two index columns are the features; `is_flood` is the label to balance.
X = df[['NDVI', 'NDWI']]
y = df['is_flood']

# Scaling, stage 1: z-score each feature.
standard_scaler = StandardScaler()
X_standardized = standard_scaler.fit(X).transform(X)

# Scaling, stage 2: squeeze each feature into the [0, 1] range.
# NOTE(review): min-max scaling cancels any per-feature shift and positive
# rescale, so stage 1 should not change the final values beyond float rounding.
minmax_scaler = MinMaxScaler()
X_scaled = minmax_scaler.fit(X_standardized).transform(X_standardized)

# Oversampling target: 10,000 rows split evenly over the classes present in y.
target_total = 10_000
classes = y.value_counts().index.tolist()
n_classes = len(classes)
samples_per_class = target_total // n_classes

# Same requested count for every class label.
sampling_strategy = dict.fromkeys(classes, samples_per_class)

# A fixed random_state keeps the synthetic rows reproducible between runs.
smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Rebuild one table holding the scaled, resampled features next to their labels.
df_resampled = pd.DataFrame(X_resampled, columns=list(X.columns)).assign(
    is_flood=y_resampled
)

print("\nResampled dataset shape:", df_resampled.shape)
print("\nClass distribution after SMOTE:", df_resampled['is_flood'].value_counts(), sep="\n")

# Persist the balanced table (row index omitted from the CSV).
df_resampled.to_csv('combined_data_smote_10000_scaled.csv', index=False)
print("\nSaved resampled dataset to 'combined_data_smote_10000_scaled.csv'")


Original dataset shape: (2071, 3)

Class distribution before SMOTE:
False    1570
True      501
Name: is_flood, dtype: int64

Resampled dataset shape: (10000, 3)

Class distribution after SMOTE:
True     5000
False    5000
Name: is_flood, dtype: int64

Saved resampled dataset to 'combined_data_smote_10000_scaled.csv'




In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Inputs: the SMOTE-balanced satellite table and the India flood-risk table.
# Only the flood-risk table is rescaled here; satellite_df is loaded but not
# otherwise touched in this cell.
satellite_df = pd.read_csv('combined_data_smote_10000_scaled.csv')
flood_risk_df = pd.read_csv('datasets/india_flood_risk/flood_risk_dataset_india.csv')

# Columns to rescale: everything pandas parsed as float64 or int64.
numeric_cols = list(
    flood_risk_df.select_dtypes(include=['float64', 'int64']).columns
)

# Stage 1: z-score each numeric column.
scaler = StandardScaler()
flood_risk_standardized = scaler.fit_transform(flood_risk_df[numeric_cols])

# Stage 2: squeeze the standardized values into the [0, 1] range.
normalizer = MinMaxScaler()
flood_risk_normalized = normalizer.fit_transform(flood_risk_standardized)

# Work on a copy so flood_risk_df keeps the unscaled values for comparison.
flood_risk_scaled_df = flood_risk_df.copy()
flood_risk_scaled_df[numeric_cols] = flood_risk_normalized

# Show the first rows before and after scaling.
for caption, frame in (
    ("Original data (first 5 rows):", flood_risk_df),
    ("\nNormalized and standardized data (first 5 rows):", flood_risk_scaled_df),
):
    print(caption)
    print(frame[numeric_cols].head())

# Persist the rescaled table (row index omitted from the CSV).
flood_risk_scaled_df.to_csv('flood_risk_dataset_india_scaled.csv', index=False)


Original data (first 5 rows):
    Latitude  Longitude  Rainfall (mm)  Temperature (°C)  Humidity (%)  \
0  18.861663  78.835584     218.999493         34.144337     43.912963   
1  35.570715  77.654451      55.353599         28.778774     27.585422   
2  29.227824  73.108463     103.991908         43.934956     30.108738   
3  25.361096  85.610733     198.984191         21.569354     34.453690   
4  12.524541  81.822101     144.626803         32.635692     36.292267   

   River Discharge (m³/s)  Water Level (m)  Elevation (m)  Population Density  \
0             4236.182888         7.415552     377.465433         7276.742184   
1             2472.585219         8.811019    7330.608875         6897.736956   
2              977.328053         4.631799    2205.873488         4361.518494   
3             3683.208933         2.891787    2512.277800         6163.069701   
4             2093.390678         3.188466    2001.818223         6167.964591   

   Infrastructure  Historical Floods  

In [17]:
import pandas as pd

# Read the balanced, scaled dataset produced by the SMOTE step.
df = pd.read_csv('combined_data_smote_10000_scaled.csv')

# Convert the boolean label to 1/0. Series.map() turns any value missing from
# the mapping into NaN without raising, so verify every row was converted
# before anything is written or reported as successful.
flood_as_int = df['is_flood'].map({True: 1, False: 0})
unmapped = flood_as_int.isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'is_flood'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'is_flood' (expected True/False): {bad_values!r}"
    )
df['is_flood'] = flood_as_int.astype('int64')

# Write to a new file so the boolean-labelled CSV stays available.
df.to_csv('combined_data_smote_10000_scaled_numeric.csv', index=False)

print("Conversion completed successfully!")

Conversion completed successfully!
