<font size="+3"><strong> Ensemble Models: Random Forest</strong></font>

In [1]:
import pickle 

import warnings
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import make_pipeline

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action="ignore", category=FutureWarning)

## Import Data

In [2]:
import pycountry

def wrangle(filepath):
    """
    Read Flood data CSV, clean it safely, and return a cleaned DataFrame.

    Each step is skipped when its source column is absent:

    * ``coordinates`` ("[lon, lat]" strings) is split into float
      ``longitude`` and ``latitude`` columns.
    * ``fromdate`` / ``todate`` are parsed as UTC and converted to the
      Africa/Nairobi timezone; unparseable values become NaT.
    * ``country`` is lower-cased and, when a cell lists several names
      separated by ``|``, reduced to the most frequent one (the first one
      listed wins a tie).
    * ``country_iso3`` is added with the ISO 3166-1 alpha-3 code looked up
      through ``pycountry``; rows whose country cannot be resolved are dropped.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Cleaned, independent copy of the data. The original row index is
        kept, so it has gaps wherever rows were dropped.
    """

    # Read data
    df = pd.read_csv(filepath)

    # Split coordinates into longitude and latitude
    if 'coordinates' in df.columns:
        df[['longitude', 'latitude']] = (df['coordinates'].str.strip('[]').str.split(',', expand=True))
        df[['longitude', 'latitude']] = df[['longitude', 'latitude']].astype(float)

    # Convert dates to Africa/Nairobi timezone
    for date_col in ['fromdate', 'todate']:
        if date_col in df.columns:
            df[date_col] = (pd.to_datetime(df[date_col], errors='coerce', utc=True).dt.tz_convert('Africa/Nairobi'))

    # COUNTRY CLEANING & STANDARDISATION
    def most_frequent_country(cell):
        """Return the most frequent name in a '|'-separated cell, or NaN if there is none."""
        if pd.isna(cell):
            return np.nan
        names = [part.strip() for part in str(cell).lower().replace('.', '').split('|')]
        # Ignore empty fragments (e.g. from a trailing '|') so they can never win
        names = [name for name in names if name]
        if not names:
            return np.nan
        # dict.fromkeys() de-duplicates while keeping first-seen order and max()
        # returns the first maximal item, so ties are resolved deterministically
        # (a set's iteration order changes between interpreter runs).
        return max(dict.fromkeys(names), key=names.count)

    def to_iso3(name):
        """Map a country name to its ISO-3 code, or NaN when there is no match."""
        if not isinstance(name, str) or not name:
            return np.nan
        try:
            return pycountry.countries.lookup(name).alpha_3
        except LookupError:
            # pycountry signals "no such country" with LookupError; anything
            # else is a genuine bug and should surface instead of being hidden.
            return np.nan

    if 'country' in df.columns:
        df['country'] = df['country'].apply(most_frequent_country)

        # Convert country to ISO-3 and drop invalid entries
        df['country_iso3'] = df['country'].apply(to_iso3)
        # .copy() detaches the result from the filtered view so callers can
        # modify it without SettingWithCopy warnings.
        df = df[df['country_iso3'].notna()].copy()

    return df

In [3]:
# Load and clean the raw flood events; the path is relative to the notebook's working directory.
df = wrangle("Global Disaster Events/Unclean/Flood.csv")

### Feature Selection 

In [4]:
# Columns to carry forward from the wrangled data
columns = [
    'alertlevel',
    "country",
    "severity",
    "country_iso3",
]

# Take an independent copy so later edits to `floods` never touch `df`
floods = df.loc[:, columns].copy()

In [5]:
# Preview the first rows of the selected columns
floods.head()

Unnamed: 0,alertlevel,country,severity,country_iso3
0,ORANGE,mozambique,7.74,MOZ
1,GREEN,philippines,4.92,PHL
2,GREEN,angola,5.58,AGO
4,ORANGE,mozambique,7.74,MOZ
5,RED,madagascar,6.79,MDG


In [6]:
# Summarise column dtypes and non-null counts to confirm nothing is missing
floods.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2256 entries, 0 to 2440
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   alertlevel    2256 non-null   object 
 1   country       2256 non-null   object 
 2   severity      2256 non-null   float64
 3   country_iso3  2256 non-null   object 
dtypes: float64(1), object(3)
memory usage: 88.1+ KB


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Split Data

Create your feature matrix `X` and target vector `y`. The target is `"severity"` and the only feature is `"country_iso3"`.

In [8]:
# Name the prediction target and the input column(s)
target = "severity"
features = ["country_iso3"]

# Feature matrix (DataFrame, because `features` is a list) and target vector (Series)
X, y = floods[features], floods[target]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (2256, 1)
y shape: (2256,)


Divide the data (`X` and `y`) into training and test sets using a randomized train-test split. Set test set to 20% of the total data. Set a `random_state` for reproducibility. 

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every split
for _caption, _split in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(_caption, _split.shape)

X_train shape: (1804, 1)
y_train shape: (1804,)
X_test shape: (452, 1)
y_test shape: (452,)


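Divide training data (`X_train` and `y_train`) into training and validation sets using a randomized train-test split. Validation data = 20% of the remaining data. **TODO:** no code cell in this notebook currently performs this split, so the model below is evaluated on the training data only.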

# Build Model

## Baseline

<!-- Calculate the baseline accuracy score for the model. -->

Calculate the baseline mean absolute error (MAE) for the model.

In [10]:
from sklearn.metrics import mean_absolute_error

In [11]:
# Naive baseline: predict the mean training target for every training row
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]

print("Mean Flood Magnitude:", round(y_mean, 2))
print("Baseline MAE:", mean_absolute_error(y_train, y_pred_baseline))

Mean Flood Magnitude: 4.05
Baseline MAE: 2.311582065722391


## Iterate

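Instantiate a `OneHotEncoder` named `ohe`. (scikit-learn's `OneHotEncoder` has no `use_cat_names` argument — that option belongs to the `category_encoders` package; readable column names are available from `get_feature_names_out()` instead.) Fit the transformer to the feature matrix `X_train`, use it to transform `X_train`, and assign the transformed data to the variable `XT_train`.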

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Instantiate. handle_unknown="ignore" encodes a category that was not seen
# during fit as an all-zero row instead of raising, which matters here because
# the random split can put countries in X_test that never occur in X_train.
ohe = OneHotEncoder(handle_unknown="ignore")
# Fit on the training features and transform them in one step
XT_train = ohe.fit_transform(X_train)
print(XT_train.shape)

# Convert "XT_train" (a sparse matrix) to a DataFrame, labelled with the
# encoded category names and aligned to the original training rows
xtrain = pd.DataFrame(
    XT_train.toarray(),
    columns=ohe.get_feature_names_out(),
    index=X_train.index,
)

(1804, 142)


Create a pipeline named `rfr` that contains a `OneHotEncoder` transformer and a `RandomForestRegressor` predictor.

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Give the pipeline its own encoder rather than re-using the already-fitted
# global `ohe`: a Pipeline fits its steps in place, so sharing the object would
# silently refit it. handle_unknown="ignore" lets the fitted pipeline predict on
# rows whose country never appeared in the training split.
rfr = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
)
rfr.fit(X_train, y_train)

## Evaluation

Create a list of predictions for the observations in your feature matrix `X_train`. Name this list `y_pred_training`. Then calculate the training mean absolute error for your predictions in `y_pred_training` as compared to the true targets in `y_train`.


In [15]:
# Predict on the rows the model was fitted on; this measures fit, not generalisation
y_pred_training = rfr.predict(X_train)

# Average absolute gap between the true and predicted target values
mae_training = mean_absolute_error(y_true=y_train, y_pred=y_pred_training)

print("Training MAE:", round(mae_training, 2))

Training MAE: 2.02


In [16]:
# Inspect the first five training predictions
y_pred_training[:5]

array([5.09718393, 4.95765992, 3.45737908, 5.81075849, 3.29595718])

In [17]:
# Access the trained RandomForestRegressor inside the pipeline.
# make_pipeline names each step after its lowercased class name.
rf_model = rfr.named_steps['randomforestregressor']

### Extract the feature names of your encoded data from the `OneHotEncoder` in the model.

In [18]:
# Get feature importances from the fitted forest
importances = rf_model.feature_importances_

# Read the names from the encoder inside the fitted pipeline so they are
# guaranteed to describe the exact columns the forest was trained on
feature_names = rfr.named_steps['onehotencoder'].get_feature_names_out()

# zip() silently truncates on a length mismatch, so fail loudly instead
assert len(feature_names) == len(importances), (
    "encoder output columns and model importances are out of sync"
)

# Combine importances with feature names
feature_importance_dict = dict(zip(feature_names, importances))

In [19]:
# Sort features by importance
# sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

In [20]:
# Build a two-column table (one row per encoded feature) and order it so the
# most important features come first
features_ = pd.DataFrame(
    {
        "Feature": list(feature_importance_dict.keys()),
        "Importance": list(feature_importance_dict.values()),
    }
).sort_values(by="Importance", ascending=False)

In [23]:
# Show the five most important encoded features (the table is already sorted descending)
features_.head()

Unnamed: 0,Feature,Importance
23,country_iso3_CHN,0.057878
60,country_iso3_ITA,0.051591
98,country_iso3_PER,0.041675
26,country_iso3_COL,0.040825
8,country_iso3_BDI,0.03809


## Author

<a href="https://www.linkedin.com/in/andrew-kalumba-harris/">Andrew Kalumba</a><br>
<a href =""> </a>


| Date (YYYY-MM-DD) | Prepared By     | 
| ----------------- | --------------  | 
| 2025-12-20        | Author          | 


<h3 align="center"> © Data Science 2025. </h3>