In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import sys

import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import shap

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

sys.path.append("../../")
from src.outbreak import create_outbreak_summary

# Outbreak prediction using Random Forest Experiment

Steps:
1. Split data by timestamp into training and testing set
2. Run to RF model
3. SHAP

Possible iterations: removing the static values, removing the annual values, removing both.

RandomForest in sklearn does not handle nulls, would need to remove the nulls. **In this run, the preprocessing includes:**
- Removing rows that did not contain Age statistics (Age_mean, Age_median, etc.) --- this essentially dropped the added time series rows that had no health data to them at all
- Dropped Ookla columns for now since they only match 2 years with the dataset
- RWI also removed and too many nulls --- also expected that the values would be repetitive since it is too coarse (bring back this feature if done at city level)
- population density columns are also removed for now since they contain plenty of nulls --- apparently it looks like if there are population count values in the raster, does not mean there are population density values too?

Try for next iteration:
- remove other feature
- city-level linked data?

In [3]:
PROCESSED_DIR = Path("../../data/03-processed/")
LINKED_DATA = PROCESSED_DIR / "linked_df_v2.csv"

## Load Linked Dataset

In [4]:
linked_df = pd.read_csv(LINKED_DATA)
linked_df.head()

Unnamed: 0,start_of_week,year,ADM4_PCODE,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_count_gt_200_sqm,google_bldgs_density,google_bldgs_pct_built_up_area,travel_time,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct
0,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,5,0.0,0.0,3611.67,76.28,0.0,0.0
1,2013-01-07,2013,PH097332002,3.0,5.333333,22.333333,15.027778,17.416667,8.748148,0.0,...,210,0.001999,14.409458,5,0.0,0.0,9497.76,64.74,10518.68,71.7
2,2013-01-07,2013,PH097332004,2.0,5.666667,10.333333,8.0,8.0,3.299832,2.0,...,390,0.004072,37.941451,5,5930.91,23.38,25016.24,98.61,23340.23,92.01
3,2013-01-07,2013,PH097332005,0.0,,,,,,,...,16,4.3e-05,0.176463,5,0.0,0.0,838.27,11.97,0.0,0.0
4,2013-01-07,2013,PH097332010,0.0,,,,,,,...,125,0.00066,4.093568,5,358.76,5.64,2953.82,46.47,0.0,0.0


In [37]:
linked_df.shape

(45493, 299)

In [5]:
linked_df["start_of_week"] = pd.to_datetime(linked_df["start_of_week"])
linked_df = linked_df.sort_values(by=["start_of_week"])

## Add lag

In [6]:
# Group by customer_id and return the date of the previous order
lagged_filtered_df = linked_df.assign(
    prev_1_wk_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        1, fill_value=0
    ),
    prev_2_wk_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        2, fill_value=0
    ),
    prev_3_wk_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        3, fill_value=0
    ),
    prev_1_mo_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        4, fill_value=0
    ),
)
lagged_filtered_df

Unnamed: 0,start_of_week,year,ADM4_PCODE,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases
0,2013-01-07,2013,PH097332001,0.0,,,,,,,...,0.00,0.00,3611.67,76.28,0.00,0.00,0.0,0.0,0.0,0.0
70,2013-01-07,2013,PH097332078,0.0,,,,,,,...,0.00,0.00,218.73,5.40,0.00,0.00,0.0,0.0,0.0,0.0
69,2013-01-07,2013,PH097332077,0.0,,,,,,,...,4859.95,102.01,4859.95,102.01,4348.94,91.28,0.0,0.0,0.0,0.0
68,2013-01-07,2013,PH097332076,9.0,5.000000,72.083333,26.37963,10.916667,27.227568,4.0,...,11814.54,34.21,6192.50,17.93,9878.40,28.60,0.0,0.0,0.0,0.0
67,2013-01-07,2013,PH097332075,5.0,0.416667,31.416667,16.05000,12.000000,12.552750,2.0,...,6583.66,88.94,7287.22,98.45,7339.76,99.16,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45423,2021-12-27,2021,PH097332035,0.0,,,,,,,...,0.00,0.00,789.85,16.09,0.00,0.00,0.0,0.0,0.0,0.0
45422,2021-12-27,2021,PH097332034,0.0,,,,,,,...,0.00,0.00,776.75,22.14,0.00,0.00,0.0,0.0,0.0,0.0
45421,2021-12-27,2021,PH097332033,0.0,,,,,,,...,0.00,0.00,42.40,0.45,6221.26,66.72,0.0,1.0,0.0,0.0
45418,2021-12-27,2021,PH097332030,0.0,,,,,,,...,0.00,0.00,810.47,56.31,0.00,0.00,0.0,0.0,0.0,0.0


In [7]:
lagged_filtered_df[lagged_filtered_df["ADM4_PCODE"] == "PH097332075"][
    [
        "ADM4_PCODE",
        "start_of_week",
        "NumCases",
        "prev_1_wk_numcases",
        "prev_2_wk_numcases",
        "prev_3_wk_numcases",
        "prev_1_mo_numcases",
    ]
]

Unnamed: 0,ADM4_PCODE,start_of_week,NumCases,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases
67,PH097332075,2013-01-07,5.0,0.0,0.0,0.0,0.0
164,PH097332075,2013-01-14,5.0,5.0,0.0,0.0,0.0
261,PH097332075,2013-01-21,3.0,5.0,5.0,0.0,0.0
358,PH097332075,2013-01-28,2.0,3.0,5.0,5.0,0.0
455,PH097332075,2013-02-04,7.0,2.0,3.0,5.0,5.0
...,...,...,...,...,...,...,...
45075,PH097332075,2021-11-29,1.0,0.0,0.0,1.0,1.0
45172,PH097332075,2021-12-06,0.0,1.0,0.0,0.0,1.0
45269,PH097332075,2021-12-13,0.0,0.0,1.0,0.0,0.0
45366,PH097332075,2021-12-20,0.0,0.0,0.0,1.0,0.0


## Inspect missing data

**Listing down the reasons for nulls in the data**

- populated the missing timestamps in the data --- though the number of cases are 0, they are many nans such as the aggregation/age statistics (mean, min, max, etc.)
- male and female column breakdowns --- remove these columns --- no difference in trends
- some climate variables were Nan's (meaning that there are some weeks that did not match)
- tower_nearest_y ? <-- remove
- ookla contains nans since we only have data from 2019-2021
- missing nightlights data

In [8]:
lagged_filtered_df.columns[lagged_filtered_df.isnull().any()].tolist()

['Age_min',
 'Age_max',
 'Age_mean',
 'Age_median',
 'Age_std',
 'Female',
 'Male',
 'NDVI_AVG',
 'NDVI_MIN',
 'NDVI_MAX',
 'PNP_AVG',
 'PNP_MIN',
 'PNP_MAX',
 'PNP_STD',
 'PR_AVG',
 'PR_MIN',
 'PR_MAX',
 'PR_STD',
 'SPI3_AVG',
 'SPI3_MIN',
 'SPI3_MAX',
 'SPI3_STD',
 'SPI6_AVG',
 'SPI6_MIN',
 'SPI6_MAX',
 'SPI6_STD',
 'poi_count',
 'clinic_count',
 'clinic_nearest',
 'dentist_count',
 'dentist_nearest',
 'doctors_count',
 'doctors_nearest',
 'hospital_count',
 'hospital_nearest',
 'optician_count',
 'optician_nearest',
 'pharmacy_count',
 'pharmacy_nearest',
 'atm_count',
 'atm_nearest',
 'bank_count',
 'bank_nearest',
 'college_count',
 'college_nearest',
 'community_centre_count',
 'community_centre_nearest',
 'comms_tower_count',
 'comms_tower_nearest',
 'convenience_count',
 'convenience_nearest',
 'fire_station_count',
 'fire_station_nearest',
 'kindergarten_count',
 'kindergarten_nearest',
 'lighthouse_count',
 'lighthouse_nearest',
 'market_place_count',
 'market_place_nearest',

In [9]:
# try filtering to check how much we have
filtered_linked_df = lagged_filtered_df.copy()

In [10]:
# filter to year with population
filtered_linked_df = filtered_linked_df[
    (filtered_linked_df["year"] >= 2014) & (filtered_linked_df["year"] <= 2020)
]
# try removing those without health data (age, etc)
filtered_linked_df = filtered_linked_df.dropna(
    subset=[
        "Age_min",
        "Age_max",
        "Age_mean",
        "Age_median",
        "Age_std",
    ]
)
# remove ookla for now
filtered_linked_df = filtered_linked_df.drop(
    columns=[
        # ookla
        "fixed_mean_avg_d_kbps_mean",
        "fixed_mean_avg_u_kbps_mean",
        "fixed_mean_avg_lat_ms_mean",
        "fixed_mean_num_tests_mean",
        "fixed_mean_num_devices_mean",
        "mobile_mean_avg_d_kbps_mean",
        "mobile_mean_avg_u_kbps_mean",
        "mobile_mean_avg_lat_ms_mean",
        "mobile_mean_num_tests_mean",
        "mobile_mean_num_devices_mean",
        # rwi
        "RWI_max",
        "RWI_mean",
        "RWI_median",
        "RWI_min",
        "RWI_std",
        # others
        "Year",
        "tower_nearest_y",
        "year",
        "Female",
        "Male",
        # population density
        # apparently there are nulls
        # even if there's population count values
        "pop_density_mean",
        "pop_density_median",
        "pop_density_stdev",
        "pop_density_min",
        "pop_density_max",
        # unneeded osm features
        "atm_count",
        "atm_nearest",
        "bank_count",
        "bank_nearest",
        "college_count",
        "college_nearest",
        "community_centre_count",
        "community_centre_nearest",
        "comms_tower_count",
        "comms_tower_nearest",
        "convenience_count",
        "convenience_nearest",
        "fire_station_count",
        "fire_station_nearest",
        "kindergarten_count",
        "kindergarten_nearest",
        "lighthouse_count",
        "lighthouse_nearest",
        "market_place_count",
        "market_place_nearest",
        "park_count",
        "park_nearest",
        "public_building_count",
        "public_building_nearest",
        "police_count",
        "police_nearest",
        "school_count",
        "school_nearest",
        "shelter_count",
        "shelter_nearest",
        "supermarket_count",
        "supermarket_nearest",
        "telephone_count",
        "telephone_nearest",
        "tower_count",
        "tower_nearest",
        "town_hall_count",
        "town_hall_nearest",
        "university_count",
        "university_nearest",
        "cable_count",
        "cable_nearest",
        "compensator_count",
        "compensator_nearest",
        "connection_count",
        "connection_nearest",
        "converter_count",
        "converter_nearest",
        "generator_count",
        "generator_nearest",
        "insulator_count",
        "insulator_nearest",
        "line_count",
        "line_nearest",
        "busbar_count",
        "busbar_nearest",
        "bay_count",
        "bay_nearest",
        "minor_line_count",
        "minor_line_nearest",
        "plant_count",
        "plant_nearest",
        "pole_count",
        "pole_nearest",
        "portal_count",
        "portal_nearest",
        "substation_count",
        "substation_nearest",
        "tower_count_y",
        "transformer_count",
        "transformer_nearest",
        "exchange_count",
        "exchange_nearest",
        "connection_point_count",
        "connection_point_nearest",
        "distribution_point_count",
        "distribution_point_nearest",
        "service_device_count",
        "service_device_nearest",
        "data_center_count",
        "data_center_nearest",
        # landuse
        "pct_area_builtup",
        "pct_area_cropland",
        "pct_area_grassland",
        "pct_area_herbaceous_wetland",
        "pct_area_mangroves",
        "pct_area_permanent_water_bodies",
        "pct_area_shrubland",
        "pct_area_tree_cover",
        # Age
        "Age_min",
        "Age_max",
        "Age_mean",
        "Age_median",
        "Age_std",
    ]
)
filtered_linked_df

Unnamed: 0,start_of_week,ADM4_PCODE,NumCases,outbreak,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,...,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases
5111,2014-01-06,PH097332075,5.0,1,0.071500,0.0610,0.0815,0.008019,27.432857,26.26,...,6583.66,88.94,7287.22,98.45,7339.76,99.16,1.0,6.0,1.0,2.0
5112,2014-01-06,PH097332076,2.0,0,0.071500,0.0610,0.0815,0.008019,27.432857,26.26,...,11814.54,34.21,6192.50,17.93,9878.40,28.60,0.0,0.0,2.0,6.0
5110,2014-01-06,PH097332074,5.0,0,0.071500,0.0610,0.0815,0.008019,27.602857,26.48,...,3710.70,82.11,3997.88,88.47,3997.88,88.47,0.0,0.0,0.0,0.0
5123,2014-01-06,PH097332087,2.0,0,0.071500,0.0610,0.0815,0.008019,27.432857,26.26,...,17253.49,63.46,2831.49,10.41,24367.76,89.62,0.0,1.0,3.0,4.0
5104,2014-01-06,PH097332068,8.0,1,0.071500,0.0610,0.0815,0.008019,27.432857,26.26,...,5639.72,96.68,6146.98,105.37,5341.58,91.57,0.0,3.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38845,2020-09-07,PH097332053,2.0,0,0.051214,0.0508,0.0516,0.000308,29.618571,28.55,...,2084.10,8.00,5434.42,20.86,7115.67,27.32,0.0,0.0,1.0,0.0
38952,2020-09-14,PH097332063,2.0,0,0.057414,0.0513,0.0634,0.004339,28.008571,26.51,...,4396.44,109.39,4396.44,109.39,4396.44,109.39,0.0,0.0,0.0,0.0
38976,2020-09-14,PH097332087,2.0,0,0.057414,0.0513,0.0634,0.004339,28.008571,26.51,...,17253.49,63.46,2831.49,10.41,24367.76,89.62,0.0,0.0,1.0,0.0
39499,2020-10-26,PH097332026,2.0,0,0.063386,0.0607,0.0675,0.002484,28.287143,27.18,...,1399.33,16.90,5307.62,64.09,0.00,0.00,0.0,0.0,0.0,0.0


In [11]:
# remove rows that have no climate match
filtered_linked_df = filtered_linked_df.dropna(
    subset=[
        "NDVI_AVG",
        "NDVI_MIN",
        "NDVI_MAX",
        "PNP_AVG",
        "PNP_MIN",
        "PNP_MAX",
        "PNP_STD",
        "PR_AVG",
        "PR_MIN",
        "PR_MAX",
        "PR_STD",
        "SPI3_AVG",
        "SPI3_MIN",
        "SPI3_MAX",
        "SPI3_STD",
        "SPI6_AVG",
        "SPI6_MIN",
        "SPI6_MAX",
        "SPI6_STD",
    ]
)
ntl_features = [
    "avg_rad_min",
    "avg_rad_max",
    "avg_rad_mean",
    "avg_rad_std",
    "avg_rad_median",
]
filtered_linked_df[ntl_features] = filtered_linked_df[ntl_features].fillna(0)
filtered_linked_df.shape


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(1013, 184)

## Split training and testing

In [12]:
filtered_linked_df = filtered_linked_df.set_index("start_of_week")
filtered_linked_df.head(2)

Unnamed: 0_level_0,ADM4_PCODE,NumCases,outbreak,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,...,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases
start_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-27,PH097332076,2.0,0,0.074671,0.0711,0.082,0.004529,25.911429,25.53,26.56,...,11814.54,34.21,6192.5,17.93,9878.4,28.6,0.0,1.0,2.0,0.0
2014-01-27,PH097332075,4.0,1,0.074671,0.0711,0.082,0.004529,25.911429,25.53,26.56,...,6583.66,88.94,7287.22,98.45,7339.76,99.16,7.0,4.0,5.0,1.0


In [13]:
# split features and target label
X = filtered_linked_df.drop(labels=["outbreak"], axis=1)
y = filtered_linked_df["outbreak"]

In [14]:
tss = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [15]:
# set aside the pcodes and numcases
brgy_tests = X_test["ADM4_PCODE"]
numcases_test = X_test["NumCases"]

X_train = X_train.drop(columns=["ADM4_PCODE", "NumCases"])
X_test = X_test.drop(columns=["ADM4_PCODE", "NumCases"])

In [16]:
X_train

Unnamed: 0_level_0,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,HI_STD,NDVI_AVG,NDVI_MIN,...,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases
start_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-27,0.074671,0.0711,0.0820,0.004529,25.911429,25.53,26.56,0.372623,0.491429,0.46,...,11814.54,34.21,6192.50,17.93,9878.40,28.60,0.0,1.0,2.0,0.0
2014-01-27,0.074671,0.0711,0.0820,0.004529,25.911429,25.53,26.56,0.372623,0.491429,0.46,...,6583.66,88.94,7287.22,98.45,7339.76,99.16,7.0,4.0,5.0,1.0
2014-01-27,0.074671,0.0711,0.0820,0.004529,25.911429,25.53,26.56,0.372623,0.491429,0.46,...,6117.18,113.87,6117.18,113.87,5193.52,96.68,4.0,1.0,1.0,0.0
2014-01-27,0.074671,0.0711,0.0820,0.004529,25.911429,25.53,26.56,0.372623,0.602857,0.52,...,14769.66,67.41,19570.04,89.32,8525.57,38.91,4.0,2.0,0.0,0.0
2014-01-27,0.074671,0.0711,0.0820,0.004529,25.911429,25.53,26.56,0.372623,0.554286,0.52,...,0.00,0.00,18134.69,80.63,4344.26,19.32,8.0,5.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-04-29,0.061843,0.0602,0.0654,0.001820,29.938571,29.44,30.61,0.437204,0.400000,0.38,...,17253.49,63.46,2831.49,10.41,24367.76,89.62,3.0,1.0,1.0,2.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,30.228571,29.57,30.98,0.536608,0.447143,0.40,...,0.00,0.00,1823.79,4.86,13529.96,36.03,4.0,3.0,2.0,4.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,30.520000,29.70,31.68,0.746905,0.551429,0.52,...,824.15,16.43,3741.79,74.58,661.47,13.18,1.0,2.0,2.0,1.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,29.938571,29.44,30.61,0.437204,0.400000,0.38,...,3514.11,63.34,5639.59,101.66,5630.38,101.49,3.0,0.0,1.0,2.0


In [17]:
X_test

Unnamed: 0_level_0,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,HI_STD,NDVI_AVG,NDVI_MIN,...,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases
start_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-04-29,0.061843,0.0602,0.0654,0.001820,29.938571,29.44,30.61,0.437204,0.548571,0.51,...,0.00,0.00,20722.16,85.22,145.25,0.60,2.0,3.0,1.0,1.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,29.938571,29.44,30.61,0.437204,0.515714,0.48,...,0.00,0.00,9497.76,64.74,10518.68,71.70,5.0,1.0,1.0,1.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,29.938571,29.44,30.61,0.437204,0.441429,0.42,...,2413.31,19.86,8616.51,70.91,11066.99,91.07,2.0,0.0,0.0,1.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,30.520000,29.70,31.68,0.746905,0.425714,0.39,...,0.00,0.00,3611.67,76.28,0.00,0.00,0.0,0.0,2.0,0.0
2019-04-29,0.061843,0.0602,0.0654,0.001820,30.520000,29.70,31.68,0.746905,0.484286,0.44,...,2084.10,8.00,5434.42,20.86,7115.67,27.32,2.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-02-24,0.066543,0.0649,0.0687,0.001486,29.767143,29.28,30.19,0.362386,0.451429,0.42,...,0.00,0.00,18134.69,80.63,4344.26,19.32,1.0,0.0,1.0,0.0
2020-02-24,0.066543,0.0649,0.0687,0.001486,29.767143,29.28,30.19,0.362386,0.520000,0.49,...,0.00,0.00,9497.76,64.74,10518.68,71.70,1.0,0.0,0.0,1.0
2020-02-24,0.066543,0.0649,0.0687,0.001486,30.385714,29.88,30.82,0.388581,0.542857,0.47,...,2084.10,8.00,5434.42,20.86,7115.67,27.32,1.0,1.0,2.0,4.0
2020-04-27,0.067614,0.0663,0.0693,0.001299,30.310000,29.81,30.78,0.376342,0.411429,0.39,...,0.00,0.00,18134.69,80.63,4344.26,19.32,0.0,1.0,2.0,0.0


## RF Classifier

In [18]:
X_train.columns.tolist()

['CO_AVG',
 'CO_MIN',
 'CO_MAX',
 'CO_STD',
 'HI_AVG',
 'HI_MIN',
 'HI_MAX',
 'HI_STD',
 'NDVI_AVG',
 'NDVI_MIN',
 'NDVI_MAX',
 'NDVI_STD',
 'NO2_AVG',
 'NO2_MIN',
 'NO2_MAX',
 'NO2_STD',
 'O3_AVG',
 'O3_MIN',
 'O3_MAX',
 'O3_STD',
 'PM10_AVG',
 'PM10_MIN',
 'PM10_MAX',
 'PM10_STD',
 'PM25_AVG',
 'PM25_MIN',
 'PM25_MAX',
 'PM25_STD',
 'PNP_AVG',
 'PNP_MIN',
 'PNP_MAX',
 'PNP_STD',
 'PR_AVG',
 'PR_MIN',
 'PR_MAX',
 'PR_STD',
 'RH_AVG',
 'RH_MIN',
 'RH_MAX',
 'RH_STD',
 'SO2_AVG',
 'SO2_MIN',
 'SO2_MAX',
 'SO2_STD',
 'SPI3_AVG',
 'SPI3_MIN',
 'SPI3_MAX',
 'SPI3_STD',
 'SPI6_AVG',
 'SPI6_MIN',
 'SPI6_MAX',
 'SPI6_STD',
 'SR_AVG',
 'SR_MIN',
 'SR_MAX',
 'SR_STD',
 'Tave_AVG',
 'Tave_MIN',
 'Tave_MAX',
 'Tave_STD',
 'Tmax_AVG',
 'Tmax_MIN',
 'Tmax_MAX',
 'Tmax_STD',
 'Tmin_AVG',
 'Tmin_MIN',
 'Tmin_MAX',
 'Tmin_STD',
 'UVR_AVG',
 'UVR_MIN',
 'UVR_MAX',
 'UVR_STD',
 'WS_AVG',
 'WS_MIN',
 'WS_MAX',
 'WS_STD',
 'poi_count',
 'clinic_count',
 'clinic_nearest',
 'dentist_count',
 'dentist_neares

In [19]:
clf_regressor = RandomForestClassifier(n_jobs=-1, n_estimators=400, random_state=42)
# Train the regressor on the training data
clf_regressor.fit(X_train, y_train)
# Make predictions on the test set
y_pred = clf_regressor.predict(X_test)
y_pred_proba = clf_regressor.predict_proba(X_test)

In [20]:
# Evaluate the model
score = clf_regressor.score(X_test, y_test)
score  # mean accuracy

0.8537549407114624

### Basic plots to understand model

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test, y_pred, labels=clf_regressor.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf_regressor.classes_
)
disp.plot()
plt.show()

In [None]:
prediction = y_pred_proba[:, 1]  # prediction for outbreak
plt.figure(figsize=(15, 7))
plt.hist(prediction[y_test == 0], bins=50, label="No Outbreak")
plt.hist(prediction[y_test == 1], bins=50, label="Outbreak", alpha=0.7, color="r")
plt.xlabel("Probability of being Outbreak Class", fontsize=10)
plt.ylabel("Number of records in each bucket", fontsize=10)
plt.legend(fontsize=15, title="Actual Labels")
plt.tick_params(axis="both", labelsize=8, pad=5)
plt.show()

### Understanding the results

In [23]:
analyze_df = X_test.copy()
analyze_df["ADM4_PCODE"] = brgy_tests
analyze_df["NumCases"] = numcases_test
analyze_df["actual_class"] = y_test
analyze_df["predicted_class"] = y_pred
analyze_df["predicted_proba_outbreak"] = y_pred_proba[:, 1]
analyze_df = analyze_df.reset_index()
analyze_df.head()

Unnamed: 0,start_of_week,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,HI_STD,NDVI_AVG,...,rhu_pop_reached_pct,prev_1_wk_numcases,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases,ADM4_PCODE,NumCases,actual_class,predicted_class,predicted_proba_outbreak
0,2019-04-29,0.061843,0.0602,0.0654,0.00182,29.938571,29.44,30.61,0.437204,0.548571,...,0.6,2.0,3.0,1.0,1.0,PH097332016,4.0,0,0,0.0525
1,2019-04-29,0.061843,0.0602,0.0654,0.00182,29.938571,29.44,30.61,0.437204,0.515714,...,71.7,5.0,1.0,1.0,1.0,PH097332002,3.0,0,0,0.1675
2,2019-04-29,0.061843,0.0602,0.0654,0.00182,29.938571,29.44,30.61,0.437204,0.441429,...,91.07,2.0,0.0,0.0,1.0,PH097332031,3.0,0,0,0.1425
3,2019-04-29,0.061843,0.0602,0.0654,0.00182,30.52,29.7,31.68,0.746905,0.425714,...,0.0,0.0,0.0,2.0,0.0,PH097332001,3.0,0,0,0.0425
4,2019-04-29,0.061843,0.0602,0.0654,0.00182,30.52,29.7,31.68,0.746905,0.484286,...,27.32,2.0,1.0,0.0,0.0,PH097332053,4.0,0,0,0.11


#### Outbreak periods

In [24]:
actual_outbreaks_summary = create_outbreak_summary(analyze_df, "actual_class")
actual_outbreaks_summary["category"] = "actual"
predicted_outbreaks_summary = create_outbreak_summary(analyze_df, "predicted_class")
predicted_outbreaks_summary["category"] = "predicted"

In [25]:
actual_outbreaks_summary.head(3)

Unnamed: 0,ADM4_PCODE,outbreak_group,start_date,end_date,actual_length_weeks,category
0,PH097332002,1,2019-07-01,2019-08-26,3,actual
1,PH097332004,3,2019-05-27,2019-08-26,4,actual
2,PH097332010,5,2019-07-29,2019-08-26,2,actual


In [28]:
predicted_outbreaks_summary.head(3)

Unnamed: 0,ADM4_PCODE,outbreak_group,start_date,end_date,actual_length_weeks,category
0,PH097332002,1,2019-07-01,2019-08-26,3,predicted
1,PH097332004,3,2019-07-01,2019-08-26,3,predicted
2,PH097332010,5,2019-07-29,2019-07-29,1,predicted


In [29]:
outbreak_lengths_results = pd.concat(
    [actual_outbreaks_summary, predicted_outbreaks_summary]
)
outbreak_lengths_results

Unnamed: 0,ADM4_PCODE,outbreak_group,start_date,end_date,actual_length_weeks,category
0,PH097332002,1,2019-07-01,2019-08-26,3,actual
1,PH097332004,3,2019-05-27,2019-08-26,4,actual
2,PH097332010,5,2019-07-29,2019-08-26,2,actual
3,PH097332011,7,2019-08-26,2019-08-26,1,actual
4,PH097332013,7,2019-07-01,2019-07-29,2,actual
...,...,...,...,...,...,...
22,PH097332085,45,2019-05-27,2019-08-26,4,predicted
23,PH097332087,47,2019-05-27,2019-08-26,4,predicted
24,PH097332087,49,2019-11-25,2019-11-25,1,predicted
25,PH097332092,51,2019-05-27,2019-08-26,4,predicted


In [30]:
# processing
sample_brgy = outbreak_lengths_results.copy()
sample_brgy = sample_brgy[sample_brgy["ADM4_PCODE"] == "PH097332087"]
sample_brgy

Unnamed: 0,ADM4_PCODE,outbreak_group,start_date,end_date,actual_length_weeks,category
30,PH097332087,55,2019-05-27,2019-09-30,5,actual
31,PH097332087,57,2019-11-25,2019-11-25,1,actual
23,PH097332087,47,2019-05-27,2019-08-26,4,predicted
24,PH097332087,49,2019-11-25,2019-11-25,1,predicted


In [None]:
import plotly.express as px

class_colors = {"predicted": "#ee472f", "actual": "#53bed0"}

fig = px.scatter(
    sample_brgy,
    x=["start_date", "end_date"],
    y="category",
    color="category",
    size="actual_length_weeks",
    labels={"x": "Date"},
    category_orders={"category": ["actual", "predicted"]},
    color_discrete_map=class_colors,
)
for i, row in sample_brgy.iterrows():
    fig.add_trace(
        go.Scatter(
            mode="lines",
            # layer="below",
            line=dict(
                color=sample_brgy["category"].map(class_colors)[i],
                width=3,
                dash="solid",
            ),
            # connect the two markers
            # x0=row["start_date"], y0=row["category"],
            # x1=row["end_date"], y1=row["category"],
            x=[row["start_date"], row["end_date"]],
            y=[row["category"], row["category"]],
            showlegend=False,
        )
    )
# Update layout with title and axis titles
fig.update_layout(
    title="Dengue Outbreak Periods for Brgy. Tetuan in Zamboanga",
    xaxis_title="Date",
    yaxis_title="Category",
)
fig.show()

#### Line + Bar plot 

In [32]:
sample_brgy = analyze_df.copy()
sample_brgy = sample_brgy[sample_brgy["ADM4_PCODE"] == "PH097332087"]
sample_brgy["readable_date"] = sample_brgy["start_of_week"].dt.date

In [33]:
sample_brgy

Unnamed: 0,start_of_week,CO_AVG,CO_MIN,CO_MAX,CO_STD,HI_AVG,HI_MIN,HI_MAX,HI_STD,NDVI_AVG,...,prev_2_wk_numcases,prev_3_wk_numcases,prev_1_mo_numcases,ADM4_PCODE,NumCases,actual_class,predicted_class,predicted_proba_outbreak,outbreak_group,readable_date
22,2019-05-27,0.055143,0.0524,0.0583,0.00243,30.57,29.16,31.53,0.815332,0.39,...,2.0,1.0,2.0,PH097332087,9.0,1,1,0.595,47,2019-05-27
57,2019-07-01,0.056957,0.0526,0.0599,0.002284,28.234286,26.17,30.2,1.524608,0.451429,...,5.0,11.0,3.0,PH097332087,13.0,1,1,0.6625,47,2019-07-01
106,2019-07-29,0.0574,0.0523,0.0644,0.004923,28.622857,27.3,29.93,0.971146,0.462857,...,27.0,11.0,13.0,PH097332087,21.0,1,1,0.78,47,2019-07-29
156,2019-08-26,0.062814,0.0608,0.0667,0.00214,28.047143,26.97,29.31,0.976878,0.412857,...,19.0,16.0,21.0,PH097332087,10.0,1,1,0.7825,47,2019-08-26
193,2019-09-30,0.053829,0.0519,0.0576,0.002006,28.402857,27.0,30.0,0.951555,0.432857,...,2.0,7.0,11.0,PH097332087,7.0,1,0,0.365,48,2019-09-30
210,2019-10-28,0.059843,0.0553,0.0776,0.007986,27.978571,27.03,29.12,0.84454,0.428571,...,5.0,5.0,7.0,PH097332087,3.0,0,0,0.4475,48,2019-10-28
231,2019-11-25,0.071986,0.0632,0.0888,0.008862,29.807143,28.78,30.78,0.730382,0.408571,...,6.0,2.0,3.0,PH097332087,3.0,1,1,0.625,49,2019-11-25
241,2020-01-27,0.0828,0.0642,0.1197,0.024305,29.05,28.42,29.79,0.481733,0.42,...,1.0,2.0,0.0,PH097332087,4.0,0,0,0.275,50,2020-01-27


In [None]:
fig, ax = plt.subplots(figsize=(16, 6))


# Define colors for each class
# shakespeare, teraccota
class_colors = {0: "#53bed0", 1: "#ee472f"}

# Plot the bar plot
ax.bar(
    sample_brgy["readable_date"],
    sample_brgy["NumCases"],
    alpha=0.5,
    label="Bar Plot",
    width=8,
    color=[class_colors[c] for c in sample_brgy["predicted_class"]],
)

# Plot the line plot
ax.plot(
    sample_brgy["readable_date"],
    sample_brgy["NumCases"],
    marker="o",
    color="red",
    label="Line Plot",
)


# max_y = max(sample_brgy['NumCases'])
# ax1.set_ylim(0, max_y * 1.1)  # Adjust multiplier as needed
# ax2.set_ylim(0, max_y * 1.1)

# Rotate x-axis labels
plt.xticks(rotation=30)

# Create legend handles and labels for bar plot
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, color=color) for color in class_colors.values()
]
legend_labels = list(class_colors.keys())

# Add legend for bar plot
ax.legend(legend_handles, legend_labels, fontsize="large", title="Predicted Outbreak")
# Add title and axis labels
ax.set_title("Dengue Outbreak Periods for Brgy. Tetuan in Zamboanga")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Cases")
plt.show()

### SHAP results

In [38]:
explainer = shap.Explainer(clf_regressor)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values[0], X_test)