In [33]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Filter, select and create features
After linking all the variables in a single `.csv` file, we select features that (1) have sufficient nonmissing data and (2) would be the most meaningful to the target prediction.
Furthermore, we also create lagged cases values as new features

In [34]:
PROCESSED_DIR = Path("../../data/03-processed/")
# LINKED_DATA = PROCESSED_DIR / "linked_df_city_weekly_dengue.csv"
LINKED_DATA = (
    PROCESSED_DIR / "linked_training_data/linked_df_pidsr_city_weekly_dengue.csv"
)

## Load Linked Dataset

In [35]:
linked_df = pd.read_csv(LINKED_DATA)
linked_df.head()

Unnamed: 0,Source,year,Month,Week,Date,Region,PSGC_Region,Municipality,ADM3_PCODE,ICD,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total,hospital_pct_population_reached,healthcenter_pop_reached_total,healthcenter_pct_population_reached,rhu_pop_reached_total,rhu_pct_population_reached
0,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
1,PIDSR-DOH,2008.0,1.0,2,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
2,PIDSR-DOH,2008.0,1.0,3,2008-01-14,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
3,PIDSR-DOH,2008.0,1.0,4,2008-01-21,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06
4,PIDSR-DOH,2008.0,1.0,5,2008-01-28,Region IX,PH090000000,Zamboanga City,PH097332000,A90-A91,...,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11,35.06


In [36]:
linked_df["Date"] = pd.to_datetime(linked_df["Date"])
linked_df = linked_df.sort_values(by=["Date"])
# linked_df = linked_df.drop(columns=["start_of_week"])
linked_df = linked_df[["Date"] + linked_df.columns.tolist()[:-1]]

In [37]:
linked_df

Unnamed: 0,Date,Source,year,Month,Week,Date.1,Region,PSGC_Region,Municipality,ADM3_PCODE,...,google_bldgs_density_sum,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total,hospital_pct_population_reached,healthcenter_pop_reached_total,healthcenter_pct_population_reached,rhu_pop_reached_total
0,2008-01-07,PIDSR-DOH,2008.0,1.0,1,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
1,2008-01-07,PIDSR-DOH,2008.0,1.0,2,2008-01-07,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
2,2008-01-14,PIDSR-DOH,2008.0,1.0,3,2008-01-14,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
3,2008-01-21,PIDSR-DOH,2008.0,1.0,4,2008-01-21,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
4,2008-01-28,PIDSR-DOH,2008.0,1.0,5,2008-01-28,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,2022-11-21,PIDSR-DOH,2022.0,11.0,48,2022-11-21,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
788,2022-11-28,PIDSR-DOH,2022.0,11.0,49,2022-11-28,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
789,2022-12-05,PIDSR-DOH,2022.0,12.0,50,2022-12-05,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11
790,2022-12-12,PIDSR-DOH,2022.0,12.0,51,2022-12-12,Region IX,PH090000000,Zamboanga City,PH097332000,...,0.112676,0.001116,908.037478,8.99047,5,124434.91,14.46,456862.05,53.09,301674.11


## Inspect missing data

**Listing down the reasons for nulls in the data**

- populated the missing timestamps in the data --- though the number of cases are 0, they are many nans such as the aggregation/age statistics (mean, min, max, etc.)
- male and female column breakdowns --- remove these columns --- no difference in trends
- some climate variables were Nan's (meaning that there are some weeks that did not match)
- tower_nearest_y ? <-- remove
- ookla contains nans since we only have data from 2019-2021
- missing nightlights data

In [38]:
linked_df.columns[linked_df.isnull().any()].tolist()

['Claims',
 'Deaths',
 'start_of_week',
 'CO_AVG_mean',
 'CO_MIN_mean',
 'CO_MAX_mean',
 'CO_STD_mean',
 'WEIGHTED_AVG_CO_mean',
 'HI_AVG_mean',
 'HI_MIN_mean',
 'HI_MAX_mean',
 'HI_STD_mean',
 'WEIGHTED_AVG_HI_mean',
 'NDVI_AVG_mean',
 'NDVI_MIN_mean',
 'NDVI_MAX_mean',
 'NDVI_STD_mean',
 'WEIGHTED_AVG_NDVI_mean',
 'NO2_AVG_mean',
 'NO2_MIN_mean',
 'NO2_MAX_mean',
 'NO2_STD_mean',
 'WEIGHTED_AVG_NO2_mean',
 'O3_AVG_mean',
 'O3_MIN_mean',
 'O3_MAX_mean',
 'O3_STD_mean',
 'WEIGHTED_AVG_O3_mean',
 'PM10_AVG_mean',
 'PM10_MIN_mean',
 'PM10_MAX_mean',
 'PM10_STD_mean',
 'WEIGHTED_AVG_PM10_mean',
 'PM25_AVG_mean',
 'PM25_MIN_mean',
 'PM25_MAX_mean',
 'PM25_STD_mean',
 'WEIGHTED_AVG_PM25_mean',
 'PNP_AVG_mean',
 'PNP_MIN_mean',
 'PNP_MAX_mean',
 'PNP_STD_mean',
 'WEIGHTED_AVG_PNP_mean',
 'PR_AVG_mean',
 'PR_MIN_mean',
 'PR_MAX_mean',
 'PR_STD_mean',
 'WEIGHTED_AVG_PR_mean',
 'RH_AVG_mean',
 'RH_MIN_mean',
 'RH_MAX_mean',
 'RH_STD_mean',
 'WEIGHTED_AVG_RH_mean',
 'SO2_AVG_mean',
 'SO2_MIN_mea

## Filter data and drop columns

In [39]:
# try filtering to check how much we have
filtered_linked_df = linked_df.copy()

### 1. Filter to year with existing population and cases data

In [40]:
filtered_linked_df = filtered_linked_df[
    (filtered_linked_df["year"] >= 2014) & (filtered_linked_df["year"] <= 2020)
]

In [41]:
filtered_linked_df.shape

(364, 255)

In [42]:
print(filtered_linked_df.columns.tolist())

['Date', 'Source', 'year', 'Month', 'Week', 'Date', 'Region', 'PSGC_Region', 'Municipality', 'ADM3_PCODE', 'ICD', 'Disease', 'Cases', 'Claims', 'Deaths', 'Case_Type', 'Date_Type', 'outbreak', 'outbreak_group', 'start_of_week', 'CO_AVG_mean', 'CO_MIN_mean', 'CO_MAX_mean', 'CO_STD_mean', 'WEIGHTED_AVG_CO_mean', 'HI_AVG_mean', 'HI_MIN_mean', 'HI_MAX_mean', 'HI_STD_mean', 'WEIGHTED_AVG_HI_mean', 'NDVI_AVG_mean', 'NDVI_MIN_mean', 'NDVI_MAX_mean', 'NDVI_STD_mean', 'WEIGHTED_AVG_NDVI_mean', 'NO2_AVG_mean', 'NO2_MIN_mean', 'NO2_MAX_mean', 'NO2_STD_mean', 'WEIGHTED_AVG_NO2_mean', 'O3_AVG_mean', 'O3_MIN_mean', 'O3_MAX_mean', 'O3_STD_mean', 'WEIGHTED_AVG_O3_mean', 'PM10_AVG_mean', 'PM10_MIN_mean', 'PM10_MAX_mean', 'PM10_STD_mean', 'WEIGHTED_AVG_PM10_mean', 'PM25_AVG_mean', 'PM25_MIN_mean', 'PM25_MAX_mean', 'PM25_STD_mean', 'WEIGHTED_AVG_PM25_mean', 'PNP_AVG_mean', 'PNP_MIN_mean', 'PNP_MAX_mean', 'PNP_STD_mean', 'WEIGHTED_AVG_PNP_mean', 'PR_AVG_mean', 'PR_MIN_mean', 'PR_MAX_mean', 'PR_STD_mean', '

### 2. Remove irrelevant columns

In [43]:
print(filtered_linked_df.columns[filtered_linked_df.isnull().any()].tolist())

['Claims', 'Deaths', 'PNP_AVG_mean', 'PNP_MIN_mean', 'PNP_MAX_mean', 'PNP_STD_mean', 'WEIGHTED_AVG_PNP_mean', 'SPI3_AVG_mean', 'SPI3_MIN_mean', 'SPI3_MAX_mean', 'SPI3_STD_mean', 'WEIGHTED_AVG_SPI3_mean', 'SPI6_AVG_mean', 'SPI6_MIN_mean', 'SPI6_MAX_mean', 'SPI6_STD_mean', 'WEIGHTED_AVG_SPI6_mean']


In [44]:
# remove ookla for now
filtered_linked_df = filtered_linked_df.drop(
    columns=[
        "Date",
        "Source",
        "year",
        "Month",
        "Week",
        "Region",
        "PSGC_Region",
        "Municipality",
        "ICD",
        "Disease",
        "Claims",
        "Deaths",
        "Case_Type",
        "Date_Type",
        "outbreak_group",
        # remove climate features with nulls
        "doh_pois_count_sum",
        "doh_pois_count_mean",
        "doh_brgy_health_station_count_sum",
        "doh_brgy_health_station_count_mean",
        "doh_rural_health_unit_count_sum",
        "doh_rural_health_unit_count_mean",
        "doh_hospital_count_sum",
        "doh_hospital_count_mean",
        "doh_birthing_home_lying_in_clinic_count_sum",
        "doh_birthing_home_lying_in_clinic_count_mean",
        "doh_infirmary_count_sum",
        "doh_infirmary_count_mean",
        "doh_drug_abuse_treatment_rehabilitation_center_count_sum",
        "doh_drug_abuse_treatment_rehabilitation_center_count_mean",
        "doh_social_hygiene_clinic_count_sum",
        "doh_social_hygiene_clinic_count_mean",
        "doh_medical_clinic_count_sum",
        "doh_medical_clinic_count_mean",
        "weighted_avg_doh_brgy_health_station_nearest_mean",
        "weighted_avg_doh_rural_health_unit_nearest_mean",
        "weighted_avg_doh_hospital_nearest_mean",
        "weighted_avg_doh_birthing_home_lying_in_clinic_nearest_mean",
        "weighted_avg_doh_infirmary_nearest_mean",
        "weighted_avg_doh_drug_abuse_treatment_rehabilitation_center_nearest_mean",
        "weighted_avg_doh_social_hygiene_clinic_nearest_mean",
        "weighted_avg_doh_medical_clinic_nearest_mean",
        "PNP_AVG_mean",
        "PNP_MIN_mean",
        "PNP_MAX_mean",
        "PNP_STD_mean",
        "WEIGHTED_AVG_PNP_mean",
        "SPI3_AVG_mean",
        "SPI3_MIN_mean",
        "SPI3_MAX_mean",
        "SPI3_STD_mean",
        "WEIGHTED_AVG_SPI3_mean",
        "SPI6_AVG_mean",
        "SPI6_MIN_mean",
        "SPI6_MAX_mean",
        "SPI6_STD_mean",
        "WEIGHTED_AVG_SPI6_mean",
        # remove static columns
        "pct_area_flood_hazard_100yr_low_mean",
        "pct_area_flood_hazard_100yr_med_mean",
        "pct_area_flood_hazard_100yr_high_mean",
        "pct_area_flood_hazard_25yr_low_mean",
        "pct_area_flood_hazard_25yr_med_mean",
        "pct_area_flood_hazard_25yr_high_mean",
        "pct_area_flood_hazard_5yr_low_mean",
        "pct_area_flood_hazard_5yr_med_mean",
        "pct_area_flood_hazard_5yr_high_mean",
        "pct_area_landslide_hazard_low_mean",
        "pct_area_landslide_hazard_med_mean",
        "pct_area_landslide_hazard_high_mean",
        "pct_area_bare_sparse_vegetation_mean",
        "pct_area_builtup_mean",
        "pct_area_cropland_mean",
        "pct_area_grassland_mean",
        "pct_area_herbaceous_wetland_mean",
        "pct_area_mangroves_mean",
        "pct_area_permanent_water_bodies_mean",
        "pct_area_shrubland_mean",
        "pct_area_tree_cover_mean",
        "google_bldgs_count_sum",
        "google_bldgs_count_mean",
        "google_bldgs_area_total_sum",
        "google_bldgs_area_total_mean",
        "google_bldgs_area_mean_sum",
        "google_bldgs_area_mean_mean",
        "google_bldgs_count_lt100_sqm_sum",
        "google_bldgs_count_lt100_sqm_mean",
        "google_bldgs_count_100_200_sqm_sum",
        "google_bldgs_count_100_200_sqm_mean",
        "google_bldgs_count_gt_200_sqm_sum",
        "google_bldgs_count_gt_200_sqm_mean",
        "google_bldgs_density_sum",
        "google_bldgs_density_mean",
        "google_bldgs_pct_built_up_area_sum",
        "google_bldgs_pct_built_up_area_mean",
        "travel_time",
        "hospital_pop_reached_total",
        "hospital_pct_population_reached",
        "healthcenter_pop_reached_total",
        "healthcenter_pct_population_reached",
        "rhu_pop_reached_total",
    ]
)
filtered_linked_df

Unnamed: 0,ADM3_PCODE,Cases,outbreak,start_of_week,CO_AVG_mean,CO_MIN_mean,CO_MAX_mean,CO_STD_mean,WEIGHTED_AVG_CO_mean,HI_AVG_mean,...,weighted_avg_osm_dock_nearest_mean,pop_count_total,pop_density_per_m2,brgy_pop_count_mean,brgy_total_area_mean,avg_rad_min_mean,avg_rad_max_mean,avg_rad_mean_mean,avg_rad_std_mean,avg_rad_median_mean
312,PH097332000,0.0,0,2014-01-06,0.071684,0.061561,0.081937,0.007905,0.071684,27.839929,...,10000.0,780762.749132,510.070301,7730.324249,1.515541e+07,1.738257,5.266028,2.946094,1.036254,2.805252
313,PH097332000,0.0,0,2014-01-06,0.071684,0.061561,0.081937,0.007905,0.071684,27.839929,...,10000.0,780762.749132,510.070301,7730.324249,1.515541e+07,1.738257,5.266028,2.946094,1.036254,2.805252
314,PH097332000,0.0,0,2014-01-13,0.080858,0.069232,0.093571,0.008738,0.080858,26.511542,...,10000.0,780762.749132,510.070301,7730.324249,1.515541e+07,1.738257,5.266028,2.946094,1.036254,2.805252
315,PH097332000,0.0,0,2014-01-20,0.081604,0.075696,0.091231,0.005616,0.081604,26.305700,...,10000.0,780762.749132,510.070301,7730.324249,1.515541e+07,1.738257,5.266028,2.946094,1.036254,2.805252
316,PH097332000,0.0,0,2014-01-27,0.074839,0.071346,0.081812,0.004384,0.074839,26.145516,...,10000.0,780762.749132,510.070301,7730.324249,1.515541e+07,1.738257,5.266028,2.946094,1.036254,2.805252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,PH097332000,0.0,0,2020-11-23,0.064094,0.059018,0.071787,0.004241,0.064094,29.145997,...,10000.0,860499.694844,562.162243,8519.798959,1.515541e+07,2.278330,5.602449,3.458333,0.987384,3.328969
672,PH097332000,0.0,0,2020-11-30,0.073589,0.060608,0.082439,0.008929,0.073589,29.262999,...,10000.0,860499.694844,562.162243,8519.798959,1.515541e+07,2.278330,5.602449,3.458333,0.987384,3.328969
673,PH097332000,5.0,0,2020-12-07,0.062223,0.059087,0.068834,0.003309,0.062223,29.446082,...,10000.0,860499.694844,562.162243,8519.798959,1.515541e+07,2.278330,5.602449,3.458333,0.987384,3.328969
674,PH097332000,2.0,0,2020-12-14,0.058720,0.056033,0.062671,0.002353,0.058720,28.973819,...,10000.0,860499.694844,562.162243,8519.798959,1.515541e+07,2.278330,5.602449,3.458333,0.987384,3.328969


### 3. Handle rows with null values
- Drop rows with null values for `Climate Variables`: NDVI, PNP, PR, SPI3, SPI6
- Replace null values for `Nightlight` features with 0

In [45]:
# remove rows that have no climate match
# filtered_linked_df = filtered_linked_df.dropna(
#     subset=[
#         'PNP_AVG_mean',
#         'PNP_MIN_mean',
#         'PNP_MAX_mean',
#         'PNP_STD_mean',
#         'SPI3_AVG_mean',
#         'SPI3_MIN_mean',
#         'SPI3_MAX_mean',
#         'SPI3_STD_mean',
#         'SPI6_AVG_mean',
#         'SPI6_MIN_mean',
#         'SPI6_MAX_mean',
#         'SPI6_STD_mean'
#     ]
# )
# clim_cols_to_interpolate = [
#     "PNP_AVG_mean",
#     "PNP_MIN_mean",
#     "PNP_MAX_mean",
#     "PNP_STD_mean",
#     "SPI3_AVG_mean",
#     "SPI3_MIN_mean",
#     "SPI3_MAX_mean",
#     "SPI3_STD_mean",
#     "SPI6_AVG_mean",
#     "SPI6_MIN_mean",
#     "SPI6_MAX_mean",
#     "SPI6_STD_mean",
# ]
# filtered_linked_df[clim_cols_to_interpolate] = filtered_linked_df[
#     clim_cols_to_interpolate
# ].interpolate(method="linear")

# filtered_linked_df[clim_cols_to_interpolate] = filtered_linked_df[clim_cols_to_interpolate].fillna(0)

ntl_features = [
    "avg_rad_min_mean",
    "avg_rad_max_mean",
    "avg_rad_mean_mean",
    "avg_rad_std_mean",
    "avg_rad_median_mean",
]
filtered_linked_df[ntl_features] = filtered_linked_df[ntl_features].fillna(0)
filtered_linked_df.shape

(364, 155)

## Create features

### 1. Add lagged cases as features

In [46]:
# Group by barangay and return the date of the previous number of cases
filtered_linked_df = filtered_linked_df.assign(
    prev_1_wk_numcases=filtered_linked_df["Cases"].shift(1, fill_value=0),
    prev_2_wk_numcases=filtered_linked_df["Cases"].shift(2, fill_value=0),
    prev_3_wk_numcases=filtered_linked_df["Cases"].shift(3, fill_value=0),
    prev_1_mo_numcases=filtered_linked_df["Cases"].shift(4, fill_value=0),
)
filtered_linked_df.shape

(364, 159)

In [47]:
# print out final columns
print(list(filtered_linked_df.columns))

['ADM3_PCODE', 'Cases', 'outbreak', 'start_of_week', 'CO_AVG_mean', 'CO_MIN_mean', 'CO_MAX_mean', 'CO_STD_mean', 'WEIGHTED_AVG_CO_mean', 'HI_AVG_mean', 'HI_MIN_mean', 'HI_MAX_mean', 'HI_STD_mean', 'WEIGHTED_AVG_HI_mean', 'NDVI_AVG_mean', 'NDVI_MIN_mean', 'NDVI_MAX_mean', 'NDVI_STD_mean', 'WEIGHTED_AVG_NDVI_mean', 'NO2_AVG_mean', 'NO2_MIN_mean', 'NO2_MAX_mean', 'NO2_STD_mean', 'WEIGHTED_AVG_NO2_mean', 'O3_AVG_mean', 'O3_MIN_mean', 'O3_MAX_mean', 'O3_STD_mean', 'WEIGHTED_AVG_O3_mean', 'PM10_AVG_mean', 'PM10_MIN_mean', 'PM10_MAX_mean', 'PM10_STD_mean', 'WEIGHTED_AVG_PM10_mean', 'PM25_AVG_mean', 'PM25_MIN_mean', 'PM25_MAX_mean', 'PM25_STD_mean', 'WEIGHTED_AVG_PM25_mean', 'PR_AVG_mean', 'PR_MIN_mean', 'PR_MAX_mean', 'PR_STD_mean', 'WEIGHTED_AVG_PR_mean', 'RH_AVG_mean', 'RH_MIN_mean', 'RH_MAX_mean', 'RH_STD_mean', 'WEIGHTED_AVG_RH_mean', 'SO2_AVG_mean', 'SO2_MIN_mean', 'SO2_MAX_mean', 'SO2_STD_mean', 'WEIGHTED_AVG_SO2_mean', 'SR_AVG_mean', 'SR_MIN_mean', 'SR_MAX_mean', 'SR_STD_mean', 'WEIGHT

## Save data

In [48]:
filtered_linked_df.to_csv(
    PROCESSED_DIR / "filtered_linked_df_city_weekly_dengue.csv", index=False
)