In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Filter, select and create features
After linking all the variables into a single `.csv` file, we select features that (1) have sufficient non-missing data and (2) would be the most meaningful for predicting the target.
Furthermore, we create lagged case counts as new features.

In [2]:
# Location of the processed data folder, relative to this notebook, and the
# linked city-level weekly dengue table that this notebook reads.
PROCESSED_DIR = Path("../../data/03-processed/")
LINKED_DATA = PROCESSED_DIR.joinpath("linked_df_city_weekly_dengue.csv")

## Load Linked Dataset

In [3]:
# Read the linked dataset from disk and preview its first five rows.
linked_df = pd.read_csv(filepath_or_buffer=LINKED_DATA)
linked_df.head(n=5)

Unnamed: 0,start_of_week,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total_mean,hospital_pop_reached_pct_mean,healthcenter_pop_reached_total_mean,healthcenter_pop_reached_pct_mean,rhu_pop_reached_total_mean,rhu_pop_reached_pct_mean
0,2013-01-01,PH097332000,2013,66,0.5,68.083333,17.587121,14.75,14.592766,28,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
1,2013-01-07,PH097332000,2013,77,0.416667,72.083333,15.054113,10.416667,15.442763,34,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
2,2013-01-14,PH097332000,2013,83,0.666667,45.0,13.447791,11.333333,10.454911,38,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
3,2013-01-21,PH097332000,2013,75,0.416667,49.583333,12.897778,10.25,9.506395,37,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
4,2013-01-28,PH097332000,2013,85,0.25,49.583333,12.851961,11.333333,9.896491,47,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584


In [4]:
# Parse `start_of_week` into a datetime `date` column, order the rows
# chronologically, and drop the original string column.
linked_df = (
    linked_df.assign(date=pd.to_datetime(linked_df["start_of_week"]))
    .sort_values(by=["date"])
    .drop(columns=["start_of_week"])
)
# `date` was appended as the last column; move it to the front.
leading_date_order = ["date", *linked_df.columns[:-1]]
linked_df = linked_df[leading_date_order]

In [5]:
# Display the date-sorted frame, now with `date` as the leading column.
linked_df

Unnamed: 0,date,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total_mean,hospital_pop_reached_pct_mean,healthcenter_pop_reached_total_mean,healthcenter_pop_reached_pct_mean,rhu_pop_reached_total_mean,rhu_pop_reached_pct_mean
0,2013-01-01,PH097332000,2013,66,0.500000,68.083333,17.587121,14.750000,14.592766,28,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
1,2013-01-07,PH097332000,2013,77,0.416667,72.083333,15.054113,10.416667,15.442763,34,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
2,2013-01-14,PH097332000,2013,83,0.666667,45.000000,13.447791,11.333333,10.454911,38,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
3,2013-01-21,PH097332000,2013,75,0.416667,49.583333,12.897778,10.250000,9.506395,37,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
4,2013-01-28,PH097332000,2013,85,0.250000,49.583333,12.851961,11.333333,9.896491,47,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,2021-11-29,PH097332000,2021,27,0.580000,79.000000,16.784074,12.000000,16.000713,11,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
471,2021-12-06,PH097332000,2021,26,0.580000,66.000000,15.176154,10.000000,15.970134,12,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
472,2021-12-13,PH097332000,2021,35,3.000000,32.000000,14.200000,12.000000,8.407979,17,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
473,2021-12-20,PH097332000,2021,27,0.500000,32.000000,10.388889,10.000000,7.766215,13,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584


## Inspect missing data

**Listing down the reasons for nulls in the data**

- We populated the missing timestamps in the data --- although the number of cases is 0 there, many values are NaN, such as the aggregated age statistics (mean, min, max, etc.)
- Male and female column breakdowns --- remove these columns, since they show no difference in trends
- Some climate variables are NaN (meaning that some weeks did not match)
- `tower_nearest_y` ? <-- remove
- Ookla contains NaNs since we only have data from 2019-2021
- Missing nightlights data

In [6]:
# Names of every column that contains at least one null value.
has_null = linked_df.isna().any()
has_null[has_null].index.tolist()

['Age_std',
 'PNP_AVG_mean',
 'PNP_MIN_mean',
 'PNP_MAX_mean',
 'PNP_STD_mean',
 'SPI3_AVG_mean',
 'SPI3_MIN_mean',
 'SPI3_MAX_mean',
 'SPI3_STD_mean',
 'SPI6_AVG_mean',
 'SPI6_MIN_mean',
 'SPI6_MAX_mean',
 'SPI6_STD_mean',
 'poi_count_sum',
 'poi_count_mean',
 'clinic_count_sum',
 'clinic_count_mean',
 'clinic_nearest_sum',
 'clinic_nearest_mean',
 'dentist_count_sum',
 'dentist_count_mean',
 'dentist_nearest_sum',
 'dentist_nearest_mean',
 'doctors_count_sum',
 'doctors_count_mean',
 'doctors_nearest_sum',
 'doctors_nearest_mean',
 'hospital_count_sum',
 'hospital_count_mean',
 'hospital_nearest_sum',
 'hospital_nearest_mean',
 'optician_count_sum',
 'optician_count_mean',
 'optician_nearest_sum',
 'optician_nearest_mean',
 'pharmacy_count_sum',
 'pharmacy_count_mean',
 'pharmacy_nearest_sum',
 'pharmacy_nearest_mean',
 'drinking_water_count_sum',
 'drinking_water_count_mean',
 'drinking_water_nearest_sum',
 'drinking_water_nearest_mean',
 'water_mill_count_sum',
 'water_mill_count_m

## Filter data and drop columns

In [30]:
# Work on an independent copy so the filtering steps below leave `linked_df` untouched.
filtered_linked_df = linked_df.copy(deep=True)

### 1. Filter to years with existing population and cases data

In [31]:
# Keep only rows whose `year` falls in 2014-2020 (both bounds inclusive).
in_study_years = filtered_linked_df["year"].between(2014, 2020)
filtered_linked_df = filtered_linked_df[in_study_years]

In [32]:
# (rows, columns) remaining after the 2014-2020 year filter.
filtered_linked_df.shape

(369, 281)

### 2. Remove irrelevant columns

In [33]:
# Columns that still contain nulls after the year filter.
null_mask = filtered_linked_df.isna().any()
print(filtered_linked_df.columns[null_mask].tolist())

['Age_std', 'PNP_AVG_mean', 'PNP_MIN_mean', 'PNP_MAX_mean', 'PNP_STD_mean', 'SPI3_AVG_mean', 'SPI3_MIN_mean', 'SPI3_MAX_mean', 'SPI3_STD_mean', 'SPI6_AVG_mean', 'SPI6_MIN_mean', 'SPI6_MAX_mean', 'SPI6_STD_mean']


In [34]:
# Only `Age_std` is dropped at this step; it is one of the columns reported
# above as still containing nulls.
# The drop list used to carry commented-out (inactive) entries for Ookla, RWI,
# population density, OSM amenity/power/telecom features, land use, the other
# Age_* statistics, `year` and `tower_nearest_y`; none of those are removed here.
columns_to_drop = ["Age_std"]
filtered_linked_df = filtered_linked_df.drop(columns=columns_to_drop)
filtered_linked_df

Unnamed: 0,date,ADM3_PCODE,year,NumCases,Age_min,Age_max,Age_mean,Age_median,Female,Male,...,google_bldgs_density_mean,google_bldgs_pct_built_up_area_sum,google_bldgs_pct_built_up_area_mean,travel_time,hospital_pop_reached_total_mean,hospital_pop_reached_pct_mean,healthcenter_pop_reached_total_mean,healthcenter_pop_reached_pct_mean,rhu_pop_reached_total_mean,rhu_pop_reached_pct_mean
53,2014-01-01,PH097332000,2014,30,1.00,50.0,13.000000,9.0,17,13,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
54,2014-01-06,PH097332000,2014,50,0.00,65.0,13.478400,9.5,21,29,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
55,2014-01-13,PH097332000,2014,65,0.00,69.0,15.517949,11.0,24,41,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
56,2014-01-20,PH097332000,2014,107,0.58,83.0,13.867570,11.0,57,50,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
57,2014-01-27,PH097332000,2014,64,0.42,67.0,16.556042,14.5,34,30,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,2020-11-23,PH097332000,2020,6,0.33,13.0,4.055000,2.5,3,3,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
418,2020-12-07,PH097332000,2020,3,1.00,14.0,9.000000,12.0,0,3,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
419,2020-12-14,PH097332000,2020,3,1.00,51.0,18.666667,4.0,2,1,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584
420,2020-12-21,PH097332000,2020,2,3.00,7.0,5.000000,5.0,2,0,...,0.001116,908.037478,8.99047,5,1160.556535,15.182673,4263.402871,44.363465,2829.889802,23.311584


### 3. Handle rows with null values
- Fill null values for `Climate Variables` (PNP, SPI3, SPI6) by linear interpolation instead of dropping the affected rows; NDVI and PR have no nulls in the filtered data
- Replace null values for `Nightlight` features with 0

In [35]:
# Rows without a climate match are kept; their missing PNP / SPI3 / SPI6 values
# are filled by linear interpolation along the current row order.
# NOTE(review): with pandas' default fill direction, NaNs that precede the first
# valid value in a column are left as NaN -- confirm none remain.
clim_cols_to_interpolate = [
    f"{variable}_{stat}_mean"
    for variable in ("PNP", "SPI3", "SPI6")
    for stat in ("AVG", "MIN", "MAX", "STD")
]
climate_filled = filtered_linked_df[clim_cols_to_interpolate].interpolate(
    method="linear"
)
filtered_linked_df[clim_cols_to_interpolate] = climate_filled

# Nightlight statistics: any missing value is replaced with 0.
ntl_features = [
    f"avg_rad_{stat}_mean" for stat in ("min", "max", "mean", "std", "median")
]
nightlights_filled = filtered_linked_df[ntl_features].fillna(0)
filtered_linked_df[ntl_features] = nightlights_filled
filtered_linked_df.shape

(369, 280)

## Create features

### 1. Add lagged cases as features

In [39]:
# Add the case counts of the previous 1, 2, 3 and 4 rows as lag features.
# The shift is done within each ADM3_PCODE so that one area's counts never
# become another area's "previous week"; for a frame holding a single area the
# result is the same as shifting the whole column.
# Lags with no earlier row in the group are filled with 0.
# NOTE(review): the lag is row-based and relies on the frame being sorted by
# date with one row per consecutive week -- confirm there are no skipped weeks.
lag_rows_by_feature = {
    "prev_1_wk_numcases": 1,
    "prev_2_wk_numcases": 2,
    "prev_3_wk_numcases": 3,
    "prev_1_mo_numcases": 4,
}
cases_by_area = filtered_linked_df.groupby("ADM3_PCODE")["NumCases"]
filtered_linked_df = filtered_linked_df.assign(
    **{
        feature_name: cases_by_area.shift(n_rows, fill_value=0)
        for feature_name, n_rows in lag_rows_by_feature.items()
    }
)
filtered_linked_df.shape

(369, 284)

In [40]:
# Show the complete list of columns that will be written out.
final_columns = filtered_linked_df.columns.tolist()
print(final_columns)

['date', 'ADM3_PCODE', 'year', 'NumCases', 'Age_min', 'Age_max', 'Age_mean', 'Age_median', 'Female', 'Male', 'outbreak', 'CO_AVG_mean', 'CO_MIN_mean', 'CO_MAX_mean', 'CO_STD_mean', 'HI_AVG_mean', 'HI_MIN_mean', 'HI_MAX_mean', 'HI_STD_mean', 'NDVI_AVG_mean', 'NDVI_MIN_mean', 'NDVI_MAX_mean', 'NDVI_STD_mean', 'NO2_AVG_mean', 'NO2_MIN_mean', 'NO2_MAX_mean', 'NO2_STD_mean', 'O3_AVG_mean', 'O3_MIN_mean', 'O3_MAX_mean', 'O3_STD_mean', 'PM10_AVG_mean', 'PM10_MIN_mean', 'PM10_MAX_mean', 'PM10_STD_mean', 'PM25_AVG_mean', 'PM25_MIN_mean', 'PM25_MAX_mean', 'PM25_STD_mean', 'PNP_AVG_mean', 'PNP_MIN_mean', 'PNP_MAX_mean', 'PNP_STD_mean', 'PR_AVG_mean', 'PR_MIN_mean', 'PR_MAX_mean', 'PR_STD_mean', 'RH_AVG_mean', 'RH_MIN_mean', 'RH_MAX_mean', 'RH_STD_mean', 'SO2_AVG_mean', 'SO2_MIN_mean', 'SO2_MAX_mean', 'SO2_STD_mean', 'SPI3_AVG_mean', 'SPI3_MIN_mean', 'SPI3_MAX_mean', 'SPI3_STD_mean', 'SPI6_AVG_mean', 'SPI6_MIN_mean', 'SPI6_MAX_mean', 'SPI6_STD_mean', 'SR_AVG_mean', 'SR_MIN_mean', 'SR_MAX_mean', 'S

## Save data

In [42]:
# Write the filtered, feature-augmented table next to the input file,
# without the index column.
output_csv = PROCESSED_DIR / "filtered_linked_df_city_weekly_dengue.csv"
filtered_linked_df.to_csv(output_csv, index=False)