In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Filter, select and create features
After linking all the variables in a single `.csv` file, we select features that (1) have sufficient nonmissing data and (2) would be the most meaningful to the target prediction.
Furthermore, we also create lagged cases values as new features

In [2]:
PROCESSED_DIR = Path("../../data/03-processed/")
LINKED_DATA = PROCESSED_DIR / "linked_df_v2.csv"

## Load Linked Dataset

In [3]:
linked_df = pd.read_csv(LINKED_DATA)
linked_df.head()

Unnamed: 0,start_of_week,year,ADM4_PCODE,NumCases,Age_min,Age_max,Age_mean,Age_median,Age_std,Female,...,google_bldgs_count_gt_200_sqm,google_bldgs_density,google_bldgs_pct_built_up_area,travel_time,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct
0,2013-01-07,2013,PH097332001,0.0,,,,,,,...,35,0.003804,15.039279,5,0.0,0.0,3611.67,76.28,0.0,0.0
1,2013-01-07,2013,PH097332002,3.0,5.333333,22.333333,15.027778,17.416667,8.748148,0.0,...,210,0.001999,14.409458,5,0.0,0.0,9497.76,64.74,10518.68,71.7
2,2013-01-07,2013,PH097332004,2.0,5.666667,10.333333,8.0,8.0,3.299832,2.0,...,390,0.004072,37.941451,5,5930.91,23.38,25016.24,98.61,23340.23,92.01
3,2013-01-07,2013,PH097332005,0.0,,,,,,,...,16,4.3e-05,0.176463,5,0.0,0.0,838.27,11.97,0.0,0.0
4,2013-01-07,2013,PH097332010,0.0,,,,,,,...,125,0.00066,4.093568,5,358.76,5.64,2953.82,46.47,0.0,0.0


In [4]:
linked_df["date"] = pd.to_datetime(linked_df["start_of_week"])
linked_df = linked_df.sort_values(by=["date","ADM4_PCODE"])
linked_df = linked_df.drop(columns=["start_of_week"])
linked_df = linked_df[['date']+linked_df.columns.tolist()[:-1]]

## Inspect missing data

**Listing down the reasons for nulls in the data**

- populated the missing timestamps in the data --- though the number of cases are 0, they are many nans such as the aggregation/age statistics (mean, min, max, etc.)
- male and female column breakdowns --- remove these columns --- no difference in trends
- some climate variables were Nan's (meaning that there are some weeks that did not match)
- tower_nearest_y ? <-- remove
- ookla contains nans since we only have data from 2019-2021
- missing nightlights data

In [5]:
linked_df.columns[linked_df.isnull().any()].tolist()

['Age_min',
 'Age_max',
 'Age_mean',
 'Age_median',
 'Age_std',
 'Female',
 'Male',
 'NDVI_AVG',
 'NDVI_MIN',
 'NDVI_MAX',
 'PNP_AVG',
 'PNP_MIN',
 'PNP_MAX',
 'PNP_STD',
 'PR_AVG',
 'PR_MIN',
 'PR_MAX',
 'PR_STD',
 'SPI3_AVG',
 'SPI3_MIN',
 'SPI3_MAX',
 'SPI3_STD',
 'SPI6_AVG',
 'SPI6_MIN',
 'SPI6_MAX',
 'SPI6_STD',
 'poi_count',
 'clinic_count',
 'clinic_nearest',
 'dentist_count',
 'dentist_nearest',
 'doctors_count',
 'doctors_nearest',
 'hospital_count',
 'hospital_nearest',
 'optician_count',
 'optician_nearest',
 'pharmacy_count',
 'pharmacy_nearest',
 'atm_count',
 'atm_nearest',
 'bank_count',
 'bank_nearest',
 'college_count',
 'college_nearest',
 'community_centre_count',
 'community_centre_nearest',
 'comms_tower_count',
 'comms_tower_nearest',
 'convenience_count',
 'convenience_nearest',
 'fire_station_count',
 'fire_station_nearest',
 'kindergarten_count',
 'kindergarten_nearest',
 'lighthouse_count',
 'lighthouse_nearest',
 'market_place_count',
 'market_place_nearest',

## Filter data and drop columns

In [6]:
# try filtering to check how much we have
filtered_linked_df = linked_df.copy()

### 1. Filter to year with existing population and cases data

In [7]:
filtered_linked_df = filtered_linked_df[
    (filtered_linked_df["year"] >= 2014) & (filtered_linked_df["year"] <= 2020)
]

### 2. Remove irrelevant columns

In [8]:
# remove ookla for now
filtered_linked_df = filtered_linked_df.drop(
    columns=[
        # ookla
        "fixed_mean_avg_d_kbps_mean",
        "fixed_mean_avg_u_kbps_mean",
        "fixed_mean_avg_lat_ms_mean",
        "fixed_mean_num_tests_mean",
        "fixed_mean_num_devices_mean",
        "mobile_mean_avg_d_kbps_mean",
        "mobile_mean_avg_u_kbps_mean",
        "mobile_mean_avg_lat_ms_mean",
        "mobile_mean_num_tests_mean",
        "mobile_mean_num_devices_mean",
        # rwi
        "RWI_max",
        "RWI_mean",
        "RWI_median",
        "RWI_min",
        "RWI_std",
        # others
        "year",
        "tower_nearest_y",
        # population density
        # apparently there are nulls
        # even if there's population count values
        "pop_density_mean",
        "pop_density_median",
        "pop_density_stdev",
        "pop_density_min",
        "pop_density_max",
        # unneeded osm features
        "atm_count",
        "atm_nearest",
        "bank_count",
        "bank_nearest",
        "college_count",
        "college_nearest",
        "community_centre_count",
        "community_centre_nearest",
        "comms_tower_count",
        "comms_tower_nearest",
        "convenience_count",
        "convenience_nearest",
        "fire_station_count",
        "fire_station_nearest",
        "kindergarten_count",
        "kindergarten_nearest",
        "lighthouse_count",
        "lighthouse_nearest",
        "market_place_count",
        "market_place_nearest",
        "park_count",
        "park_nearest",
        "public_building_count",
        "public_building_nearest",
        "police_count",
        "police_nearest",
        "school_count",
        "school_nearest",
        "shelter_count",
        "shelter_nearest",
        "supermarket_count",
        "supermarket_nearest",
        "telephone_count",
        "telephone_nearest",
        "tower_count",
        "tower_nearest",
        "town_hall_count",
        "town_hall_nearest",
        "university_count",
        "university_nearest",
        "cable_count",
        "cable_nearest",
        "compensator_count",
        "compensator_nearest",
        "connection_count",
        "connection_nearest",
        "converter_count",
        "converter_nearest",
        "generator_count",
        "generator_nearest",
        "insulator_count",
        "insulator_nearest",
        "line_count",
        "line_nearest",
        "busbar_count",
        "busbar_nearest",
        "bay_count",
        "bay_nearest",
        "minor_line_count",
        "minor_line_nearest",
        "plant_count",
        "plant_nearest",
        "pole_count",
        "pole_nearest",
        "portal_count",
        "portal_nearest",
        "substation_count",
        "substation_nearest",
        "tower_count_y",
        "transformer_count",
        "transformer_nearest",
        "exchange_count",
        "exchange_nearest",
        "connection_point_count",
        "connection_point_nearest",
        "distribution_point_count",
        "distribution_point_nearest",
        "service_device_count",
        "service_device_nearest",
        "data_center_count",
        "data_center_nearest",
        # landuse
        "pct_area_builtup",
        "pct_area_cropland",
        "pct_area_grassland",
        "pct_area_herbaceous_wetland",
        "pct_area_mangroves",
        "pct_area_permanent_water_bodies",
        "pct_area_shrubland",
        "pct_area_tree_cover",
        # Age
        "Age_min",
        "Age_max",
        "Age_mean",
        "Age_median",
        "Age_std",
    ]
)
filtered_linked_df

Unnamed: 0,date,ADM4_PCODE,NumCases,Female,Male,outbreak,CO_AVG,CO_MIN,CO_MAX,CO_STD,...,google_bldgs_count_gt_200_sqm,google_bldgs_density,google_bldgs_pct_built_up_area,travel_time,hospital_pop_reached_total,hospital_pop_reached_pct,healthcenter_pop_reached_total,healthcenter_pop_reached_pct,rhu_pop_reached_total,rhu_pop_reached_pct
5044,2014-01-06,PH097332001,1.0,0.0,1.0,0,0.071500,0.0610,0.0815,0.008019,...,35,0.003804,15.039279,5,0.00,0.00,3611.67,76.28,0.00,0.00
5045,2014-01-06,PH097332002,1.0,1.0,0.0,0,0.071500,0.0610,0.0815,0.008019,...,210,0.001999,14.409458,5,0.00,0.00,9497.76,64.74,10518.68,71.70
5046,2014-01-06,PH097332004,1.0,0.0,1.0,0,0.071500,0.0610,0.0815,0.008019,...,390,0.004072,37.941451,5,5930.91,23.38,25016.24,98.61,23340.23,92.01
5047,2014-01-06,PH097332005,0.0,,,0,0.071500,0.0610,0.0815,0.008019,...,16,0.000043,0.176463,5,0.00,0.00,838.27,11.97,0.00,0.00
5048,2014-01-06,PH097332010,0.0,,,0,0.071500,0.0610,0.0815,0.008019,...,125,0.000660,4.093568,5,358.76,5.64,2953.82,46.47,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40444,2020-12-28,PH097332100,0.0,,,0,0.060600,0.0595,0.0632,0.001753,...,23,0.000036,0.158494,5,0.00,0.00,1031.79,16.21,0.00,0.00
40445,2020-12-28,PH097332101,0.0,,,0,0.059275,0.0574,0.0601,0.001261,...,136,0.004811,30.891231,5,2729.87,20.76,8667.49,65.93,6707.37,51.02
40446,2020-12-28,PH097332102,1.0,0.0,1.0,0,0.059275,0.0574,0.0601,0.001261,...,131,0.000981,6.676404,5,2200.08,88.13,1967.59,78.82,0.00,0.00
40447,2020-12-28,PH097332103,1.0,1.0,0.0,0,0.059250,0.0575,0.0604,0.001240,...,20,0.000124,0.677485,5,0.00,0.00,1499.95,39.13,422.64,11.03


### 3. Handle rows with null values
- Drop rows with null values for `Climate Variables`: NDVI, PNP, PR, SPI3, SPI6
- Replace null values for `Nightlight` features with 0

In [9]:
# remove rows that have no climate match
filtered_linked_df = filtered_linked_df.dropna(
    subset=[
        "NDVI_AVG",
        "NDVI_MIN",
        "NDVI_MAX",
        "PNP_AVG",
        "PNP_MIN",
        "PNP_MAX",
        "PNP_STD",
        "PR_AVG",
        "PR_MIN",
        "PR_MAX",
        "PR_STD",
        "SPI3_AVG",
        "SPI3_MIN",
        "SPI3_MAX",
        "SPI3_STD",
        "SPI6_AVG",
        "SPI6_MIN",
        "SPI6_MAX",
        "SPI6_STD",
    ]
)
ntl_features = [
    "avg_rad_min",
    "avg_rad_max",
    "avg_rad_mean",
    "avg_rad_std",
    "avg_rad_median",
]
filtered_linked_df[ntl_features] = filtered_linked_df[ntl_features].fillna(0)
filtered_linked_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_linked_df[ntl_features] = filtered_linked_df[ntl_features].fillna(0)


(7488, 183)

## Create features

### 1. Add lagged cases as features

In [10]:
# Group by barangay and return the date of the previous number of cases
filtered_linked_df = filtered_linked_df.assign(
    prev_1_wk_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        1, fill_value=0
    ),
    prev_2_wk_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        2, fill_value=0
    ),
    prev_3_wk_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        3, fill_value=0
    ),
    prev_1_mo_numcases=lambda x: x.groupby(["ADM4_PCODE"])["NumCases"].shift(
        4, fill_value=0
    ),
)
filtered_linked_df.shape

(7488, 187)

In [11]:
# print out final columns
print(list(filtered_linked_df.columns))

['date', 'ADM4_PCODE', 'NumCases', 'Female', 'Male', 'outbreak', 'CO_AVG', 'CO_MIN', 'CO_MAX', 'CO_STD', 'HI_AVG', 'HI_MIN', 'HI_MAX', 'HI_STD', 'NDVI_AVG', 'NDVI_MIN', 'NDVI_MAX', 'NDVI_STD', 'NO2_AVG', 'NO2_MIN', 'NO2_MAX', 'NO2_STD', 'O3_AVG', 'O3_MIN', 'O3_MAX', 'O3_STD', 'PM10_AVG', 'PM10_MIN', 'PM10_MAX', 'PM10_STD', 'PM25_AVG', 'PM25_MIN', 'PM25_MAX', 'PM25_STD', 'PNP_AVG', 'PNP_MIN', 'PNP_MAX', 'PNP_STD', 'PR_AVG', 'PR_MIN', 'PR_MAX', 'PR_STD', 'RH_AVG', 'RH_MIN', 'RH_MAX', 'RH_STD', 'SO2_AVG', 'SO2_MIN', 'SO2_MAX', 'SO2_STD', 'SPI3_AVG', 'SPI3_MIN', 'SPI3_MAX', 'SPI3_STD', 'SPI6_AVG', 'SPI6_MIN', 'SPI6_MAX', 'SPI6_STD', 'SR_AVG', 'SR_MIN', 'SR_MAX', 'SR_STD', 'Tave_AVG', 'Tave_MIN', 'Tave_MAX', 'Tave_STD', 'Tmax_AVG', 'Tmax_MIN', 'Tmax_MAX', 'Tmax_STD', 'Tmin_AVG', 'Tmin_MIN', 'Tmin_MAX', 'Tmin_STD', 'UVR_AVG', 'UVR_MIN', 'UVR_MAX', 'UVR_STD', 'WS_AVG', 'WS_MIN', 'WS_MAX', 'WS_STD', 'poi_count', 'clinic_count', 'clinic_nearest', 'dentist_count', 'dentist_nearest', 'doctors_cou

## Save data

In [12]:
filtered_linked_df.to_csv(PROCESSED_DIR / "filtered_linked_df.csv", index=False)