
> This notebook prepares the data for the poverty mapping model for the Philippines.

In [2]:
import os
from functools import reduce
import pandas as pd
import yaml
import subprocess

## Set the configuration for this notebook

In [6]:
# Notebook-wide settings: local/remote locations, per-dataset folder names,
# DHS archive names, the period, cluster-sampling options and the switches
# selecting which feature tables are merged.
prepare_config = {
    # local directories
    "save_path": "../data/outputs/",
    "repo_path": "../data/SVII_PH_KH_MM_TL",
    # cloud storage locations
    "download_gcs_uri": "gs://poverty-mapping/outputs/",
    "output_gcs_uri": "gs://poverty-mapping/outputs/",
    # country identifiers
    "data_dir": "ph",
    "country": "ph",
    # per-dataset folder names
    "ookla_folder": "ookla_ph",
    "hdx_folder": "hdx_ph",
    "dhs_folder": "dhs_ph",
    "osm_folder": "osm_ph",
    "dhs_geo_zip_folder": "PHGE71FL",
    "dhs_zip_folder": "PHHR71DT",
    "training_folder": "training_ph",
    "viirs_folder": "viirs_ph",
    # currently disabled options
    # "crs": "4683",
    # "ookla_feature": "avg_d_mbps",
    # "boundary_file": "phl_adminboundaries_candidate_adm3",
    # period
    "year": "2020",
    "quarter": "2",
    # cluster sampling
    "sample": False,
    "random_sample": False,
    "no_samples": 60,
    "random_seed": 42,
    # currently disabled options
    # "clust_rad": 2000,
    # "plot_ookla_features": True,
    # "adm_level": 3,
    # "use_pcode": True,
    # "shape_label": "ADM3_PCODE",
    # "bins": 6,
    # "show_legend": False,
    # feature tables to include in the merge
    "use_ookla": True,
    "use_viirs": True,
    "use_osm": True,
}


## Download the different datasets from the cloud storage bucket

In [7]:
# Keys of prepare_config whose values name the bucket folders to download.
dataset_folder_keys = ["ookla_folder", "dhs_folder", "viirs_folder", "osm_folder", "training_folder"]

# Local destination; the same for every folder, so resolved once.
save_path = prepare_config['save_path']

for key in dataset_folder_keys:
    gcs_download_folder = prepare_config['download_gcs_uri'] + prepare_config[key]
    # Pass gsutil its arguments as a list (no shell) so the configured URI and
    # path are handed over verbatim instead of being re-parsed by a shell.
    # Flags: -m parallel transfer, -n skip files that already exist, -r recurse.
    exit_code = subprocess.call(
        ['gsutil', '-m', 'cp', '-n', '-r', gcs_download_folder, save_path]
    )
    # Surface failures instead of silently discarding the exit status.
    if exit_code != 0:
        print(
            f"WARNING: gsutil exited with code {exit_code} "
            f"while downloading {gcs_download_folder}"
        )

Skipping existing item: file://../data/outputs/ookla_ph/PHGE71FL_cluster_coords.csv
Skipping existing item: file://../data/outputs/ookla_ph/ph_2020_2_avg_d_mbps.csv
Skipping existing item: file://../data/outputs/ookla_ph/ph_2020_2_avg_d_mbps_by_pcode_adm3.geojson
Skipping existing item: file://../data/outputs/ookla_ph/phl_adminboundaries_candidate_adm3_avg_d_mbps.jpeg
Skipping existing item: file://../data/outputs/dhs_ph/PHGE71FL_cluster_coords.csv
Skipping existing item: file://../data/outputs/dhs_ph/PHHR71DT_PHGE71FL_by_cluster.csv
Skipping existing item: file://../data/outputs/dhs_ph/PHHR71DT_PHGE71FL_by_cluster.geojson
Skipping existing item: file://../data/outputs/dhs_ph/PHHR71DT_base.csv
Skipping existing item: file://../data/outputs/dhs_ph/PHHR71DT_raw.csv
Skipping existing item: file://../data/outputs/viirs_ph/ph_2017_viirs_avg_rad_zonal_stats.csv
Skipping existing item: file://../data/outputs/viirs_ph/ph_2017_viirs_avg_rad_zonal_stats.gpkg
Skipping existing item: file://../dat

## Load the datasets

In [71]:
## Filepaths
## hardcoded for now
## TODO: Integrate into config once working
## Each path below is read with pd.read_csv in the processing cell.
## NOTE(review): these paths contain a `ph/` level under ../data/outputs/, while the
## download cell copies the dataset folders directly into prepare_config['save_path']
## ("../data/outputs/" in the config cell) — confirm which layout is current.
# Cluster coordinates table (first frame in the merge).
cluster_coords_labels_filepath = "../data/outputs/ph/dhs_ph/PHGE71FL_cluster_coords.csv"
# Ookla table, read when prepare_config["use_ookla"] is set.
ookla_filepath = '../data/outputs/ph/ookla_ph/ph_2020_2_avg_d_mbps.csv'
# VIIRS table, read when prepare_config["use_viirs"] is set.
viirs_filepath = '../data/outputs/ph/viirs_ph/ph_2017_viirs_avg_rad_zonal_stats.csv'
# OSM table, read when prepare_config["use_osm"] is set.
osm_filepath = '../data/outputs/ph/osm_ph/PHGE71FL_cluster_coords_osm_agg.csv'

## Process datasets

In [72]:
## Config flags
## hardcoded for now
## TODO: Integrate into config once working
## NOTE(review): the processing cell that follows takes its sampling options from
## prepare_config["sample"], ["no_samples"], ["random_sample"] and ["random_seed"],
## not from the names assigned here. `sample` is True here but False in
## prepare_config — confirm which source is intended.

sample=True
no_samples=60
random_sample=False
random_seed=42

In [73]:
# Build the training table: load the cluster table, optionally sample it,
# merge the enabled feature tables onto it by DHSID, drop merge leftovers,
# and write the result to <save_path>/data_labels.csv.
save_path = prepare_config['save_path']

# Load the cluster centroid df (first / left-most frame of the merge)
cluster_centroid_df = pd.read_csv(cluster_coords_labels_filepath)

# sample clusters
if prepare_config["sample"]:
    no_samples = prepare_config["no_samples"]
    seed = prepare_config["random_seed"]
    if prepare_config["random_sample"]:
        # seeded random draw of `no_samples` rows
        cluster_centroid_df = cluster_centroid_df.sample(
            no_samples, random_state=seed
        )
    else:
        # first `no_samples` rows
        cluster_centroid_df = cluster_centroid_df.head(no_samples)

# dataframes to merge; our cluster df goes first
data_frames = [cluster_centroid_df]

# ookla
if prepare_config["use_ookla"]:
    ## NOTE: the filepath is hardcoded (ookla_filepath) to save time.
    ## TODO: rebuild it from the config once the folder structure is settled;
    ## the earlier pattern was f"{country}_{year}_{quarter}_{ookla_feature}.csv"
    ## with ookla_feature = "avg_d_mbps", either joined to save_path (repo dir)
    ## or used bare (guild run output).
    mean_download_by_cluster = pd.read_csv(ookla_filepath)
    data_frames.append(mean_download_by_cluster)

# viirs
if prepare_config["use_viirs"]:
    ## NOTE: the filepath is hardcoded (viirs_filepath) to save time.
    ## TODO: rebuild it from the config once the folder structure is settled;
    ## the earlier pattern was f"{cluster_coords_filename}_gee_agg.csv", either
    ## joined to save_path (repo dir) or used bare (guild run).
    viirs_result = pd.read_csv(viirs_filepath)
    data_frames.append(viirs_result)

# osm
if prepare_config["use_osm"]:
    ## NOTE: the filepath is hardcoded (osm_filepath) to save time.
    ## TODO: rebuild it from the config once the folder structure is settled;
    ## the earlier pattern was f"{cluster_coords_filename}_osm_agg.csv", either
    ## joined to save_path (repo dir) or used bare (guild run).
    osm_result = pd.read_csv(osm_filepath)
    data_frames.append(osm_result)

# Fold all frames into one table keyed on DHSID. The outer join keeps a DHSID
# that appears in any of the frames.
# NOTE(review): when sampling is enabled, an outer join would bring back
# DHSIDs present in the feature tables but not in the sampled cluster df —
# confirm whether how="left" is wanted in that case.
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=["DHSID"], how="outer"),
    data_frames,
)

# filter out redundant columns: those carrying the "_x" / "_y" suffixes that
# pd.merge appends to column names present on both sides. The check is an
# explicit suffix match; a substring test against "xy" would also discard
# columns ending in "_" or "_xy" and columns named exactly "x", "y" or "xy".
# TODO: find edge cases and change comprehension below accordingly
relev_columns = [
    col for col in df_merged.columns if not col.endswith(("_x", "_y"))
]
df_merged = df_merged[relev_columns]

# drop leftover "Unnamed..." columns
unnamed_col_labels = [col for col in df_merged.columns if "Unnamed" in col]
df_merged = df_merged.drop(unnamed_col_labels, axis=1)

# Write the merged table. to_csv keeps its default of writing the index as
# the first, unnamed column.
data_labels_filepath = os.path.join(save_path, "data_labels.csv")
df_merged.to_csv(data_labels_filepath)

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1250 entries, 0 to 1249
Data columns (total 65 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DHSID                   1250 non-null   object 
 1   DHSCC                   1250 non-null   object 
 2   DHSYEAR                 1250 non-null   float64
 3   DHSCLUST                1250 non-null   float64
 4   CCFIPS                  0 non-null      float64
 5   ADM1FIPS                0 non-null      float64
 6   ADM1FIPSNA              0 non-null      float64
 7   ADM1SALBNA              0 non-null      float64
 8   ADM1SALBCO              0 non-null      float64
 9   ADM1DHS                 1250 non-null   float64
 10  ADM1NAME                1250 non-null   object 
 11  DHSREGCO                1250 non-null   float64
 12  DHSREGNA                1250 non-null   object 
 13  SOURCE                  1250 non-null   object 
 14  URBAN_RURA              1250 non-null   

In [76]:
! gsutil cp {data_labels_filepath} {prepare_config['download_gcs_uri'] + 'training_ph/'}

Copying file://../data/outputs/ph/data_labels.csv [Content-Type=text/csv]...
- [1 files][421.9 KiB/421.9 KiB]                                                
Operation completed over 1 objects/421.9 KiB.                                    
