In [None]:
import os
import sys

sys.path.append("../../../")

import getpass
import json
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from povertymapping import dhs, feature_engineering, iso3, nightlights, settings

%reload_ext autoreload
%autoreload 2

## Set Parameters

In [None]:
# Run parameters for the feature engineering steps below.

# Country identifier: names the rollout output folder and is passed to the DHS loader
# as its col_rename_config.
COUNTRY_CODE = "tl"
# Country name handed to generate_features as country_osm.
COUNTRY_OSM = "east-timor"

# Dataset years handed to generate_features.
NIGHTLIGHTS_YEAR = 2016
OOKLA_YEAR = 2019

# Leave as None to derive the rollout date from the notebook folder name later on.
ROLLOUT_DATE = None

# Paths (relative to the data directory, without file extension) of the DHS
# geographic and household files.
DHS_GEO_PREFIX = "dhs/tl/TLGE71FL/TLGE71FL"
DHS_DTA_PREFIX = "dhs/tl/TLHR71DT/TLHR71FL"

In [None]:
# Resolve the project data directory once so every path derived from it is absolute.
DATA_DIR = settings.DATA_DIR.resolve()

# Per-country folder for the rollout artifacts written by this notebook;
# it is created (including missing parents) if it does not exist yet.
ROLLOUT_DIR = DATA_DIR.joinpath(f"rollout/{COUNTRY_CODE}")
ROLLOUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# You need to download these datasets from the DHS website before running this notebook.
# Both paths are built from the resolved DATA_DIR so they stay consistent with ROLLOUT_DIR.
DHS_HOUSEHOLD_DTA_PATH = DATA_DIR / f"{DHS_DTA_PREFIX}.DTA"
DHS_GEOGRAPHIC_SHP_PATH = DATA_DIR / f"{DHS_GEO_PREFIX}.shp"

# Indicate name of column from dhs data that will be used as the label for training
DHS_LABEL_COL = "Wealth Index"
# Name given to the scaled version of the label column in the saved training data.
OUTPUT_LABEL_COL = "Wealth Index - Scaled"

# Tile size for the area around a DHS household cluster to generate features for.
# We set this to 2.4km to match target rollout size of Bing Tile Zoom Level 14 (~2.4km)
TILE_SIZE_KM = 2.4

# For convenience, default the rollout version to the date prefix of the folder that
# contains the current working directory: the first three dash-separated tokens of
# that folder's name (presumably a YYYY-MM-DD-... folder -- confirm against the repo layout).
# Path.cwd().parent.name replaces splitting os.getcwd() on "/" so the lookup does not
# depend on the platform's path separator. An explicitly set ROLLOUT_DATE is kept as is.
if ROLLOUT_DATE is None:
    ROLLOUT_DATE = "-".join(Path.cwd().parent.name.split("-")[:3])

## Set up EOG API Access
The following cell will prompt you to enter your EOG username and password, unless the `EOG_USER` and `EOG_PASSWORD` environment variables are set. See [this page](https://eogdata.mines.edu/products/register/) to learn how to set up your EOG account.

In [None]:
#papermill_description="EOG Token Setup"
# Resolve the EOG credentials: an environment variable wins when it is set,
# otherwise fall back to an interactive prompt.
username = os.environ.get("EOG_USER")
if username is None:
    username = input("Username?")

password = os.environ.get("EOG_PASSWORD")
if password is None:
    # getpass keeps the typed password out of the notebook output
    password = getpass.getpass("Password?")

# save_token=True so that the access token gets stored in ~/.eog_creds/eog_access_token
access_token = nightlights.get_eog_access_token(username, password, save_token=True)

## Load DHS Data

In [None]:
#papermill_description="Generate DHS Cluster level data"
# Build the cluster-level DHS table from the household and geographic files.
# Geometries are converted to TILE_SIZE_KM bounding boxes by the loader.
dhs_gdf = dhs.generate_dhs_cluster_level_data(
    DHS_HOUSEHOLD_DTA_PATH,
    DHS_GEOGRAPHIC_SHP_PATH,
    bbox_size_km=TILE_SIZE_KM,
    convert_geoms_to_bbox=True,
    col_rename_config=COUNTRY_CODE,
)
# Replace whatever index the loader returns with a fresh 0..n-1 index.
dhs_gdf = dhs_gdf.reset_index(drop=True)

In [None]:
# Uncomment to view interactive map
# dhs_gdf.explore()

In [None]:
# Inspect the generated data: preview the first three rows of the cluster-level DHS table.
dhs_gdf.head(3)

## Feature Engineering and Label Pre-processing

If this is your first time running this notebook for this specific country, expect a long runtime for the feature generation cell below, as it will download and cache the required datasets. It will then process the relevant features for each area specified. On subsequent runs, the runtime will be much faster as the data is already stored in your filesystem.

In [None]:
%%time
#papermill_description="Scale label columns"
scaler = MinMaxScaler
country_data = dhs_gdf.copy()

# Create labels dataframe by scaling the specified dhs_label_col (ex. Wealth Index)
labels = scaler().fit_transform(country_data[[DHS_LABEL_COL]])
labels = pd.DataFrame(labels, columns=[OUTPUT_LABEL_COL])

In [None]:
#papermill_description="Create features dataframe using generate_features module"
# Generate the model features for the areas in country_data.
# NOTE(review): features_only=True / scale=False presumably return just the unscaled
# feature columns -- confirm against feature_engineering.generate_features.
features = feature_engineering.generate_features(
    country_data,
    features_only=True,
    scale=False,
    nightlights_year=NIGHTLIGHTS_YEAR,
    ookla_year=OOKLA_YEAR,
    country_osm=COUNTRY_OSM,
)

In [None]:
# Summary statistics of the generated feature columns, as a sanity check before saving.
features.describe()

In [None]:
# Histogram of the scaled label column.
labels.hist()

## Save training data and column metadata

In [None]:
#papermill_description="Write training data"
# Destination of the training table inside the per-country rollout folder.
training_data_path = f"{ROLLOUT_DIR}/{ROLLOUT_DATE}-training-data.csv"

# Place the DHS columns, the generated features and the scaled label side by side
# (joined on the row index), then write the combined table without the index.
data_gdf = pd.concat((dhs_gdf, features, labels), axis="columns")
data_gdf.to_csv(training_data_path, index=False)

In [None]:
#papermill_description="Write training data columns"
# Classify every column of the saved training table as a feature, the label,
# or metadata (anything that is neither a feature nor the label).
feature_cols = features.columns.tolist()

metadata_cols = []
for col in data_gdf.columns:
    if col == OUTPUT_LABEL_COL or col in feature_cols:
        continue
    metadata_cols.append(col)

metadata = dict(
    features=feature_cols,
    label=OUTPUT_LABEL_COL,
    metadata=metadata_cols,
)

# Store the column roles next to the training data as indented JSON.
filepath = f"{ROLLOUT_DIR}/{ROLLOUT_DATE}-training-data-columns.json"
with open(filepath, "w") as file:
    file.write(json.dumps(metadata, indent=4))