> exploration of the sample train test data in `test_data/inputs/inputs/tl/prepare_train_test_tl` 


In [1]:
# Reload edited project modules automatically so changes are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
import shutil
from pathlib import Path
import os



In [3]:
import povertymapping.process_train_test as ptrain

In [4]:
import sys

### Configuration 
> Set up a configuration object (which can be a dictionary or any dict-like object)

The train/test configuration assumes a file structure like the following:

* These output files will be created

```
+ 
+save_path +
           + data_final.pkl  # the preprocessed dataset for model training 
           + <dhs_geo_zip_folder>_labels.pkl  # the preprocessed labels for model training
           + <dhs_geo_zip_folder>_features.pkl  # the preprocessed features for model training
```

* These input files are required:

```
+save_path +
           + <dhs_geo_zip_folder>_cluster_coords.csv # the geotagged cluster data 
           + <dhs_geo_zip_folder>_cluster_coords_osm_agg.csv # the aggregated osm data
           + <ntl_path>  # the aggregated night lights data.
               
```

In [5]:
# Settings passed to ptrain.process_train_test; any dict-like object works.
config = {
    "save_path": "../test_data/test_outputs/tl",  # output directory
    "dhs_geo_zip_folder": "TLGE71FL",  # folder holding DHS shape files
    "use_ntl": True,
    "use_filt_clt": False,
    "multiprocess": False,
    # aggregated night lights file; presumably resolved relative to save_path — TODO confirm
    "ntl_path": "TLGE71FL_cluster_coords_gee_agg.csv",
    "sample": False,
    "no_samples": 60,
    "random_sample": False,
    "random_seed": 42,
    # NOTE(review): if this is an EPSG code, 4683 looks like a Philippines datum (PRS92);
    # confirm it is the intended CRS for the Timor-Leste (TL) data.
    "crs": "4683",
    "clust_rad": 2000,
}

# you can also create a yaml file or json file
# and load it in.

Run `process_train_test`, passing your config object

In [6]:
# uncomment and run the following to clear out the preprocessed files 
# !rm -rf {config['save_path']}
# !mkdir -p {config['save_path']}

In [7]:
# Keep the output directory in its own variable for the path-building cells that follow.
save_path = config['save_path']

In [8]:
# shutil.rmtree(save_path, ignore_errors=True)
# #
# os.makedirs(Path(save_path))
# # copy dhs preproc data from output of tl process_dhs_data - TLGE71FL_cluster_coords.csv to save path
# shutil.copy("../data/outputs/dhs_tl/TLGE71FL_cluster_coords.csv",(Path(save_path)/"TLGE71FL_cluster_coords.csv").as_posix())
# shutil.copy("../data/outputs/osm_tl/TLGE71FL_cluster_coords_osm_agg.csv",(Path(save_path)/"TLGE71FL_cluster_coords_osm_agg.csv").as_posix())
# shutil.copy("../data/outputs/ntl_tl/TLGE71FL_cluster_coords_gee_agg.csv",(Path(save_path)/"TLGE71FL_cluster_coords_gee_agg.csv").as_posix())
# shutil.copy("../test_data/inputs/tl/prepare_train_tl/data_labels.csv",(Path(save_path)/"data_labels.csv").as_posix())


In [9]:
# !cp ../data/outputs/dhs_tl/{config['dhs_geo_zip_folder']}_cluster_coords.csv {config['save_path']}/.

In [10]:
%%time
# NOTE(review): the pipeline call below is commented out, so this cell times an empty body and
# the cells that follow rely on a data_final.pkl already present under save_path.
# Presumably uncomment it to regenerate the preprocessed files — confirm before committing.
# ptrain.process_train_test(config)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 7.39 µs


Check that the preprocessed files have been created

In [11]:
from pathlib import Path

In [12]:
# Load the final preprocessed dataset from the output directory.
# NOTE(review): unpickling can execute arbitrary code; only read pickles produced by a trusted run.
final_df = pd.read_pickle(Path(config["save_path"]).joinpath("data_final.pkl"))

In [13]:
# Number of rows in the final dataset.
len(final_df)

455

In [14]:
# Preview the first rows of the final dataset.
final_df.head()

Unnamed: 0.1,DHSID,Wealth Index,sur_refl_b01_min,avg_rad_kurtosis,sur_refl_b02_var,Unnamed: 0,avg_rad_min,sur_refl_b02_mean,sur_refl_b01_mean,sur_refl_b01_var,...,camp_site_count,city_count,convenience_count,supermarket_count,car_repair_count,department_store_count,computer_count,playground_count,monument_count,avg_d_mbps
0,TL201600000001,32166.6,320,1.920641,1447538.0,0,0.0,3488.282958,1675.051447,2825906.0,...,0,0,0,0,0,0,0,0,0,0.0
1,TL201600000002,-34063.923077,280,1.982492,1821108.0,1,-0.062518,3373.07717,1765.601286,3146018.0,...,0,0,0,0,0,0,0,0,0,0.0
2,TL201600000003,39230.590909,316,0.757446,1588555.0,2,0.0,3297.003215,1680.636656,2779579.0,...,0,0,0,0,0,0,0,0,0,0.0
3,TL201600000004,-82140.227273,209,0.531066,2128854.0,3,-0.089588,3774.241158,2036.620579,4333556.0,...,0,0,0,0,0,0,0,0,0,0.0
4,TL201600000005,-56203.423077,220,-1.021963,1713456.0,4,-0.019353,3708.511254,1824.990354,3674980.0,...,0,0,0,0,0,0,0,0,0,0.0


In [15]:
# Load the labels CSV from the output directory.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column, which suggests the CSV was
# written with its index; consider index_col=0 if that column is not needed — confirm first.
data_labels_df = pd.read_csv(Path(save_path).joinpath("data_labels.csv"))

In [17]:
# List the columns available in the labels file.
data_labels_df.columns

Index(['Unnamed: 0', 'DHSCLUST', 'Wealth Index', 'DHSID', 'avg_d_mbps',
       'avg_rad_min', 'avg_rad_max', 'avg_rad_mean', 'avg_rad_median',
       'avg_rad_kurtosis', 'avg_rad_var', 'sur_refl_b01_min',
       'sur_refl_b01_max', 'sur_refl_b01_mean', 'sur_refl_b01_median',
       'sur_refl_b01_kurtosis', 'sur_refl_b01_var', 'sur_refl_b02_min',
       'sur_refl_b02_max', 'sur_refl_b02_mean', 'sur_refl_b02_median',
       'sur_refl_b02_kurtosis', 'sur_refl_b02_var', 'ST_B6_min', 'ST_B6_max',
       'ST_B6_mean', 'ST_B6_median', 'ST_B6_kurtosis', 'ST_B6_var',
       'ST_ATRAN_min', 'ST_ATRAN_max', 'ST_ATRAN_mean', 'ST_ATRAN_median',
       'ST_ATRAN_kurtosis', 'ST_ATRAN_var', 'ST_QA_min', 'ST_QA_max',
       'ST_QA_mean', 'ST_QA_median', 'ST_QA_kurtosis', 'ST_QA_var',
       'longitude', 'latitude', 'building_ct', 'no_roads', 'no_primary_roads',
       'no_trunk_roads', 'marketplace_count', 'charging_station_count',
       'post_box_count', 'post_office_count', 'pharmacy_count',
       