## Imports and Setup

In [1]:
import os
import pandas as pd
import geopandas as gpd

# Put ../utils/ at the front of the module search path so that the
# `import clean_utils` below can be resolved from that directory.
# NOTE(review): the path is relative, so this presumably depends on the
# kernel's working directory when the cell runs — confirm.
import sys
sys.path.insert(0, "../utils/")
import clean_utils

%load_ext autoreload
%autoreload 2

## Clean Schools Dataset

In [43]:
# Build the deduplicated schools dataset from the sources under
# data/vectors/school via clean_utils.deduplicate_data.
# NOTE(review): buffer_size / matching_names_buffer_size look like distances
# and threshold like a name-similarity cut-off — units and semantics are not
# visible here; confirm against clean_utils.deduplicate_data.
# NOTE(review): the recorded run logged the output as
# .../data/vectors/school/clean.geojson (not .../school/clean/clean.geojson);
# confirm how the helper combines out_dir and out_file.
schools = clean_utils.deduplicate_data(
    data_dir="data/vectors/school",
    out_dir="data/vectors/school/clean",
    out_file="clean.geojson",
    buffer_size=25,
    threshold=85,
    matching_names_buffer_size=150,
)

# Preview three rows. The seed is fixed so that the displayed rows are the
# same on every Restart & Run All instead of changing with each execution.
schools.sample(3, random_state=42)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

Data dimensions: (459942, 9), CRS: EPSG:4326
Generated /home/itingzon.unicef/giga/data/vectors/school/clean.geojson


Unnamed: 0,UID,source,iso,country,region,subregion,name,giga_id_school,geometry
244849,UNICEF-ZAF-SCHOOL-00029021,UNICEF,ZAF,South Africa,Africa,Sub-Saharan Africa,Jacobsdal Landbouskool,ac846541-9410-315b-8459-2079809c025c,POINT (24.74843 -29.12195)
440564,OVERTURE-ZWE-SCHOOL-00000016,OVERTURE,ZWE,Zimbabwe,Africa,Sub-Saharan Africa,Tutorial Home Schools - SA & ZW,,POINT (31.11224 -17.76360)
440407,UNICEF-ZWE-SCHOOL-00003917,UNICEF,ZWE,Zimbabwe,Africa,Sub-Saharan Africa,MADHODHA,8c105f42-f7d5-3767-af3a-f79502e0176c,POINT (30.09549 -18.50988)


## Clean Non-schools Dataset

In [50]:
# Stage 1: run the non-school filter against the cleaned schools file
# (buffer_size=150 is passed to the helper; presumably a vicinity radius —
# confirm units in clean_utils.filter_nonschools_within_school_vicinity).
# The result is kept under its own stage-specific name rather than being
# assigned to `nonschools` and immediately overwritten; note that stage 2
# reads its input from disk, not from this variable.
# NOTE(review): out_dir + out_file would naively join to
# .../non_school/filtered/filtered.geojson, while stage 2 reads
# .../non_school/filtered.geojson — confirm where the helper actually writes.
nonschools_filtered = clean_utils.filter_nonschools_within_school_vicinity(
    school_data_file="data/vectors/school/clean.geojson",
    nonschool_data_dir="data/vectors/non_school/",
    out_dir="data/vectors/non_school/filtered/",
    out_file="filtered.geojson",
    buffer_size=150,
)

# Stage 2: deduplicate the filtered non-school file.
# NOTE(review): unlike the schools call, no `threshold` (or `out_file`) is
# passed here, so the helper's defaults apply — confirm that is intended.
nonschools = clean_utils.deduplicate_data(
    data_file="data/vectors/non_school/filtered.geojson",
    out_dir="data/vectors/non_school/clean",
    buffer_size=25,
    matching_names_buffer_size=150,
)

# Preview three rows. The seed is fixed so that the displayed rows are the
# same on every Restart & Run All instead of changing with each execution.
nonschools.sample(3, random_state=42)

  0%|          | 0/39 [00:00<?, ?it/s]

Data dimensions: (529323, 8), CRS: EPSG:4326
Generated /home/itingzon.unicef/giga/data/vectors/non_school/clean.geojson


Unnamed: 0,UID,source,iso,country,region,subregion,name,geometry
492220,OSM-HUN-NON_SCHOOL-00011409,OSM,HUN,Hungary,Europe,Eastern Europe,Gesztenyés vendéglő,POINT (18.42335 46.15374)
144223,OSM-THA-NON_SCHOOL-00040517,OSM,THA,Thailand,Asia,South-eastern Asia,,POINT (101.13157 16.87428)
332645,OSM-BRA-NON_SCHOOL-00054556,OSM,BRA,Brazil,Americas,Latin America and the Caribbean,Auditório de Ambiental,POINT (-48.43676 -1.45811)


## Data Inspection

In [51]:
# Row count of the deduplicated schools frame per value of `region`.
schools["region"].value_counts()

region
Africa      233783
Americas    158710
Asia         52459
Europe       14882
Oceania        108
Name: count, dtype: int64

In [52]:
# Row count of the deduplicated schools frame per value of `subregion`.
schools["subregion"].value_counts()

subregion
Sub-Saharan Africa                 233783
Latin America and the Caribbean    158710
South-eastern Asia                  33305
Central Asia                        17943
Eastern Europe                      13026
Southern Europe                      1856
Eastern Asia                         1211
Micronesia                            108
Name: count, dtype: int64

In [53]:
# Row count of the deduplicated schools frame per value of `country`.
schools["country"].value_counts()

country
Brazil                              127346
Nigeria                             102725
South Africa                         43861
Thailand                             33305
Niger                                17123
Honduras                             15738
Ghana                                14829
Kenya                                13036
Uzbekistan                           10297
Ukraine                               9992
Zimbabwe                              8768
Kazakhstan                            7646
Malawi                                7370
Benin                                 6865
Sierra Leone                          5763
El Salvador                           5303
Costa Rica                            4835
South Sudan                           4533
Rwanda                                3556
Panama                                3293
Hungary                               3034
Namibia                               2141
Bosnia and Herzegovina                1856
Gui

In [54]:
# Row count of the deduplicated non-schools frame per value of `country`.
nonschools["country"].value_counts()

country
Brazil                              242638
Thailand                             79895
Ukraine                              50034
Hungary                              27088
Nigeria                              19733
Kazakhstan                           14123
Kenya                                11790
Uzbekistan                           11520
Costa Rica                           10565
Ghana                                10165
South Africa                          7122
Bosnia and Herzegovina                6408
El Salvador                           3992
Honduras                              3989
Mongolia                              3634
Panama                                3238
Trinidad and Tobago                   3078
Benin                                 2758
Zimbabwe                              2725
Guinea                                1984
Botswana                              1953
Niger                                 1713
Malawi                                1522
Nam