## Imports and Setup

In [1]:
# Third-party / stdlib imports used by the notebook.
import os
import pandas as pd

# Put the sibling utils directory at the front of the module search path so the
# project helper modules below can be imported by bare name.
# NOTE(review): the path is relative, so this only works when the kernel's
# working directory is the notebook's own folder — confirm.
import sys
sys.path.insert(0, "../utils/")
import data_utils
import config_utils

# Silence every warning for the rest of the session. Both calls install a
# blanket "ignore" filter, so deprecation and runtime warnings raised by the
# helper modules will not be shown.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Reload imported modules automatically before each cell executes, so edits to
# the helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load Data Files

In [35]:
# Load the Giga school files found under data/schools/giga and write the
# combined result to giga.geojson, then preview three randomly chosen rows.
giga_schools = data_utils.load_files(data_dir='data/schools/giga', out_file='giga.geojson')
giga_schools.sample(3)

Number of CSV files: 39


  0%|          | 0/39 [00:00<?, ?it/s]

Data dimensions: (437362, 9), CRS: EPSG:4326
Generated /home/itingzon.unicef/giga/data/schools/giga.geojson


Unnamed: 0,UID,source,iso,country,region,subregion,name,geometry,giga_id_school
248478,UNICEF-BRA-SCHOOL-00016960,UNICEF,BRA,Brazil,Americas,Latin America and the Caribbean,E M E I TURMA DA MONICA,POINT (-49.90907 -6.08198),f3c15ee4-f01b-3260-abb1-35925d5c1c9a
217392,UNICEF-ZAF-SCHOOL-00024147,UNICEF,ZAF,South Africa,Africa,Sub-Saharan Africa,Fairview Junior Primary School,POINT (27.67765 -32.12339),3f691e4b-5715-3bdb-9469-db3ff8e08e75
266345,UNICEF-BRA-SCHOOL-00036070,UNICEF,BRA,Brazil,Americas,Latin America and the Caribbean,OSMAR DE OLIVEIRA FONTES E E F,POINT (-39.73294 -5.13004),cefd3148-3100-3f7d-843c-c43d531fb619


## Download Overture Maps Data

In [3]:
# Download Overture Maps entries in the "SCHOOL" category for every ISO code
# present in the Giga table, then preview three randomly chosen rows.
overture_schools = data_utils.download_overture(
    giga_schools["iso"].unique(), out_dir="data/schools/overture", category="SCHOOL"
)
overture_schools.sample(3)

  0%|          | 0/39 [00:00<?, ?it/s]

Data dimensions: (61183, 8), CRS: EPSG:4326
Generated /home/itingzon.unicef/giga/data/schools/overture.geojson


Unnamed: 0,UID,source,iso,country,region,subregion,name,geometry
1591,OVERTURE-BIH-SCHOOL-00000004,OVERTURE,BIH,Bosnia and Herzegovina,Europe,Southern Europe,Akademija likovnih umjetnosti Sarajevo / Acade...,POINT (18.41769 43.85603)
53705,OVERTURE-MWI-SCHOOL-00000299,OVERTURE,MWI,Malawi,Africa,Sub-Saharan Africa,ECS Business College,POINT (33.77731 -13.89489)
51923,OVERTURE-BRA-SCHOOL-00003528,OVERTURE,BRA,Brazil,Americas,Latin America and the Caribbean,Apple Developer Academy - Porto Alegre,POINT (-51.17164 -30.05966)


## Download OpenStreetMap Data

In [5]:
# Download OpenStreetMap entries in the "SCHOOL" category for every ISO code
# present in the Giga table, then preview three randomly chosen rows.
osm_schools = data_utils.download_osm(
    giga_schools["iso"].unique(), out_dir="data/schools/osm", category="SCHOOL"
)
osm_schools.sample(3)

  0%|          | 0/39 [00:00<?, ?it/s]

Data dimensions: (40648, 8), CRS: EPSG:4326
Generated /home/itingzon.unicef/giga/data/schools/osm.geojson


Unnamed: 0,UID,source,iso,country,region,subregion,name,geometry
3531,OSM-UKR-SCHOOL-00000763,OSM,UKR,Ukraine,Europe,Eastern Europe,№3,POINT (39.12366 48.39751)
31826,OSM-MWI-SCHOOL-00000453,OSM,MWI,Malawi,Africa,Sub-Saharan Africa,Mtinjitinji School,POINT (35.02087 -14.53988)
14393,OSM-BRA-SCHOOL-00004599,OSM,BRA,Brazil,Americas,Latin America and the Caribbean,CE Espaço Livre,POINT (-40.27130 -19.82297)


In [None]:
# Download OpenStreetMap entries in the "NON_SCHOOL" category for every ISO
# code present in the Giga table.
# NOTE(review): presumably these serve as negative (non-school) examples —
# confirm against data_utils.download_osm.
osm_non_schools = data_utils.download_osm(
    giga_schools.iso.unique(),
    out_dir='data/non_schools/osm',
    category="NON_SCHOOL"
)
# Preview three random rows. This must use the name assigned above; the
# previous spelling `osm_nonschools` (missing underscore) was never defined
# and raised NameError.
osm_non_schools.sample(3)

## Data Cleaning

In [52]:
# Deduplicate the school datasets stored under data/schools and write the
# cleaned result to data/schools/clean, then preview three random rows.
# NOTE(review): buffer_size=50 is presumably the distance threshold used to
# treat nearby points as duplicates; units are not visible here — confirm.
clean_data = data_utils.deduplicate_data(
    data_dir="data/schools", out_dir="data/schools/clean", buffer_size=50
)
clean_data.sample(3)

  0%|          | 0/39 [00:00<?, ?it/s]

Data dimensions: (448897, 9), CRS: EPSG:4326
Generated /home/itingzon.unicef/giga/data/schools/clean.geojson


Unnamed: 0,UID,source,iso,country,region,subregion,name,geometry,giga_id_school
145271,UNICEF-UZB-SCHOOL-00002036,UNICEF,UZB,Uzbekistan,Asia,Central Asia,"12,Besharyk,Ferghana",POINT (70.54675 40.41799),8b390c4c-887d-30c1-b2d8-fd5c4bfe0959
345826,UNICEF-BRA-SCHOOL-00004397,UNICEF,BRA,Brazil,Americas,Latin America and the Caribbean,ESCOLA MUNICIPAL NOSSA SENHORA DE NAZARE,POINT (-58.77146 -4.10128),b2ee84e6-b76a-3d39-9b44-fa2f4a5c9bef
385777,UNICEF-NER-SCHOOL-00018446,UNICEF,NER,Niger,Africa,Sub-Saharan Africa,CEG de Sabon Garin Kan,POINT (6.92689 13.48172),1ae4b2f2-8976-3308-96c2-b6879f33a184


## Data Inspection

In [5]:
# Number of cleaned records per region, most frequent first.
clean_data["region"].value_counts()

region
Africa      232314
Americas    150167
Asia         51636
Europe       14676
Oceania        104
Name: count, dtype: int64

In [6]:
# Number of cleaned records per subregion, most frequent first.
clean_data["subregion"].value_counts()

subregion
Sub-Saharan Africa                 232314
Latin America and the Caribbean    150167
South-eastern Asia                  32423
Central Asia                        18052
Eastern Europe                      12839
Southern Europe                      1837
Eastern Asia                         1161
Micronesia                            104
Name: count, dtype: int64

In [7]:
# Number of cleaned records per country, most frequent first.
clean_data["country"].value_counts()

country
Brazil                              119959
Nigeria                             102916
South Africa                         43406
Thailand                             32423
Niger                                16961
Honduras                             14883
Ghana                                14265
Kenya                                12496
Uzbekistan                           10475
Ukraine                               9892
Zimbabwe                              8769
Malawi                                7708
Kazakhstan                            7577
Benin                                 6770
Sierra Leone                          5799
El Salvador                           5135
Costa Rica                            4785
South Sudan                           4314
Rwanda                                3533
Panama                                3247
Hungary                               2947
Namibia                               2158
Bosnia and Herzegovina                1837
Bot