In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Mount Google Drive at /content/drive; the Excel files read and written below
# live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


# Generate train/test/validation split

In this notebook we generate the train/validation/test splits using stratified sampling, which ensures that every split has the same rural/urban distribution.

In [None]:
# Target fraction of the examples assigned to each split.
# These are kept in sync with the 80/10/10 split that the cells below actually
# produce and verify; the three fractions must sum to 1.
train_split = 0.8
val_split = 0.1
test_split = 0.1

In [None]:
# Load the node metadata spreadsheet from the mounted Drive and preview its first rows.
metadata_path = '/content/drive/MyDrive/UNICEF & NYU Giga initiative - data sharing/data/metadata_corrected_nodeIds.xlsx'
node_ids = pd.read_excel(metadata_path)
node_ids.head()

Unnamed: 0,Filtered,Country,OSM Node ID,Latitude,Longitude,Urban/Rural,Owner,Tagged \nYes/No
0,Yes,CN,1550575000.0,,,,,
1,Yes,CN,1550575000.0,34.952279,106.632764,rural,Alejandro,
2,Yes,CN,1608952000.0,34.953372,106.631477,rural,Alejandro,
3,Yes,CN,1668499000.0,35.9095,107.627963,urban,Alejandro,
4,Yes,CN,2207437000.0,38.07599,114.250362,rural,Alejandro,


In [None]:
# Keep only the rows whose 'Filtered' column is 'Yes', then display how many remain.
node_ids_filtered = node_ids.loc[node_ids['Filtered'].eq('Yes')]
len(node_ids_filtered)

3005

In [None]:
# Drop rows with no 'Urban/Rural' value (the column used for stratification
# further down) and record how many examples are left.
X = node_ids_filtered.dropna(axis='index', how='any', subset=['Urban/Rural'])
n_examples = X.shape[0]
print(n_examples)

3004


In [None]:
# Per-row urban/rural labels, passed as the `stratify` argument of the first split.
strat = X.loc[:, 'Urban/Rural'].to_numpy()

In [None]:
# Two-stage stratified split on the 'Urban/Rural' label.
#
# Stage 1: hold out 10% of all examples as the test set.
# Only X is passed: train_test_split derives the split indices from the number
# of samples, `random_state` and `stratify`, so a dummy target array is not needed.
X_trainval, X_test = train_test_split(
    X, test_size=0.1, random_state=1, stratify=strat)

# Stage 2: carve the validation set out of the remaining 90%.
# test_size=0.11 of that remainder is ~10% of the full dataset (0.11 * 0.9 ~= 0.099).
# The stage-1 result is kept under its own name (X_trainval) so that re-running
# this cell always starts from the same frame instead of shrinking X_train again.
strat_train = X_trainval['Urban/Rural'].to_numpy()
X_train, X_val = train_test_split(
    X_trainval, test_size=0.11, random_state=1, stratify=strat_train)

In [None]:
# Urban/rural proportions in the training split; the validation and test
# splits below should show (nearly) the same proportions.
X_train['Urban/Rural'].value_counts().div(len(X_train))

rural    0.706029
urban    0.293971
Name: Urban/Rural, dtype: float64

In [None]:
# Urban/rural proportions in the validation split.
X_val['Urban/Rural'].value_counts().div(len(X_val))

rural    0.704698
urban    0.295302
Name: Urban/Rural, dtype: float64

In [None]:
# Urban/rural proportions in the test split.
X_test['Urban/Rural'].value_counts().div(len(X_test))

rural    0.704319
urban    0.295681
Name: Urban/Rural, dtype: float64

In [None]:
# Making sure we end up with an 80/10/10% split:
# print the training split's share of all examples, then its row count.
print(len(X_train) / len(X), len(X_train), sep='\n')

0.8005992010652463
2405


In [None]:
# Validation split: share of all examples, then row count.
print(len(X_val) / len(X), len(X_val), sep='\n')

0.09920106524633822
298


In [None]:
# Test split: share of all examples, then row count.
print(len(X_test) / len(X), len(X_test), sep='\n')

0.10019973368841545
301


In [None]:
# Collect the distinct 'OSM Node ID' values of each split so the splits can be
# compared with set operations.
train_ids, test_ids, val_ids = (
    set(split_frame['OSM Node ID'].to_numpy())
    for split_frame in (X_train, X_test, X_val)
)

In [None]:
# Sanity checks: no node ID may appear in more than one split, and the distinct
# IDs across the three splits must add up to the number of examples.
assert train_ids.isdisjoint(test_ids), 'Intersection between train and test'
assert train_ids.isdisjoint(val_ids), 'Intersection between train and val'
assert val_ids.isdisjoint(test_ids), 'Intersection between val and test'
assert sum(map(len, (train_ids, test_ids, val_ids))) == n_examples, 'Missing examples'

In [None]:
# Tag every row with the name of the split it belongs to.
# .assign builds a new frame instead of writing into the frames handed back by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps this
# cell safe to re-run.
train_test_val_col = 'Train/Test/Val'
X_train = X_train.assign(**{train_test_val_col: 'train'})
X_test = X_test.assign(**{train_test_val_col: 'test'})
X_val = X_val.assign(**{train_test_val_col: 'val'})

In [None]:
# Stack the three splits (train, then test, then val) back into a single table.
node_ids_split = pd.concat(objs=[X_train, X_test, X_val], axis=0)

In [None]:
# Write the combined table to the shared Drive folder.
splits_path = '/content/drive/MyDrive/UNICEF & NYU Giga initiative - data sharing/data/metadata_splits.xlsx'
node_ids_split.to_excel(splits_path)