# Preprocessing Data for EC2 Instance

This code is used to preprocess the data on the EC2 Instance. You should only have to run it **ONCE**, the cleaned data should be saved to this folder on the instance: [TODO: insert folder].

Let me know if you have any questions! - R

In [1]:
!pip install python-magic
!pip install torch

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/envs/pytorch_latest_p37/bin/python -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/envs/pytorch_latest_p37/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import sys
from concurrent.futures import ThreadPoolExecutor
import os
import magic

import numpy as np
import pandas as pd
import sklearn
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import torch.nn.functional as F  # useful stateless functions

In [3]:
# dhs_final_labels.csv file in 231nproj
df = pd.read_csv('../../../../231nproj/dhs_final_labels.csv')
df['survey'] = df['DHSID_EA'].str[:10]
df['cc'] = df['DHSID_EA'].str[:2]

In [4]:
data_dir = '../../../../231nproj/data/'
df['path'] = data_dir + df['survey'] + '/' + df['DHSID_EA'] + '.npz'

In [5]:
path_years = df[['DHSID_EA', 'path', 'year']].apply(tuple, axis=1)
df.set_index('DHSID_EA', verify_integrity=True, inplace=True, drop=False) #had to add drop=False to keep column from disappearing  -- R
print(df['path'].iloc[0])
df.info()

../../../../231nproj/data/AL-2008-5#/AL-2008-5#-00000001.npz
<class 'pandas.core.frame.DataFrame'>
Index: 117644 entries, AL-2008-5#-00000001 to ZW-2015-7#-00000400
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   DHSID_EA          117644 non-null  object 
 1   cname             117644 non-null  object 
 2   year              117644 non-null  int64  
 3   lat               117644 non-null  float64
 4   lon               117644 non-null  float64
 5   n_asset           86936 non-null   float64
 6   asset_index       86936 non-null   float64
 7   n_water           87938 non-null   float64
 8   water_index       87938 non-null   float64
 9   n_sanitation      89271 non-null   float64
 10  sanitation_index  89271 non-null   float64
 11  under5_mort       105582 non-null  float64
 12  n_under5_mort     105582 non-null  float64
 13  women_edu         117062 non-null  float64
 14  women_bmi         94866 non-n

In [6]:
def paths_to_X(paths):  # -> (N, C, H, W) model input X
  '''
    Args
    - paths: array (N, 1)
      - path: str, path to npz file containing single entry 'x'
        representing a (C, H, W) image

    Returns: X, input matrix (N, C, H, W)
    '''
  N = len(paths)  # should be 117644
  print(N)
  C, H, W = 8, 255, 255
  
  imgs = []
  for n in range(N):
    npz_path = paths[n][0]
    imgs.append(np.load(npz_path)['x'])  # shape (C, H, W)
    if n % 2000  == 0:
        print('On example', n)
  
  return np.stack(imgs, axis=0)

In [7]:
label = "n_under5_mort"

SPLITS = {
    'train': [
        'AL', 'BD', 'CD', 'CM', 'GH', 'GU', 'HN', 'IA', 'ID', 'JO', 'KE', 'KM',
        'LB', 'LS', 'MA', 'MB', 'MD', 'MM', 'MW', 'MZ', 'NG', 'NI', 'PE', 'PH',
        'SN', 'TG', 'TJ', 'UG', 'ZM', 'ZW'],
    'val': [
        'BF', 'BJ', 'BO', 'CO', 'DR', 'GA', 'GN', 'GY', 'HT', 'NM', 'SL', 'TD',
        'TZ'],
    'test': [
        'AM', 'AO', 'BU', 'CI', 'EG', 'ET', 'KH', 'KY', 'ML', 'NP', 'PK', 'RW',
        'SZ']
}

SPLITS['trainval'] = SPLITS['train'] + SPLITS['val']

#partial splits

SPLITS['train_partial'] = SPLITS['train'][:5]
SPLITS['val_partial'] = SPLITS['train'][:2]
SPLITS['test_partial'] = SPLITS['train'][:2]
SPLITS['trainval_partial'] = SPLITS['train_partial'] + SPLITS['val_partial']


def get_data_split(label, split):
    train_dhsids = df.index[df['cc'].isin(SPLITS[split]) & df[label].notna()]
    
    train_X_paths = df.loc[train_dhsids, 'path'].values.reshape(-1, 1)
    train_X = paths_to_X(train_X_paths)
    train_Y = df.loc[train_dhsids, label].values
    
    # knn.fit(train_X, train_Y)
    # preds = knn.predict(test_X)
    return train_X, train_Y

In [8]:
train_X, train_Y = get_data_split(label, 'train_partial')
print("train_X: ", train_X.shape)
print("train_Y: ", train_Y.shape)
print('Saving data in folder' + str(data_dir) + '/data_clean/train_partial')
np.savez_compressed(str(data_dir) + 'data_clean/train_partial', train_X=train_X, train_Y=train_Y)

val_X, val_Y = get_data_split(label, 'val_partial')
print("val_X: ", val_X.shape)
print("val_Y: ", val_Y.shape)
print('Saving data in folder' + str(data_dir) + '/data_clean/val_partial')
np.savez_compressed(str(data_dir) + 'data_clean/val_partial', val_X=val_X, val_Y=val_Y)

test_X, test_Y = get_data_split(label, 'test_partial')
print("test_X: ", test_X.shape)
print("test_Y: ", test_Y.shape)
print('Saving data in folder' + str(data_dir) + '/data_clean/test_partial')
np.savez_compressed(str(data_dir) + 'data_clean/test_partial', test_X=test_X, test_Y=test_Y)


# train_X, train_Y = get_data_split(label, 'train')
# print("train_X: ", train_X.shape)
# print("train_Y: ", train_Y.shape)
# print('Saving data in folder' + str(data_dir) + '/data_clean/train')
# np.savez_compressed(str(data_dir) + 'data_clean/train', train_X=train_X, train_Y=train_Y)

# val_X, val_Y = get_data_split(label, 'val')
# print("val_X: ", val_X.shape)
# print("val_Y: ", val_Y.shape)
# print('Saving data in folder' + str(data_dir) + '/data_clean/val')
# np.savez_compressed(str(data_dir) + 'data_clean/val', val_X=val_X, val_Y=val_Y)

# test_X, test_Y = get_data_split(label, 'test')
# print("test_X: ", test_X.shape)
# print("test_Y: ", test_Y.shape)
# print('Saving data in folder' + str(data_dir) + '/data_clean/test')
# np.savez_compressed(str(data_dir) + 'data_clean/test', test_X=test_X, test_Y=test_Y)

7377
On example 0
On example 2000
On example 4000
On example 6000


MemoryError: Unable to allocate 14.3 GiB for an array with shape (7377, 8, 255, 255) and data type float32