# Dataset and DataLoaders for SustainBench Data

Links I referenced:

*   https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
*   https://pytorch.org/docs/stable/data.html
*   https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
*   https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel


**PLEASE NOTE:** You will need to change the datasets (train_dataset, val_dataset) to use the corresponding splits ('train', 'val', etc.) below. Currently all datasets use the 'partial' version of the splits.

**NOTE:** The "check_accuracy" function only uses accuracy, not the r^2 score.

Right now, it is taking a LONG time to run.

In [1]:
# This mounts your Google Drive to the Colab VM at /content/drive, which is the
# prefix every dataset path in the cells below is built on.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Enter the foldername for the data set (added shortcut to 230/231N folder).
# The value is interpolated into `dataset_path` in a later cell.
FOLDERNAME = 'Shareddrives/CS 230 231N/public_datasets'
# Guard against FOLDERNAME being left unset (None).
assert FOLDERNAME is not None, "[!] Enter the foldername."

In [3]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
# Drive location of the folder named by FOLDERNAME.
# NOTE(review): no later cell in this notebook reads `dataset_path`, and it
# resolves under 'My Drive' while the other paths use
# '/content/drive/Shareddrives' directly -- confirm it is still needed.
dataset_path = '/content/drive/My Drive/{}'.format(FOLDERNAME)
# sys.path.append(dataset_path)

In [5]:
from concurrent.futures import ThreadPoolExecutor
import os

import numpy as np
import pandas as pd
import sklearn
from tqdm.auto import tqdm

In [6]:
# Root of the SustainBench dhs_lsms preprocessing folder; the label CSV is read
# from 'output_labels/dhs_final_labels.csv' beneath it when the datasets are built.
dataset_root_dir = '/content/drive/Shareddrives/CS 230 231N/sustainbench-main/sustainbench-main/dataset_preprocessing/dhs_lsms'

In [7]:
# Presumably the label columns available in the DHS label CSV -- TODO confirm.
# NOTE(review): not referenced by any later cell shown here; the datasets below
# are built on 'n_under5_mort' instead.
label_cols = ['asset_index', 'under5_mort', 'women_bmi', 'women_edu', 'water_index', 'sanitation_index']

In [8]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import torch.nn.functional as F  # useful stateless functions
import scipy

In [9]:
# Global run configuration read by the training and evaluation functions.
USE_GPU = True
dtype = torch.float32  # model inputs are cast to this type before the forward pass

# Run on CUDA only when it is both requested and actually available.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Number of iterations between train-loss printouts.
print_every = 100
print('using device:', device)

using device: cuda


In [10]:
# Target column name.
# NOTE(review): the dataset cell passes the same string literally instead of
# reading this variable.
label = "n_under5_mort"

# Two-letter country codes assigned to each split; datasets keep only rows whose
# DHSID_EA prefix appears in the chosen list.
SPLITS = dict(
    train=[
        'AL', 'BD', 'CD', 'CM', 'GH', 'GU', 'HN', 'IA', 'ID', 'JO',
        'KE', 'KM', 'LB', 'LS', 'MA', 'MB', 'MD', 'MM', 'MW', 'MZ',
        'NG', 'NI', 'PE', 'PH', 'SN', 'TG', 'TJ', 'UG', 'ZM', 'ZW',
    ],
    val=[
        'BF', 'BJ', 'BO', 'CO', 'DR', 'GA', 'GN', 'GY', 'HT', 'NM',
        'SL', 'TD', 'TZ',
    ],
    test=[
        'AM', 'AO', 'BU', 'CI', 'EG', 'ET', 'KH', 'KY', 'ML', 'NP',
        'PK', 'RW', 'SZ',
    ],
)
SPLITS['trainval'] = [*SPLITS['train'], *SPLITS['val']]

# Partial splits: only the first two countries of each split, for quick runs.
SPLITS.update({
    '{}_partial'.format(name): SPLITS[name][:2]
    for name in ('train', 'val', 'test')
})
SPLITS['trainval_partial'] = SPLITS['train_partial'] + SPLITS['val_partial']

In [11]:
class SustainBenchDataset(Dataset):
    """Map-style dataset pairing per-cluster image archives with one label column.

    Each item is the array stored under key 'x' of the archive at
    ``img_dir + DHSID_EA[:10] + '/' + DHSID_EA + file_ext``, together with the
    value of ``category`` for that row of the annotations CSV.

    Args:
        annotations_file: Path of the label CSV; must contain a 'DHSID_EA'
            column and the ``category`` column.
        img_dir: Prefix of the image archives. It is concatenated as-is, so it
            must end with a path separator.
        file_ext: Extension of the archives (e.g. '.npz'). The files are opened
            as archives with an 'x' entry.
        split: Key into the module-level ``SPLITS`` dict; only rows whose
            two-letter DHSID_EA prefix is listed there are kept.
        category: Name of the label column. Rows where it is NaN are dropped.
        bands: Optional index (e.g. a list of ints) applied to the first axis
            of the loaded array; ``None`` keeps every band.
        transform: Optional callable applied to the image array.
        target_transform: Optional callable applied to the label.

    Raises:
        KeyError: If ``split`` is not in ``SPLITS`` or a required column is missing.
        ValueError: If the selected rows contain duplicate 'DHSID_EA' values.
    """

    def __init__(self, annotations_file, img_dir, file_ext, split, category, bands=None, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.split = split
        self.bands = bands
        self.category = category
        self.transform = transform
        self.target_transform = target_transform

        # Derive survey id, country code and on-disk path from the cluster id.
        cluster_ids = self.img_labels['DHSID_EA']
        self.img_labels['survey'] = cluster_ids.str[:10]
        self.img_labels['cc'] = cluster_ids.str[:2]
        self.img_labels['path'] = img_dir + self.img_labels['survey'] + '/' + cluster_ids + file_ext

        # Only include the split's countries, and only rows with a non-NaN label.
        in_split = self.img_labels['cc'].isin(SPLITS[split])
        has_label = self.img_labels[category].notna()
        # drop=False keeps 'DHSID_EA' available as a column as well as the index;
        # verify_integrity rejects duplicate cluster ids.
        self.df_split = self.img_labels[in_split & has_label].set_index(
            'DHSID_EA', verify_integrity=True, drop=False)

    def __len__(self):
        """Number of rows kept for this split."""
        return len(self.df_split)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at integer position ``idx``."""
        df_row = self.df_split.iloc[idx]

        # Read the archive exactly once and close it promptly; the original code
        # re-read the file from disk whenever `bands` was set.
        with np.load(df_row['path']) as archive:
            image = archive['x']

        # Reduce to the requested bands/channels if needed.
        if self.bands is not None:
            image = image[self.bands, :, :]

        label = df_row[self.category]

        # Apply transforms if needed.
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)

        return image, label

In [12]:
# Build one dataset per split. All three share the label CSV, image folder,
# target column and file type; only the split name differs.
# TODO: CHANGE 'val_partial' TO VAL and 'test_partial' TO TEST (and likewise
# 'train_partial') once the full splits should be used.
train_dataset, val_dataset, test_dataset = (
    SustainBenchDataset(
        annotations_file=os.path.join(dataset_root_dir, 'output_labels/dhs_final_labels.csv'),
        img_dir='/content/drive/Shareddrives/CS 230 231N/dhs_datasets/',
        category='n_under5_mort',
        file_ext='.npz',
        split=split_name,
        # bands = [0, 1, 2]
        # transform=ToTensor()
    )
    for split_name in ('train_partial', 'val_partial', 'test_partial')
)






In [13]:
# We set up a Dataset object for each split (train / val / test); Datasets load
# training examples one at a time, so we wrap each Dataset in a DataLoader which
# iterates through the Dataset and forms minibatches. The train / val / test
# membership is decided per country by the SPLITS lists above.

#TODO: use num_workers=<insert number> to help speed up the process
# shuffle=True for training only: without it every epoch replays the rows in the
# CSV's fixed order. Validation and test keep the default order so that
# evaluation is deterministic.
loader_train = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)

loader_val = DataLoader(val_dataset, batch_size=64, num_workers=2)

loader_test = DataLoader(test_dataset, batch_size=64, num_workers=2)

In [14]:
def check_accuracy_part34(loader, model, val_or_test="val"):
    """Evaluate `model` on `loader` and report accuracy and squared Pearson r.

    The predicted class is the argmax over the model's scores; labels are cast
    to integers before comparison.

    Inputs:
    - loader: Iterable yielding (x, y) minibatches.
    - model: A PyTorch Module; it is switched to evaluation mode.
    - val_or_test: "val" prints the validation banner, anything else the test banner.

    Returns: A tuple (acc, r2).
    - acc: Fraction of samples classified correctly (0.0 if the loader is empty).
    - r2: Squared Pearson correlation between predicted class indices and
      labels. It is 0.0 when the correlation is undefined (fewer than two
      samples, or constant predictions or labels).
    """
    if val_or_test == "val":
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')
    model.eval()  # set model to evaluation mode

    batch_preds = []
    batch_labels = []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            scores = model(x).cpu().numpy()
            batch_preds.append(np.argmax(scores, axis=1))
            # Labels are only compared on the CPU, so they never need to visit the device.
            batch_labels.append(y.to(dtype=torch.long).cpu().numpy())

    if batch_preds:
        preds = np.concatenate(batch_preds, axis=0)
        labels = np.concatenate(batch_labels, axis=0)
    else:
        preds = np.empty(0, dtype=np.int64)
        labels = np.empty(0, dtype=np.int64)

    num_samples = preds.shape[0]
    num_correct = int((preds == labels).sum())
    # An empty loader would otherwise raise ZeroDivisionError.
    acc = float(num_correct) / num_samples if num_samples else 0.0

    # Pearson r is undefined for constant inputs; report 0.0 rather than NaN.
    r2 = 0.0
    if num_samples > 1 and preds.std() > 0 and labels.std() > 0:
        r2 = float(np.corrcoef(preds, labels)[0, 1] ** 2)

    print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc, r2

In [15]:
def train_part34(model, optimizer, epochs=1):
    """
    Train a model on the global `loader_train` using the PyTorch Module API.

    Each minibatch is scored with the model and optimised against a
    cross-entropy loss on the integer-cast labels. Every `print_every`
    iterations the current loss is printed and the model is evaluated on
    `loader_val`.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            # Re-enter training mode on every step: the periodic validation
            # pass below leaves the model in evaluation mode.
            model.train()
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if t % print_every == 0:
                # Include the epoch: `t` restarts at 0 each epoch, so the
                # iteration number alone cannot tell the log lines apart.
                print('Epoch %d, Iteration %d, loss = %.4f' % (e, t, loss.item()))
                check_accuracy_part34(loader_val, model)
                print()

In [16]:
def flatten(x):
    """Collapse every dimension after the first, giving one row per batch element."""
    batch_size = x.size(0)
    return x.view(batch_size, -1)


class Flatten(nn.Module):
    """Module form of `flatten`, so it can be stacked as a layer in nn.Sequential."""

    def forward(self, x):
        return flatten(x)

In [17]:
# Bookkeeping for the hyperparameter sweep: best validation score seen so far
# and the learning rate that produced it.
best_val = 0
best_lr = None

# Placeholders; both are assigned for every configuration inside the sweep loop.
model = None
optimizer = None

# Channel widths of the conv stack: channel_0 is the input channel count of the
# first conv layer, channel_1..channel_3 are the outputs of the three conv layers.
channel_0 = 8
channel_1 = 64
channel_2 = 64
channel_3 = 32
# Widths of the two hidden fully connected layers.
hidden_layer_size_1 = 128
hidden_layer_size_2 = 32
# Grid searched by the sweep: every drop probability is paired with every learning rate.
learning_rates = [1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
drop_probs = [0, 0.2, 0.5, 0.8]

In [None]:
# Grid search over dropout probability and learning rate. The configuration with
# the highest validation accuracy is kept in best_model / best_lr /
# best_drop_prob, and best_val tracks its score.
#
# The original selection tested `r2 > best_val` and never updated best_val.
# Because the r2 returned by check_accuracy_part34 was a constant 0, best_model
# was never bound and the final test cell failed with a NameError.
best_model = None
best_drop_prob = None

for drop_prob in drop_probs:
    for learning_rate in learning_rates:
        model = nn.Sequential(
            nn.Conv2d(channel_0, channel_1, (3, 3), padding="same"),
            nn.ReLU(),
            nn.MaxPool2d((2, 2), stride=2),  # halves H and W (rounding down)
            nn.Dropout2d(drop_prob),
            nn.Conv2d(channel_1, channel_2, (3, 3), padding="same"),
            nn.ReLU(),
            nn.MaxPool2d((2, 2), stride=2),  # halves H and W again
            nn.Dropout2d(drop_prob),
            nn.Conv2d(channel_2, channel_3, (3, 3), padding="same"),
            nn.ReLU(),
            nn.MaxPool2d((2, 2), stride=2),  # halves H and W again
            nn.BatchNorm2d(num_features=channel_3),
            Flatten(),
            # 30752 = 32 channels * 31 * 31, i.e. this assumes a 31x31 feature map
            # after the three poolings (inputs of roughly 255x255) -- TODO confirm.
            nn.Linear(30752, hidden_layer_size_1),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(hidden_layer_size_1, hidden_layer_size_2),
            nn.ReLU(),
            # 167 output classes; presumably max label value + 1 -- TODO confirm.
            nn.Linear(hidden_layer_size_2, 167),
        )
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

        print('LEARNING RATE:', learning_rate, 'DROP PROB:', drop_prob)
        train_part34(model, optimizer, epochs=5)
        val_acc, r2 = check_accuracy_part34(loader_val, model, "val")

        # Always accept the first configuration so best_model is defined even
        # when every validation accuracy is 0; afterwards require a strict
        # improvement.
        if best_model is None or val_acc > best_val:
            best_val = val_acc
            best_model = model
            best_lr = learning_rate
            best_drop_prob = drop_prob

LEARNING RATE: 0.001 DROP PROB: 0
Iteration 0, loss = 5.1311
Checking accuracy on validation set
Got 3 / 3183 correct (0.09)

Iteration 0, loss = 4.3544
Checking accuracy on validation set
Got 91 / 3183 correct (2.86)

Iteration 0, loss = 3.8911
Checking accuracy on validation set
Got 91 / 3183 correct (2.86)

Iteration 0, loss = 3.8886
Checking accuracy on validation set
Got 91 / 3183 correct (2.86)

Iteration 0, loss = 3.9063
Checking accuracy on validation set
Got 92 / 3183 correct (2.89)

Checking accuracy on validation set
Got 92 / 3183 correct (2.89)
LEARNING RATE: 0.0001 DROP PROB: 0
Iteration 0, loss = 5.1620
Checking accuracy on validation set
Got 75 / 3183 correct (2.36)

Iteration 0, loss = 5.0621
Checking accuracy on validation set
Got 122 / 3183 correct (3.83)

Iteration 0, loss = 5.0349
Checking accuracy on validation set
Got 122 / 3183 correct (3.83)

Iteration 0, loss = 4.9424
Checking accuracy on validation set
Got 122 / 3183 correct (3.83)

Iteration 0, loss = 4.8579


In [None]:
# Final evaluation of the selected model on the test loader.
# NOTE(review): `best_model` is only assigned inside the sweep cell above; this
# call fails if the sweep has not selected a model.
check_accuracy_part34(loader_test, best_model, "test")