In [2]:
import sys
from concurrent.futures import ThreadPoolExecutor
import os
import magic

import numpy as np
import pandas as pd
import sklearn
from tqdm.auto import tqdm
import scipy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import torch.nn.functional as F  # useful stateless functions

In [3]:
# Note: when testing on your own may have to change this to reset your python path
# import get_partial_data

data_clean_dir = '../../../../231nproj/data_clean/' 

def load_data(data):
# parameter data is 'all', 'partial', or 'random'. 
# 'all' and 'partial' loads the respective .npz stored files, 'random' randomly generates new partial data.
    print('loading data...')

    if data == 'all':
        # load all data
        train_data = np.load(data_clean_dir + 'train.npz', allow_pickle=True)
        train_X, train_Y = train_data['train_X'], train_data['train_Y']
        print("train_X: ", train_X.shape)
        print("train_Y: ", train_Y.shape)

        val_data = np.load(data_clean_dir + 'val.npz', allow_pickle=True)
        val_X, val_Y = val_data['val_X'], val_data['val_Y']
        print("val_X: ", val_X.shape)
        print("val_Y: ", val_Y.shape)

        trainval_X, trainval_Y = np.concatenate((train_data['train_X'], val_data['val_X']), axis=0), np.concatenate((train_data['train_Y'], val_data['val_Y']), axis=0)
        print("trainval_X: ", trainval_X.shape)
        print("trainval_Y: ", trainval_Y.shape)

        train_data = np.load(data_clean_dir + 'test.npz', allow_pickle=True)
        test_X, test_Y = test_data['test_X'], test_data['test_X'], 
        print("test_X: ", test_X.shape)
        print("test_Y: ", test_Y.shape)
    elif data == 'partial':
        train_data = np.load(data_clean_dir + 'train_partial.npz', allow_pickle=True)
        train_X, train_Y = train_data['train_X'], train_data['train_Y']
        print("train_X: ", train_X.shape)
        print("train_Y: ", train_Y.shape)

        val_data = np.load(data_clean_dir + 'val_partial.npz', allow_pickle=True)
        val_X, val_Y = val_data['val_X'], val_data['val_Y']
        print("val_X: ", val_X.shape)
        print("val_Y: ", val_Y.shape)

        trainval_X, trainval_Y = np.concatenate((train_data['train_X'], val_data['val_X']), axis=0), np.concatenate((train_data['train_Y'], val_data['val_Y']), axis=0)
        print("trainval_X: ", trainval_X.shape)
        print("trainval_Y: ", trainval_Y.shape)

        test_data = np.load(data_clean_dir + 'test_partial.npz', allow_pickle=True)
        test_X, test_Y = test_data['test_X'], test_data['test_X'], 
        print("test_X: ", test_X.shape)
        print("test_Y: ", test_Y.shape)
    else: 
        print('generating random data...')
        label = "n_under5_mort" 
        train_X, train_Y = get_partial_data.get_data_split(label, 'train', 0.005)
        print("train_X: ", train_X.shape)
        print("train_Y: ", train_Y.shape)

        val_X, val_Y = get_partial_data.get_data_split(label, 'val', 0.005)
        print("val_X: ", val_X.shape)
        print("val_Y: ", val_Y.shape)

        test_X, test_Y = get_partial_data.get_data_split(label, 'test', 0.005)
        print("test_X: ", test_X.shape)
        print("test_Y: ", test_Y.shape)
    return torch.from_numpy(train_X), torch.from_numpy(train_Y), torch.from_numpy(val_X), torch.from_numpy(val_Y), torch.from_numpy(test_X), torch.from_numpy(test_Y)

In [4]:
train_X, train_Y, val_X, val_Y, test_X, test_Y = load_data(data='partial')  #used to be random
print('finished loading data.')

USE_GPU = True
dtype = torch.float32 

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss.
print_every = 100
print('using device:', device)

loading data...
train_X:  (514, 8, 255, 255)
train_Y:  (514,)
val_X:  (1614, 8, 255, 255)
val_Y:  (1614,)
trainval_X:  (2128, 8, 255, 255)
trainval_Y:  (2128,)
test_X:  (381, 8, 255, 255)
test_Y:  (381, 8, 255, 255)
finished loading data.
using device: cuda


In [5]:
# function adapted from PyTorch.ipynb from CS 231N Assignment 2
def check_accuracy_part34(X, Y, model, val_or_test):
    if val_or_test == "val":
        print('Checking accuracy on validation set')
    elif val_or_test == "test":
        print('Checking accuracy on test set')

    batch_size = 32
    num_batches = Y.shape[0] // batch_size   
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    
    all_preds = []
    with torch.no_grad():
        model = model.to(device=device)  # move the model parameters to CPU/GPU
        for t in range(num_batches):
          x = X[t*batch_size:(t+1)*batch_size, :, :, :]
          y = Y[t*batch_size:(t+1)*batch_size]
          x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
          y = y.to(device=device, dtype=torch.long)
          scores = model(x).cpu().numpy()
          #print(scores.shape)
          preds = np.argmax(scores, axis=1)
          # throwing error for last cell, calling (preds == y.cpu().numpy()) a bool
          num_correct += (preds == y.cpu().numpy()).sum()
          num_samples += preds.shape[0]
          
          # for r^2
          all_preds.append(preds)
        all_preds = np.concatenate(all_preds, axis=0)
        print('preds:', all_preds[:10], 'actual:', Y.cpu().numpy()[:10])
        r2, _ = scipy.stats.pearsonr(all_preds, Y.cpu().numpy()[:all_preds.shape[0]])
        r2 = r2 ** 2
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc), ' and an r^2 value of', r2)
    return acc, r2

In [6]:
# function adapted from PyTorch.ipynb from CS 231N Assignment 2
def train_part34(model, optimizer, val_or_test, epochs=1):
    """
    Train a model using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    
    batch_size = 32
    if val_or_test == "val":
      X = train_X
      Y = train_Y
    elif val_or_test == "test":
      X = trainval_X
      Y = trainval_Y
    num_batches = Y.shape[0] // batch_size

    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
      # make minibatches x, y from X, Y
        for t in range(num_batches):
          x = X[t*num_batches:(t+1)*num_batches, :, :, :]
          y = Y[t*num_batches:(t+1)*num_batches]
          model.train()  # put model to training mode
          x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
          y = y.to(device=device, dtype=torch.long)
        
          scores = model(x)
          y_one_hots = torch.zeros_like(scores)
          y_one_hots[np.arange(y.size(dim=0)),y] = 1
          # print('scores:', scores, 'y:' , y_one_hots)
          loss = F.cross_entropy(scores, y_one_hots)

          # Zero out all of the gradients for the variables which the optimizer
          # will update.
          optimizer.zero_grad()

          # This is the backwards pass: compute the gradient of the loss with
          # respect to each  parameter of the model.
          loss.backward()

          # Actually update the parameters of the model using the gradients
          # computed by the backwards pass.
          optimizer.step()

          if t % print_every == 0:
              print('Iteration %d, loss = %.4f' % (t, loss.item()))
              check_accuracy_part34(X, Y, model, "val")
              print()


def flatten(x):
    N = x.shape[0] # read in N, C, H, W
    return x.view(N, -1)  # "flatten" the C * H * W values into a single vector per image



In [7]:
# We need to wrap `flatten` function in a module in order to stack it
# in nn.Sequential
class Flatten(nn.Module):
    def forward(self, x):
        return flatten(x)

In [11]:
best_val = 0
best_lr = None

model = None
optimizer = None

channel_0 = 8
channel_1 = 64
channel_2 = 64
channel_3 = 32
hidden_layer_size_1 = 128
hidden_layer_size_2 = 32
learning_rates = [1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
drop_probs = [0, 0.2, 0.5, 0.8]

for drop_prob in drop_probs:
    for learning_rate in learning_rates:
        model = nn.Sequential(  # input is N, 8, 255, 255
            nn.Conv2d(channel_0, channel_1, 3, padding="same"),  # -> N, channel_1, 255, 255
            nn.ReLU(),
            nn.MaxPool2d((2, 2), stride=2),  # changes H, W from 255 to 127
            nn.Dropout2d(drop_prob),
            nn.Conv2d(channel_1, channel_2, (3, 3), padding="same"),  # -> N, channel_2, 127, 127
            nn.ReLU(),
            nn.MaxPool2d((2, 2), stride=2),  # changes H, W from 127 to 63
            nn.Dropout2d(drop_prob),
            nn.Conv2d(channel_2, channel_3, (3, 3), padding="same"),  # -> N, channel_3, 63, 63
            nn.ReLU(),
            nn.MaxPool2d((2, 2), stride=2),  # changes H, W from 63 to 31
            nn.BatchNorm2d(num_features = channel_3),
            Flatten(),  # -> N, (channel_3 * 31 * 31) = 32 * 31 * 31 = 30,752
            nn.Linear(30752, hidden_layer_size_1),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(hidden_layer_size_1, hidden_layer_size_2),    
            nn.ReLU(),
            nn.Linear(hidden_layer_size_2, 167)
        )
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

        print('LEARNING RATE:', learning_rate, 'DROP PROB:', drop_prob)
        train_part34(model, optimizer, epochs=5, val_or_test="val")
        val_acc, r2 = check_accuracy_part34(val_X, val_Y, model, "val")
        if r2 > best_val:
            best_model = model
            best_lr = learning_rate

LEARNING RATE: 0.001 DROP PROB: 0
Iteration 0, loss = 5.1486
Checking accuracy on validation set
preds: [21 21 21 21 21 21 21 21 21 21] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of 0.0001248972996809263

Iteration 0, loss = 3.3880
Checking accuracy on validation set
preds: [6 6 6 6 6 6 6 6 6 6] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 117 / 512 correct (22.85)  and an r^2 value of nan





Iteration 0, loss = 2.6885
Checking accuracy on validation set
preds: [5 5 5 5 5 5 5 5 5 5] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 131 / 512 correct (25.59)  and an r^2 value of nan

Iteration 0, loss = 2.5057
Checking accuracy on validation set
preds: [8 8 6 8 6 8 6 8 6 8] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 104 / 512 correct (20.31)  and an r^2 value of 0.002816391574584883

Iteration 0, loss = 2.5869
Checking accuracy on validation set
preds: [8 8 8 8 8 8 8 8 8 8] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 76 / 512 correct (14.84)  and an r^2 value of 0.00438988905829146

Checking accuracy on validation set
preds: [8 8 8 8 8 8 8 8 8 8] actual: [ 8. 11. 14. 19.  7.  9. 18. 12. 13. 12.]
Got 9 / 1600 correct (0.56)  and an r^2 value of nan
LEARNING RATE: 0.0001 DROP PROB: 0
Iteration 0, loss = 5.1233
Checking accuracy on validation set
preds: [99 99 99 99 99 99 99 99 99 99] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct

preds: [43 10 43 10 10 10 10 10 10 43] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 22 / 512 correct (4.30)  and an r^2 value of 0.00429442110520665

Iteration 0, loss = 5.0576
Checking accuracy on validation set
preds: [43 10 43 10 10 10 10 10 10 10] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 20 / 512 correct (3.91)  and an r^2 value of 0.004038285984221359

Checking accuracy on validation set
preds: [10 10 10 10 10 79 10 10 10 10] actual: [ 8. 11. 14. 19.  7.  9. 18. 12. 13. 12.]
Got 8 / 1600 correct (0.50)  and an r^2 value of 0.045507176740425946
LEARNING RATE: 1e-05 DROP PROB: 0.2
Iteration 0, loss = 5.1331
Checking accuracy on validation set
preds: [60 60 60 60 60 60 60 60 60 60] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of 1.730415903465099e-05

Iteration 0, loss = 5.1464
Checking accuracy on validation set
preds: [61 61 61 61 61 61 61 61 61 61] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 

Iteration 0, loss = 5.1275
Checking accuracy on validation set
preds: [104 104 104 104 104 104 104 104 104 104] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 3 / 512 correct (0.59)  and an r^2 value of 0.0043011111860489955

Checking accuracy on validation set
preds: [104 104 104 104 104 104 104 104 104 104] actual: [ 8. 11. 14. 19.  7.  9. 18. 12. 13. 12.]
Got 0 / 1600 correct (0.00)  and an r^2 value of 0.0019096916058303438
LEARNING RATE: 1e-06 DROP PROB: 0.5
Iteration 0, loss = 5.1081
Checking accuracy on validation set
preds: [65 65 65 65 65 65 65 65 65 65] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of 0.002148373121684616

Iteration 0, loss = 5.1238
Checking accuracy on validation set
preds: [109 109 109 109 109 109 109 109 109 109] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of 0.005245456963344268

Iteration 0, loss = 5.1014
Checking accuracy on validation set
preds: [

preds: [102 102 102 102 102 102 102 102 102 102] actual: [ 8. 11. 14. 19.  7.  9. 18. 12. 13. 12.]
Got 0 / 1600 correct (0.00)  and an r^2 value of 0.0016547366229911669
LEARNING RATE: 1e-07 DROP PROB: 0.8
Iteration 0, loss = 5.0713
Checking accuracy on validation set
preds: [77 77 77 77 77 77 77 77 77 77] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of 0.009633232677807805

Iteration 0, loss = 5.1395
Checking accuracy on validation set
preds: [106 106 106 106 106 106 106 106 106 106] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of nan

Iteration 0, loss = 5.0800
Checking accuracy on validation set
preds: [106 106 106 106 106 106 106 106 106 106] actual: [ 6.  6.  9.  8. 19.  6.  9.  6.  6.  6.]
Got 0 / 512 correct (0.00)  and an r^2 value of nan

Iteration 0, loss = 5.0308
Checking accuracy on validation set
preds: [106 106 106 106 106 106 106 106 106 106] actual: [ 6.  6.  9.  8. 19.  

In [12]:
check_accuracy_part34(test_X, test_Y, best_model, "test")

Checking accuracy on test set




AttributeError: 'bool' object has no attribute 'sum'