In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

from matplotlib_inline import backend_inline
backend_inline.set_matplotlib_formats('retina')

In [2]:
from __future__ import print_function, division
import os,sys
import numpy as np
import torch # pytorch package, allows using GPUs
# fix seed
from datetime import datetime
from glob import glob
import pandas as pd
import h5py
import psutil

In [3]:
mem_usage = psutil.virtual_memory()
mem_usage_start = mem_usage.used

# Step 1. Identify data and info

In [4]:
X_train_file_info = pd.DataFrame({'filelist' : glob('data/tmax_train/*.h5')})

In [5]:
X_train_file_info['num'] = [int(file.split('/')[2].split('_')[-1].split('.')[0]) for file in X_train_file_info['filelist']]

In [6]:
X_train_file_info = X_train_file_info.sort_values('num')
X_train_file_info.index = range(len(X_train_file_info))

In [7]:
X_train_info = np.load('data/tmax_train/tmax_X_train_info.npz').get('arr_0')

In [8]:
y_train = np.load('data/tmax_train/tmax_y_train.npz', allow_pickle=True).get('arr_0')
y_train = np.nan_to_num(y_train.astype(float), nan=-8888)

In [9]:
mem_usage = psutil.virtual_memory()

print(f"Free: {mem_usage.percent}%")
print(f"Total: {mem_usage.total/(1024**3):.2f}G")
print(f"Used: {mem_usage.used/(1024**3):.2f}G")
print(f"Used - Start: {(mem_usage.used - mem_usage_start)/(1024**3):.2f}G")

Free: 6.4%
Total: 251.65G
Used: 15.19G
Used - Start: 0.21G


In [10]:
X_val_file_info = pd.DataFrame({'filelist' : glob('data/tmax_val/*.h5')})

In [11]:
X_val_file_info['num'] = [int(file.split('/')[2].split('_')[-1].split('.')[0]) for file in X_val_file_info['filelist']]

In [12]:
X_val_file_info = X_val_file_info.sort_values('num')
X_val_file_info.index = range(len(X_val_file_info))

In [13]:
X_val_info = np.load('data/tmax_val/tmax_X_val_info.npz').get('arr_0')

In [14]:
y_val = np.load('data/tmax_val/tmax_y_val.npz', allow_pickle=True).get('arr_0')
y_val = np.nan_to_num(y_val.astype(float), nan=-8888)

In [15]:
mem_usage = psutil.virtual_memory()

print(f"Free: {mem_usage.percent}%")
print(f"Total: {mem_usage.total/(1024**3):.2f}G")
print(f"Used: {mem_usage.used/(1024**3):.2f}G")
print(f"Used - Start: {(mem_usage.used - mem_usage_start)/(1024**3):.2f}G")

Free: 6.4%
Total: 251.65G
Used: 15.19G
Used - Start: 0.21G


# Step 2. Initialize the dataset with a data loader

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

class WeatherDataset(Dataset):
    def __init__(self, data_path, y_data, batch_size=1000):
        super().__init__()
        with h5py.File(data_path, 'r') as f:
            self.data_X = np.array(f['data'])
        self.data_y = np.array(np.split(y_data, len(self.data_X)//batch_size))
        self.data_X = np.array(np.split(self.data_X, len(self.data_X)//batch_size))
        
    def __getitem__(self, idx):
        return self.data_X[idx]

    def __len__(self):
        return len(self.data_X)

def load_data(dataset_path, y_data, i):
    # define dataset path
    
    # create dataset object
    batch_size = 1000
    dataset = WeatherDataset(dataset_path[i], y_data[i*1000:i*1000+1000], batch_size=batch_size)

    # create data loader object
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return data_loader

In [17]:
mem_usage = psutil.virtual_memory()

print(f"Free: {mem_usage.percent}%")
print(f"Total: {mem_usage.total/(1024**3):.2f}G")
print(f"Used: {mem_usage.used/(1024**3):.2f}G")
print(f"Used - Start: {(mem_usage.used - mem_usage_start)/(1024**3):.2f}G")

Free: 6.4%
Total: 251.65G
Used: 15.19G
Used - Start: 0.21G


# Step 3. Design the model

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PeriodicConv3d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1):
        super(PeriodicConv3d, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=0, dilation=dilation, groups=groups)

        # Set the padding to 0 since we will add the padding manually using F.pad
        self.conv.padding = (0, 0, 0)

        # Save the kernel size and stride
        self.kernel_size = kernel_size
        self.stride = stride

        # Compute the padding size
        self.padding_size = ((kernel_size[0]-1)//2, (kernel_size[1]-1)//2, (kernel_size[2]-1)//2)

    def forward(self, x):
        # Pad the input tensor with the last slice of the tensor along each dimension
        x = F.pad(x, (self.padding_size[2], self.padding_size[2], self.padding_size[1], self.padding_size[1], self.padding_size[0], self.padding_size[0]), mode='circular')

        # Perform the convolution
        x = self.conv(x)

        return x


In [19]:
class Model(nn.Module):
    def __init__(self, batch_size=100, n_conv=2, out_channels=16, kernel_size=(5, 30, 5), stride=3, padding=5, \
                 fc_features=1024):
        """
        batch_size is the batch size 
        n_conv is the number of convolutions to do
        out_channels is the number of output channels from the first convolution. The second and onward double each time. 
        kernel_size is the shape of the convolution kernel
        stride is the stride
        fc_features is the number of features for the fully-connected layer
        """
        
        super(Model, self).__init__()
        
        # 3D convolutional layers
        if padding == 'circular':
            self.conv1 = PeriodicConv3d(in_channels=10, out_channels=out_channels, kernel_size=kernel_size, stride=stride, \
                                        padding=padding)
            self.conv2 = PeriodicConv3d(in_channels=out_channels, out_channels=out_channels*2, kernel_size=kernel_size, \
                                        stride=stride, padding=padding)
        else:
            self.conv1 = nn.Conv3d(in_channels=10, out_channels=out_channels, kernel_size=kernel_size, stride=stride, \
                                   padding=padding)
            self.conv2 = nn.Conv3d(in_channels=out_channels, out_channels=out_channels*2, kernel_size=kernel_size, \
                                   stride=stride, padding=padding)
        
        # Batch normalization layers
        self.bn1 = nn.BatchNorm3d(num_features=out_channels)
        self.bn2 = nn.BatchNorm3d(num_features=out_channels*2)
        
        # Max pooling layer
        self.pool = nn.MaxPool3d(kernel_size=(1,2,2), stride=(1,2,2))
        
        # Fully connected layers
        if padding == 'circular':
            num_features = np.array([\
                1, \
                out_channels*n_conv, \
                (( ((11+self.conv1.padding_size[0]*2 - kernel_size[0])//stride + 1) +self.conv1.padding_size[0]*2 - kernel_size[0] )//stride + 1),\
                (( ((365+self.conv1.padding_size[1]*2 - kernel_size[1])//stride + 1)//2 +self.conv1.padding_size[1]*2 - kernel_size[1] )//stride + 1)//2,\
                (( ((batch_size+self.conv1.padding_size[2]*2 - kernel_size[2])//stride + 1)//2 +self.conv1.padding_size[2]*2 - kernel_size[2] )//stride + 1)//2,\
                                    ]).astype(int)
        else:
            num_features = np.array([\
                1, \
                out_channels*n_conv, \
                (( ((11+padding*2 - kernel_size[0])//stride + 1) +padding*2 - kernel_size[0] )//stride + 1),\
                (( ((365+padding*2 - kernel_size[1])//stride + 1)//2 +padding*2 - kernel_size[1] )//stride + 1)//2,\
                (( ((batch_size+padding*2 - kernel_size[2])//stride + 1)//2 +padding*2 - kernel_size[2] )//stride + 1)//2,\
                                    ]).astype(int)
        #print('----', num_features, np.prod(num_features))
        self.fc1 = nn.Linear(in_features=np.prod(num_features), out_features=fc_features) #32*5*182*128, 32*11*91*250
        self.fc2 = nn.Linear(in_features=fc_features, out_features=7*batch_size)
        
        # Dropout layer
        self.dropout = nn.Dropout(p=0.1)
        
        # ReLU activation function
        self.relu = nn.ReLU()
        
    def forward(self, x, batch_size):
        # Input shape: (batch_size=400000, channels=10, depth=11, height=365, width=1)
        
        # First convolutional block
        #print(x.shape)
        # Perform the convolution
        x = self.conv1(x)
        #print(x.shape)
        x = self.bn1(x)
        #print(x.shape)
        x = self.relu(x)
        #print(x.shape)
        x = self.pool(x)
        #print(x.shape)
        
        # Second convolutional block
        x = self.conv2(x)
        #print(x.shape)
        x = self.bn2(x)
        #print(x.shape)
        x = self.relu(x)
        #print(x.shape)
        x = self.pool(x)
        #print(x.shape)
        
        # Flatten
        x = torch.flatten(x, start_dim=1)
        #print(x.shape)
        
        # Fully connected layers with dropout
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        
        # Output shape: (batch_size=400000, num_classes=10)
        return x.reshape((1, 7, batch_size))


In [20]:
def accuracy(output, labels):
    "A function that determines how close the outputs are to the real data"
    "Report the total number within 5 and with 10 degrees celsius."
    #print(output.shape)
    #print(labels.shape)
    
    #rmse = torch.sqrt(torch.sum(labels - output)**2 / labels.size()[0])
    within10 = torch.count_nonzero(torch.abs(labels - output) <= 100)
    within5 = torch.count_nonzero(torch.abs(labels - output) <= 50)
    
    return np.array([within5.cpu(), within10.cpu()])

In [23]:
def custom_loss(predictions, labels):
    loss = torch.pow(torch.abs(predictions - labels), 0.75)
    loss = torch.median(loss)
    return loss

In [24]:
import gc
import torch.optim as optim

def train(model, train_data, val_data, batch_size=100, num_epochs=10, loss_function='l1', test_mode=False, out=False):
    # loss function can be `l1` or `custom` for now
    # test_mode uses only 10 files for each, off by default
    # out returns the statistics, off by default
    # Set device to GPU if available, else CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Define loss function and optimizer
    #criterion = nn.CrossEntropyLoss()
    #criterion = nn.MSELoss()
    if loss_function == 'l1':
        criterion = nn.L1Loss(reduction='sum')
    elif loss_function == 'custom':
        criterion = custom_loss
    else:
        print('NO LOSS FUNCTION CHOSEN, BAD, BAD!')
    optimizer = optim.Adam(model.parameters())
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        
        # loop over each file within the big dataset
        # FOR NOW THIS IS GOING TO BE HARD-CODED AND I NEED TO ADJUST IT LATER
        # there are 462 files, but let's not use the last one since it'll be a different shape, so 461 to use.
        # in each epoch, select a random order to train them in 
        file_order = np.arange(0, 461)
        if test_mode:
            file_order = np.arange(0, 10)
        np.random.shuffle(file_order)
        
        for file_i, file_num in enumerate(file_order):
            print('Training on file', file_i+1, '/', len(file_order), '\r', end='')
            train_data = train_loader(file_i)
            
            num_batches = train_data.dataset.data_X.shape[1]//batch_size
            # iterate over the batches
            for batch_i in range(num_batches):
                #print('batch', batch_i+1, '/', train_data.dataset.data_X.shape[1]//batch_size, '\r', end='')
                inputs = torch.Tensor(train_data.dataset.data_X[:, batch_i*batch_size:(batch_i+1)*batch_size, :, :, :]\
                                      .astype('int64')).reshape(1, 10, 11, 365, batch_size)
                labels = torch.Tensor(train_data.dataset.data_y[:, batch_i*batch_size:(batch_i+1)*batch_size, :]\
                                      .astype('int64')).reshape(1, 7, batch_size)

                # Move data to device
                inputs, labels = inputs.to(device), labels.to(device)

                #  pass
                outputs = model(inputs, batch_size)
                loss = criterion(outputs, labels)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Track training loss and accuracy
                train_loss += loss.item() * inputs.size(0)
                #_, predicted = torch.max(outputs.data, 1)
                predicted = outputs
                train_correct += accuracy(predicted, labels)
                
                #del inputs, labels, outputs, loss
                #gc.collect()
                #torch.cuda.empty_cache()

        # Calculate average training loss and accuracy
        train_loss /= len(train_data.dataset.data_X)
        #train_accuracy = 100. * train_correct / len(train_data.dataset)
        train_accuracy5 = 100*train_correct[0]/(predicted.numel()*num_batches*len(file_order))
        train_accuracy10 = 100*train_correct[1]/(predicted.numel()*num_batches*len(file_order))
        
        # Evaluate on validation set
        model.eval()
        val_loss = 0.0
        val_correct = 0

        # Loop through each validation file
        val_file_order = np.arange(0, 46)
        if test_mode:
            val_file_order = np.arange(0, 10)
        np.random.shuffle(val_file_order)
        for file_i, file_num in enumerate(val_file_order):
            print('Validating on file', file_i+1, '/', len(val_file_order), '\r', end='')
            val_data = val_loader(file_i)
        
            # Disable gradient computation
            with torch.no_grad():

                # iterate over the batches for the validation set 
                for batch_i in range(val_data.dataset.data_X.shape[1]//batch_size):
                    #print('batch', batch_i+1, '/', val_data.dataset.data_X.shape[1]//batch_size, '\r', end='')
                    inputs = torch.Tensor(val_data.dataset.data_X[:, batch_i*batch_size:(batch_i+1)*batch_size, :, :, :]\
                                          .astype('int64')).reshape(1, 10, 11, 365, batch_size)
                    labels = torch.Tensor(val_data.dataset.data_y[:, batch_i*batch_size:(batch_i+1)*batch_size, :]\
                                          .astype('int64')).reshape(1, 7, batch_size)
                    
                    # Move data to device
                    inputs, labels = inputs.to(device), labels.to(device)

                    #  pass
                    outputs = model(inputs, batch_size)
                    loss = criterion(outputs, labels)

                    # Track validation loss and accuracy
                    val_loss += loss.item() * inputs.size(0)
                    #_, predicted = torch.max(outputs.data, 1)
                    predicted = outputs
                    val_correct += accuracy(predicted, labels)
                    
                    #del inputs, labels, outputs, loss
                    #gc.collect()
                    #torch.cuda.empty_cache()
            
        # Calculate average validation loss and accuracy
        val_loss /= len(val_data.dataset.data_X)
        #val_accuracy = 100. * val_correct / len(val_data.dataset)
        val_accuracy5 = 100*val_correct[0]/(predicted.numel()*num_batches*len(val_file_order))
        val_accuracy10 = 100*val_correct[1]/(predicted.numel()*num_batches*len(val_file_order))
        
        # Print epoch statistics
        print("Epoch [{}/{}] Train Loss: {:.2e}, Train Acc10: {:.1f}%, Train Acc5: {:.1f}% | \
Val Loss: {:.2e}, Val Acc10: {:.1f}%, Val Acc5: {:.1f}%"
              .format(epoch+1, num_epochs, train_loss, train_accuracy10, train_accuracy5, \
                      val_loss, val_accuracy10, val_accuracy5))

    if out:
        return (train_loss, train_accuracy10, train_accuracy5, val_loss, val_accuracy10, val_accuracy5)

In [25]:
def train_loader(i):
    return load_data(X_train_file_info['filelist'].tolist(), y_train, i)

def val_loader(i):
    return load_data(X_val_file_info['filelist'].tolist(), y_val, i)

# Step 4. Run the model

In [26]:
try: 
    del(model)
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()

In [27]:
model = Model(batch_size=25, n_conv=2, out_channels=64, kernel_size=(5, 30, 5), stride=2, padding='circular', fc_features=1024)

In [None]:
train(model, train_loader, val_loader, num_epochs=100, batch_size=25, test_mode=True, loss_function='custom')

Training on file 5 / 10 

# Step 5. Tune the hyperparameters

In [27]:
# let stride = 2
kernel_size_list = [
[3, 10, 3],
[3, 30, 3],
[3, 50, 3],

[5, 10, 5],
[5, 30, 5],
[5, 50, 5]]
out_channels_list = [32]
fc_features_list = [1024, 2048]
padding_list = ['circular']
loss_function_list = ['l1', 'custom']

In [28]:
6 * 1 * 2 * 1 * 2

24

In [None]:
for i, kernel_size in enumerate(kernel_size_list):
    for j, out_channels in enumerate(out_channels_list):
        for k, fc_features in enumerate(fc_features_list):
            for l, padding in enumerate(padding_list):
                for m, loss_function in enumerate(loss_function_list):
                    #print((i+1)*(j+1)*(k+1), '/', len(kernel_size_list)*len(out_channels_list)*len(fc_features_list), '\r', end='')
                    print('\nkernel_size :', kernel_size, '| out_channels :', out_channels, ' | fc_features :', fc_features, \
                          '| loss func :', loss_function)
                    try: 
                        del(model)
                    except NameError:
                        pass
                    gc.collect()
                    torch.cuda.empty_cache()

                    model = Model(batch_size=25, n_conv=2, out_channels=out_channels, kernel_size=kernel_size, stride=2, padding=1,\
                                  fc_features=fc_features)

                    out = train(model, train_loader, val_loader, num_epochs=10, batch_size=25, out=True, test_mode=True, \
                                loss_function=loss_function)
                    with open('hyperparameter_results.txt', 'a') as f:
                        f.write("Train Loss: {:.2e}, Train Acc10: {:.1f}%, Train Acc5: {:.1f}% | \
    Val Loss: {:.2e}, Val Acc10: {:.1f}%, Val Acc5: {:.1f}%\n".format(out[0], out[1], out[2], out[3], out[4], out[5]))
                    f.close()

In [29]:
mem_usage = psutil.virtual_memory()

print(f"Free: {mem_usage.percent}%")
print(f"Total: {mem_usage.total/(1024**3):.2f}G")
print(f"Used: {mem_usage.used/(1024**3):.2f}G")
print(f"Used - Start: {(mem_usage.used - mem_usage_start)/(1024**3):.2f}G")

Free: 2.4%
Total: 376.36G
Used: 7.60G
Used - Start: 2.39G


# Step 6. Run [what I think might be] the best model

In [32]:
model = Model(batch_size=25, n_conv=2, out_channels=64, kernel_size=(5, 30, 5), stride=2, padding='circular', fc_features=2048)

train(model, train_loader, val_loader, num_epochs=5, batch_size=25, test_mode=False, loss_function='custom')

Epoch [1/5] Train Loss: 5.26e+05, Train Acc10: 86.2%, Train Acc5: 57.0% | Val Loss: 1.75e+05, Val Acc10: 52.6%, Val Acc5: 27.0%
Epoch [2/5] Train Loss: 5.22e+05, Train Acc10: 86.7%, Train Acc5: 57.9% | Val Loss: 1.74e+05, Val Acc10: 53.7%, Val Acc5: 28.3%
Epoch [3/5] Train Loss: nan, Train Acc10: 2.8%, Train Acc5: 1.9% | Val Loss: nan, Val Acc10: 0.0%, Val Acc5: 0.0%
Epoch [4/5] Train Loss: nan, Train Acc10: 0.0%, Train Acc5: 0.0% | Val Loss: nan, Val Acc10: 0.0%, Val Acc5: 0.0%
Epoch [5/5] Train Loss: nan, Train Acc10: 0.0%, Train Acc5: 0.0% | Val Loss: nan, Val Acc10: 0.0%, Val Acc5: 0.0%


In [None]:
model = Model(batch_size=25, n_conv=2, out_channels=64, kernel_size=(5, 30, 5), stride=2, padding='circular', fc_features=2048)

train(model, train_loader, val_loader, num_epochs=50, batch_size=25, test_mode=False, loss_function='l1')

Epoch [1/50] Train Loss: 4.97e+08, Train Acc10: 86.6%, Train Acc5: 57.4% | Val Loss: 2.44e+08, Val Acc10: 49.9%, Val Acc5: 25.4%
Epoch [2/50] Train Loss: 4.96e+08, Train Acc10: 86.9%, Train Acc5: 58.0% | Val Loss: 2.41e+08, Val Acc10: 54.6%, Val Acc5: 29.5%
Epoch [3/50] Train Loss: 4.96e+08, Train Acc10: 86.9%, Train Acc5: 58.1% | Val Loss: 2.39e+08, Val Acc10: 58.0%, Val Acc5: 32.1%
Epoch [4/50] Train Loss: 4.96e+08, Train Acc10: 86.8%, Train Acc5: 58.0% | Val Loss: 2.38e+08, Val Acc10: 58.8%, Val Acc5: 32.2%
Epoch [5/50] Train Loss: 4.88e+08, Train Acc10: 88.2%, Train Acc5: 59.9% | Val Loss: 2.36e+08, Val Acc10: 61.1%, Val Acc5: 33.6%
Epoch [6/50] Train Loss: 4.83e+08, Train Acc10: 89.0%, Train Acc5: 61.0% | Val Loss: 2.35e+08, Val Acc10: 60.7%, Val Acc5: 33.4%
Epoch [7/50] Train Loss: 4.82e+08, Train Acc10: 89.2%, Train Acc5: 61.2% | Val Loss: 2.34e+08, Val Acc10: 63.4%, Val Acc5: 35.1%
Epoch [8/50] Train Loss: 4.80e+08, Train Acc10: 89.4%, Train Acc5: 61.5% | Val Loss: 2.31e+08, Va