In [1]:
import numpy as np
import torch
from torch import nn
import os
torch.manual_seed(42)
from utils import generate_dataloaders
import wandb

In [2]:
# Authenticate this kernel with Weights & Biases so later wandb.init / wandb.log calls can reach the service.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mzzhang2816[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
# Open the train / validation splits stored as NumPy .npz archives.
# Paths are relative to the notebook's working directory.
train_set=np.load("dataset/train.npz")
val_set=np.load("dataset/val.npz")

In [4]:
# Build the train / validation loaders with a batch size of 32 using the project helper from utils.
# The helper prints the shape of one X / y batch for each split (see the cell output).
train_loader, val_loader=generate_dataloaders(train_set, val_set,batch_size=32)

TRAIN: the shape of X: torch.Size([32, 8, 49]); the shape of y: torch.Size([32, 1])
VAL: the shape of X: torch.Size([32, 8, 49]); the shape of y: torch.Size([32, 1])


In [5]:
class RNNModel(nn.Module):
    """LSTM-based sequence regressor.

    A stacked ``nn.LSTM`` consumes the sequence and a linear layer maps the
    final cell state of the top LSTM layer to ``output_dimension`` values.

    Args:
        input_dimension: number of features per time step.
        output_dimension: size of the prediction produced for each sequence.
        num_layers: number of stacked LSTM layers.
        num_hiddens: hidden size of every LSTM layer.
    """
    def __init__(self, input_dimension,output_dimension,num_layers,num_hiddens):
        super().__init__()
        self.input_dimension = input_dimension
        self.output_dimension = output_dimension
        self.num_hiddens = num_hiddens
        # The LSTM keeps PyTorch's default time-major layout (batch_first=False);
        # forward() permutes the batch-major input accordingly.
        self.rnn = nn.LSTM(self.input_dimension, self.num_hiddens, num_layers)
        self.linear = nn.Linear(self.num_hiddens, self.output_dimension)

    def forward(self, inputs, state=None):
        """Predict one output vector per input sequence.

        Args:
            inputs: batch-major tensor ``(batch, seq_len, input_dimension)``;
                it is cast to float32 before entering the LSTM.
            state: optional initial LSTM state ``(h_0, c_0)``, each shaped
                ``(num_layers, batch, num_hiddens)``. ``None`` (the default)
                lets ``nn.LSTM`` start from an all-zero state.

        Returns:
            Tensor of shape ``(batch, output_dimension)``.
        """
        # nn.LSTM expects (seq_len, batch, features) when batch_first is False.
        X = inputs.permute(1, 0, 2).to(torch.float32)
        # Passing state=None is equivalent to omitting it, so the default call
        # behaves as before while an explicitly supplied state is now honoured
        # instead of being silently dropped.
        _, state = self.rnn(X, state)
        # state is (h_n, c_n): state[-1] selects c_n and the second [-1] selects
        # the last layer, so the readout uses the top layer's final *cell* state.
        # NOTE(review): h_n[-1] is the more common readout; kept unchanged so
        # previously saved checkpoints keep producing the same predictions.
        output = self.linear(state[-1][-1])
        return output
    
    # def begin_state(self, device, batch_size=1):
    #     if not isinstance(self.rnn, nn.LSTM):
    #         # `nn.GRU` takes a tensor as hidden state
    #         return torch.zeros((self.rnn.num_layers,
    #                             batch_size, self.num_hiddens), device=device)
    #     else:
    #         # `nn.LSTM` takes a tuple of hidden states
    #         return (torch.zeros((self.rnn.num_layers,
    #                              batch_size, self.num_hiddens),
    #                             device=device),
    #                 torch.zeros((self.rnn.num_layers,
    #                              batch_size, self.num_hiddens),
    #                             device=device))

In [28]:
def train(net, num_epochs, device,train_loader, val_loader, load_from_path,save_to_path):
    """Train ``net`` with an MSE objective, validating after every epoch.

    Each epoch runs one pass over ``train_loader`` (with gradient updates) and
    one pass over ``val_loader`` (without gradients), logs both sample-weighted
    mean losses to the active wandb run under the keys ``"train_loss"`` and
    ``"val"``, and advances a cosine-annealing learning-rate schedule.

    Args:
        net: the ``nn.Module`` to optimise; it is moved to ``device``.
        num_epochs: number of epochs to run; values <= 0 perform no training.
        device: ``torch.device`` used for the model and every batch.
        train_loader: iterable yielding ``(X, y)`` batches used for optimisation.
        val_loader: iterable yielding ``(X, y)`` batches used for validation.
        load_from_path: checkpoint (state dict) to resume from; any falsy
            value skips loading.
        save_to_path: directory for checkpoints; created, including missing
            parents, if it does not exist.

    Checkpoints are written as ``<epoch>.pt`` (zero-based epoch index) every
    tenth epoch starting with epoch 0, plus ``<num_epochs>.pt`` after the last
    epoch when that epoch was not already saved by the periodic rule.

    Returns:
        None.
    """
    if load_from_path:
        # map_location lets a checkpoint written on a GPU be resumed on a
        # CPU-only machine (and vice versa).
        net.load_state_dict(torch.load(load_from_path, map_location=device))
    # makedirs also creates missing parent directories and tolerates an
    # already existing target, unlike a bare os.mkdir.
    os.makedirs(save_to_path, exist_ok=True)

    net.to(device)

    loss = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    # Cosine schedule with T_max=25, stepped once per epoch below.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 25)

    epoch_trainlosses = []
    epoch_vallosses = []
    for epoch in range(num_epochs):
        for (dataset, loader) in [("train", train_loader), ("val", val_loader)]:
            is_train = dataset == "train"
            net.train(is_train)  # train(False) is equivalent to eval()
            total_epoch_loss = 0.0
            num_samples = 0
            # Used as a context manager so the caller's autograd mode is
            # restored afterwards; otherwise the validation phase would leave
            # gradients globally disabled once this function returns.
            with torch.set_grad_enabled(is_train):
                for batch_idx, (X, y) in enumerate(loader):
                    X = X.to(device)
                    y = y.to(device)

                    y_hat = net(X)
                    l = loss(y_hat, y)

                    # MSELoss returns the batch mean; weight it by the batch
                    # size so a smaller final batch does not skew the epoch mean.
                    batch_size = X.shape[0]
                    total_epoch_loss += l.item() * batch_size
                    num_samples += batch_size

                    if batch_idx % 100 == 0:
                        message = ""
                        message += f"Epoch {epoch+1}/{num_epochs} progress: {int((batch_idx / len(loader)) * 100)}% "
                        message += f'loss: {l.item():.4f}'
                        print(message)

                    if is_train:
                        optimizer.zero_grad()
                        l.backward()
                        optimizer.step()

            # Divide by the samples actually seen so the mean stays correct
            # even when the loader drops or skips samples.
            avg_epoch_loss = total_epoch_loss / num_samples if num_samples else float("nan")
            if is_train:
                epoch_trainlosses.append(avg_epoch_loss)
            else:
                epoch_vallosses.append(avg_epoch_loss)

        wandb.log({"train_loss": epoch_trainlosses[-1],"val":epoch_vallosses[-1]})

        scheduler.step()
        if epoch % 10 == 0:
            # os.path.join works whether or not save_to_path ends with a separator.
            torch.save(net.state_dict(), os.path.join(save_to_path, f"{epoch}.pt"))

    # Save the final weights unless the last epoch was already checkpointed.
    # The num_epochs guard avoids reading the loop variable when no epoch ran.
    if num_epochs > 0 and (num_epochs - 1) % 10 != 0:
        torch.save(net.state_dict(), os.path.join(save_to_path, f"{num_epochs}.pt"))
        

In [29]:
# Start a W&B run in the "trial" project and record the hyper-parameters of this experiment.
# NOTE(review): these config values are hard-coded copies — lr mirrors the Adam learning rate inside train(),
# num_layers / num_hiddens mirror the RNNModel(...) arguments in the model cell — so they must be kept in sync by hand.
wandb.init(project="trial", config={'lr':0.001,'num_layers':2,'num_hiddens':8})

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,█▃▃▂▂▁▁▁▁▁
val,█▅▃▃▂▂▁▂▁▁

0,1
message,Epoch 10/10 progress...
train_loss,22.73558
val,26.62013


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [30]:
# Model and run settings for this experiment.
# input_dimension=49 matches the last dimension of the X batches printed when the dataloaders were built.
net = RNNModel(input_dimension=49,output_dimension=1,num_layers=2,num_hiddens=8)
num_epochs=5
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
# None is falsy, so train() skips checkpoint loading and starts from the freshly initialised weights.
load_from_path=None
# Directory that train() writes its checkpoints into.
save_to_path="checkpoints/v0/"

In [None]:
# Run training with the settings defined in the configuration cell; per-epoch losses go to the active
# W&B run and checkpoints are written under save_to_path.
train(net, num_epochs, device,train_loader, val_loader, load_from_path,save_to_path)

In [55]:
# Pull a single batch from the training loader for ad-hoc inspection
# (train() unpacks the same loader's batches as (X, y), so a is the inputs and b the targets).
a,b=next(iter(train_loader))