In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

#from torchtext.datasets import Multi30k
#from torchtext.data import Field, BucketIterator

#import spacy
import numpy as np

import random
import math
import time

import torch.utils.data as data
import pandas as pd

# --- Run configuration: GPU selection, seeding, device ----------------------

# Pick the physical GPU *before* anything touches torch.cuda. The variable is
# only honoured if it is set before CUDA is initialised, so keeping it first
# makes the cell independent of which torch.cuda call happens to be lazy.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

SEED = 1234

# Seed every random number generator used in this notebook.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable the auto-tuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Fall back to the CPU when no CUDA device is visible.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
class Dataset(data.Dataset):
    """Map-style dataset that cuts every row into a source and a target window.

    Each item of ``data`` is sliced along its first axis: the leading
    ``src_len`` entries form the source window and the ``trg_len`` entries
    that follow form the target window.
    """

    def __init__(self, data=None, src_len=20, trg_len=26):
        # ``len(data)`` is taken eagerly, so ``data`` has to be a sized
        # container; the ``None`` default cannot be used as-is.
        self.data = data
        self.data_lengths = len(data)
        self.src_len = src_len
        self.trg_len = trg_len

    def __getitem__(self, index):
        """Return ``(source_window, target_window)`` for the row at ``index``."""
        row = self.data[index]
        boundary = self.src_len
        return row[:boundary], row[boundary:boundary + self.trg_len]

    def __len__(self):
        """Number of rows, as measured when the dataset was constructed."""
        return self.data_lengths

def dataset_iter(trDataX, trDataY, batch_size=None, src_len=20, trg_len=26, train_frac=0.5):
    """Build train/validation ``DataLoader``s from two column-aligned frames.

    ``trDataX`` and ``trDataY`` are concatenated column-wise, converted to a
    NumPy array and given a trailing axis of size 1. The rows are then split
    in order (no shuffling) into a training part and a validation part.

    Args:
        trDataX: DataFrame whose columns come first in every row.
        trDataY: DataFrame whose columns are appended after ``trDataX``'s.
        batch_size: Batch size for both loaders. When ``None`` the
            notebook-level ``BATCHSIZE`` constant is used, which is what this
            function always did before the parameter existed.
        src_len: Number of leading entries per row used as the source window.
        trg_len: Number of entries after the source window used as the target.
        train_frac: Fraction of rows (taken from the start) used for training.

    Returns:
        ``(train_data_loader, validate_data_loader)``.
    """
    if batch_size is None:
        # Backward-compatible fallback to the global defined in the notebook.
        batch_size = BATCHSIZE

    rows = pd.concat([trDataX, trDataY], axis=1).to_numpy()
    rows = rows[:, :, np.newaxis]

    split_at = int(train_frac * len(rows))
    train_rows, validate_rows = np.split(rows, [split_at])

    def _make_loader(part):
        # Both loaders share the same windowing and batch size.
        windows = Dataset(part, src_len=src_len, trg_len=trg_len)
        return torch.utils.data.DataLoader(dataset=windows, batch_size=batch_size)

    return _make_loader(train_rows), _make_loader(validate_rows)

In [3]:
class Encoder(nn.Module):
    """LSTM encoder that reads real-valued inputs directly (no embedding layer).

    ``emb_dim`` is accepted in the constructor but is not used by this class;
    the LSTM takes ``input_dim`` features per step straight from the input.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Seq2Seq compares these two attributes against the decoder's.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.rnn = nn.LSTM(input_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return the final ``(hidden, cell)`` LSTM state.

        Dropout is applied to the raw input before the LSTM. The per-step
        outputs of the LSTM are discarded; only the final state is returned.
        """
        _, final_state = self.rnn(self.dropout(src))
        hidden, cell = final_state
        return hidden, cell
    
    
class Decoder(nn.Module):
    """Single-step LSTM decoder followed by a linear projection.

    Like the encoder, it works on real-valued inputs: ``emb_dim`` is accepted
    but unused, and the LSTM input size equals ``output_dim`` so a prediction
    can be fed back in as the next input.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.rnn = nn.LSTM(output_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)

        # Registered but not applied anywhere in forward().
        self.relu = nn.ReLU()

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_, hidden, cell):
        """Advance the decoder by one step.

        ``input_`` is cast to float and given a leading axis of size 1 so the
        LSTM sees a one-step sequence; ``hidden`` and ``cell`` are the LSTM
        state from the previous step (or from the encoder).

        Returns ``(prediction, hidden, cell)`` where ``prediction`` is the
        linear projection of the LSTM output with the step axis removed.
        """
        one_step = input_.float().unsqueeze(0)
        one_step = self.dropout(one_step)

        rnn_out, next_state = self.rnn(one_step, (hidden, cell))
        next_hidden, next_cell = next_state

        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, next_hidden, next_cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder one step at a time.

    The encoder reads every source step except the last one; the last source
    step becomes the first decoder input. Each later decoder input is either
    the ground-truth target of the step just predicted (teacher forcing) or
    the model's own prediction for that step.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder state is handed to the decoder unchanged, so the two
        # LSTMs must agree on state shape.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Predict ``trg.shape[0]`` steps that follow ``src``.

        Args:
            src: Source sequence indexed as ``[step, batch, ...]``; ``src[:-1]``
                goes to the encoder and ``src[-1]`` seeds the decoder.
            trg: Target sequence indexed as ``[step, batch, ...]``; only its
                first two dimensions and, under teacher forcing, its values
                are used.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground truth instead of the
                prediction into the next step (``0`` disables it).

        Returns:
            Tensor of shape ``[trg_len, batch_size, decoder.output_dim]`` on
            ``self.device`` holding one prediction per target step.
        """
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_feature_size = self.decoder.output_dim

        # Buffer for the per-step predictions, allocated directly on the device.
        outputs = torch.zeros(trg_len, batch_size, trg_feature_size, device=self.device)

        # The final encoder state initialises the decoder state.
        hidden, cell = self.encoder(src[0:-1])

        # The last observed source step is the first decoder input.
        input_ = src[-1]

        # Run the decoder exactly once per target step. The earlier layout
        # called it once before the loop and again at the end of every
        # iteration, so the result of the final call was computed and dropped.
        for t in range(trg_len):
            output, hidden, cell = self.decoder(input_, hidden, cell)
            outputs[t] = output

            # One draw per step, shared by every sequence in the batch.
            teacher_force = random.random() < teacher_forcing_ratio

            # Next input: ground truth for this step, or our own prediction.
            input_ = trg[t] if teacher_force else output

        return outputs

In [4]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src, trg)``; its output is compared
            against ``trg``.
        iterator: Sized iterable of batches. ``batch[0]`` (source) and
            ``batch[1]`` (target) are 3-D tensors whose first two axes are
            swapped before being passed to the model.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(output, trg)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()

    # Send batches to wherever the model lives rather than to a notebook-level
    # global, so the function does not depend on hidden state.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0

    for i, batch in enumerate(iterator):

        # Swap the first two axes of the batch before handing it to the model.
        src = batch[0].permute(1,0,2).float().to(run_device)
        trg = batch[1].permute(1,0,2).float().to(run_device)

        torch.cuda.empty_cache()

        optimizer.zero_grad()

        output = model(src, trg)

        batchsize = trg.shape[1]

        # Undo the axis swap and flatten each sample to one row so that
        # prediction and target line up element by element.
        output = output.permute(1,0,2).reshape(batchsize,-1)
        trg = trg.permute(1,0,2).reshape(batchsize,-1)

        loss = criterion(output, trg)

        loss.backward()

        # Rescale gradients whose total norm exceeds ``clip``.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    mean_loss = epoch_loss / len(iterator)
    print("train total epoch_loss",epoch_loss,"avarage epoch_loss",mean_loss)
    return mean_loss

def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is put in eval mode, gradients are disabled and teacher forcing
    is switched off by passing ``0`` as the third argument to the model.

    Args:
        model: Module called as ``model(src, trg, 0)``.
        iterator: Sized iterable of batches. ``batch[0]`` (source) and
            ``batch[1]`` (target) are 3-D tensors whose first two axes are
            swapped before being passed to the model.
        criterion: Loss called as ``criterion(output, trg)``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()

    # Follow the model's device instead of forcing ``.cuda()``, which fails on
    # machines without a GPU even though the rest of the notebook falls back
    # to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            # Swap the first two axes of the batch before handing it to the model.
            src = batch[0].permute(1,0,2).float().to(run_device)
            trg = batch[1].permute(1,0,2).float().to(run_device)

            torch.cuda.empty_cache()

            output = model(src, trg, 0) #turn off teacher forcing

            batchsize = trg.shape[1]

            # Undo the axis swap and flatten each sample to one row so that
            # prediction and target line up element by element.
            output = output.permute(1,0,2).reshape(batchsize,-1)
            trg = trg.permute(1,0,2).reshape(batchsize,-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    mean_loss = epoch_loss / len(iterator)
    print("vali total epoch_loss",epoch_loss,"avarage epoch_loss",mean_loss)
    return mean_loss

In [5]:
# Fold / city selection for this run; both values are substituted into the CSV
# file names below and into the checkpoint name used by the training cell.
FOLD = 0
CITY = 3
# Batch size for both loaders (dataset_iter picks this up as a global).
BATCHSIZE = 40000

# Best validation loss seen so far; the training loop overwrites it whenever
# an epoch improves on it.
best_valid_loss = float('inf')

# The CSVs are read without a header row. X columns come first and Y columns
# are appended after them inside dataset_iter.
trDataX  = pd.read_csv("fold{}_city{}_trainX.csv".format(FOLD,CITY),header=None)
trDataY  = pd.read_csv("fold{}_city{}_trainY.csv".format(FOLD,CITY),header=None)
train_iterator,valid_iterator = dataset_iter(trDataX,trDataY)

In [6]:
# Sanity-check tensor shapes on the first batch only. The `break` stops the
# loader from collating every remaining batch just to print a single line.
for batch in train_iterator:
    print(len(batch),batch[0].permute(1,0,2)[0:-1].shape,batch[0].permute(1,0,2)[-1].shape)
    break

2 torch.Size([19, 40000, 1]) torch.Size([40000, 1])


In [7]:
# Model hyper-parameters and construction.

INPUT_DIM = 1     # features per encoder time step
OUTPUT_DIM = 1    # features per decoder time step
ENC_EMB_DIM = 8   # forwarded as emb_dim to Encoder
DEC_EMB_DIM = 8   # forwarded as emb_dim to Decoder
HID_DIM = 64

N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Keyword arguments keep the long positional lists from being mixed up.
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with U(-0.1, 0.1) samples.

    All parameters of ``m`` and of its sub-modules are re-drawn in place,
    whatever their role (weights and biases alike).
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.1, b=0.1)
        
# Re-initialise the freshly built model. NOTE(review): `apply` visits every
# sub-module and `init_weights` walks all parameters below the module it is
# given, so nested parameters appear to be re-drawn once per ancestor; the
# final values still come from the same uniform range.
model.apply(init_weights)

# Adam with its default settings.
optimizer = optim.Adam(model.parameters())

# Mean squared error between predicted and target values.
criterion = nn.MSELoss()

def count_parameters(model):
    """Return how many scalar values ``model`` holds in trainable parameters.

    Parameters with ``requires_grad`` set to ``False`` are left out.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size using the count_parameters helper defined above.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 100,929 trainable parameters


In [8]:
def epoch_time(start_time, end_time):
    """Break the interval between two timestamps into minutes and seconds.

    Both parts are truncated with ``int``: the minutes are the whole minutes
    in the interval and the seconds are what is left after removing them.

    Returns:
        ``(elapsed_mins, elapsed_secs)`` as a tuple of ints.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

# Release cached GPU blocks left over from earlier cells before training.
torch.cuda.empty_cache()

N_EPOCHS = 15
# Maximum gradient norm handed to train().
CLIP = 1

# NOTE(review): best_valid_loss is initialised in the data-loading cell, not
# here. Re-running only this cell therefore keeps comparing against the best
# value from the previous run.
for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One optimisation pass over the training loader, then a pass over the
    # validation loader without teacher forcing.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves; the file name is
    # derived from the current FOLD and CITY.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'fold{}-city{}-model.pt'.format(FOLD,CITY))
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5e} ')
    print(f'\t Val. Loss: {valid_loss:.5e} ')

train total epoch_loss 0.00077116586851389 avarage epoch_loss 1.2642063418260493e-05
vali total epoch_loss 1.2687776518305327e-05 avarage epoch_loss 2.079963363656611e-07
Epoch: 01 | Time: 2m 59s
	Train Loss: 1.26421e-05 
	 Val. Loss: 2.07996e-07 
train total epoch_loss 6.874039746662675e-05 avarage epoch_loss 1.1268917617479795e-06
vali total epoch_loss 5.112719172473135e-06 avarage epoch_loss 8.381506840119892e-08
Epoch: 02 | Time: 2m 59s
	Train Loss: 1.12689e-06 
	 Val. Loss: 8.38151e-08 
train total epoch_loss 2.9135332965779526e-05 avarage epoch_loss 4.776284092750742e-07
vali total epoch_loss 4.952269925695418e-06 avarage epoch_loss 8.118475288025275e-08
Epoch: 03 | Time: 2m 59s
	Train Loss: 4.77628e-07 
	 Val. Loss: 8.11848e-08 
train total epoch_loss 1.8538562628123145e-05 avarage epoch_loss 3.039108627561171e-07
vali total epoch_loss 4.890511092980887e-06 avarage epoch_loss 8.017231299968667e-08
Epoch: 04 | Time: 3m 0s
	Train Loss: 3.03911e-07 
	 Val. Loss: 8.01723e-08 
train 

In [9]:
# Fold / city selection for this run; both values are substituted into the CSV
# file names below and into the checkpoint name used by the training cell.
FOLD = 1
CITY = 3
# Batch size for both loaders (dataset_iter picks this up as a global).
BATCHSIZE = 40000

# Reset the best validation loss for the new fold; the training loop
# overwrites it whenever an epoch improves on it.
best_valid_loss = float('inf')

# The CSVs are read without a header row. X columns come first and Y columns
# are appended after them inside dataset_iter.
trDataX  = pd.read_csv("fold{}_city{}_trainX.csv".format(FOLD,CITY),header=None)
trDataY  = pd.read_csv("fold{}_city{}_trainY.csv".format(FOLD,CITY),header=None)
train_iterator,valid_iterator = dataset_iter(trDataX,trDataY)

In [10]:
# Sanity-check tensor shapes on the first batch only. The `break` stops the
# loader from collating every remaining batch just to print a single line.
for batch in train_iterator:
    print(len(batch),batch[0].permute(1,0,2)[0:-1].shape,batch[0].permute(1,0,2)[-1].shape)
    break

2 torch.Size([19, 40000, 1]) torch.Size([40000, 1])


In [11]:
# Model hyper-parameters and construction (a fresh model for this fold).

INPUT_DIM = 1     # features per encoder time step
OUTPUT_DIM = 1    # features per decoder time step
ENC_EMB_DIM = 8   # forwarded as emb_dim to Encoder
DEC_EMB_DIM = 8   # forwarded as emb_dim to Decoder
HID_DIM = 64

N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Keyword arguments keep the long positional lists from being mixed up.
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with U(-0.1, 0.1) samples.

    All parameters of ``m`` and of its sub-modules are re-drawn in place,
    whatever their role (weights and biases alike).
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.1, b=0.1)
        
# Re-initialise the freshly built model. NOTE(review): `apply` visits every
# sub-module and `init_weights` walks all parameters below the module it is
# given, so nested parameters appear to be re-drawn once per ancestor; the
# final values still come from the same uniform range.
model.apply(init_weights)

# Adam with its default settings.
optimizer = optim.Adam(model.parameters())

# Mean squared error between predicted and target values.
criterion = nn.MSELoss()

def count_parameters(model):
    """Return how many scalar values ``model`` holds in trainable parameters.

    Parameters with ``requires_grad`` set to ``False`` are left out.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size using the count_parameters helper defined above.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 100,929 trainable parameters


In [12]:
def epoch_time(start_time, end_time):
    """Break the interval between two timestamps into minutes and seconds.

    Both parts are truncated with ``int``: the minutes are the whole minutes
    in the interval and the seconds are what is left after removing them.

    Returns:
        ``(elapsed_mins, elapsed_secs)`` as a tuple of ints.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

# Release cached GPU blocks left over from earlier cells before training.
torch.cuda.empty_cache()

N_EPOCHS = 15
# Maximum gradient norm handed to train().
CLIP = 1

# NOTE(review): best_valid_loss is initialised in the data-loading cell, not
# here. Re-running only this cell therefore keeps comparing against the best
# value from the previous run.
for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One optimisation pass over the training loader, then a pass over the
    # validation loader without teacher forcing.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves; the file name is
    # derived from the current FOLD and CITY.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'fold{}-city{}-model.pt'.format(FOLD,CITY))
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5e} ')
    print(f'\t Val. Loss: {valid_loss:.5e} ')

train total epoch_loss 0.00070833207968235 avarage epoch_loss 7.3023925740448454e-06
vali total epoch_loss 7.669933282272723e-06 avarage epoch_loss 7.90714771368322e-08
Epoch: 01 | Time: 4m 44s
	Train Loss: 7.30239e-06 
	 Val. Loss: 7.90715e-08 
train total epoch_loss 3.858415244906155e-05 avarage epoch_loss 3.977747675160984e-07
vali total epoch_loss 7.316844442328829e-06 avarage epoch_loss 7.543138600338999e-08
Epoch: 02 | Time: 4m 46s
	Train Loss: 3.97775e-07 
	 Val. Loss: 7.54314e-08 
train total epoch_loss 2.181375509735517e-05 avarage epoch_loss 2.2488407316861e-07
vali total epoch_loss 7.312925724534125e-06 avarage epoch_loss 7.539098685086727e-08
Epoch: 03 | Time: 4m 46s
	Train Loss: 2.24884e-07 
	 Val. Loss: 7.53910e-08 
train total epoch_loss 1.7579472931572582e-05 avarage epoch_loss 1.8123167970693382e-07
vali total epoch_loss 7.309920405873527e-06 avarage epoch_loss 7.536000418426316e-08
Epoch: 04 | Time: 4m 46s
	Train Loss: 1.81232e-07 
	 Val. Loss: 7.53600e-08 
train tota

In [13]:
# Fold / city selection for this run; both values are substituted into the CSV
# file names below and into the checkpoint name used by the training cell.
FOLD = 2
CITY = 3
# Batch size for both loaders (dataset_iter picks this up as a global).
BATCHSIZE = 40000

# Reset the best validation loss for the new fold; the training loop
# overwrites it whenever an epoch improves on it.
best_valid_loss = float('inf')

# The CSVs are read without a header row. X columns come first and Y columns
# are appended after them inside dataset_iter.
trDataX  = pd.read_csv("fold{}_city{}_trainX.csv".format(FOLD,CITY),header=None)
trDataY  = pd.read_csv("fold{}_city{}_trainY.csv".format(FOLD,CITY),header=None)
train_iterator,valid_iterator = dataset_iter(trDataX,trDataY)

In [14]:
# Sanity-check tensor shapes on the first batch only. The `break` stops the
# loader from collating every remaining batch just to print a single line.
for batch in train_iterator:
    print(len(batch),batch[0].permute(1,0,2)[0:-1].shape,batch[0].permute(1,0,2)[-1].shape)
    break

2 torch.Size([19, 40000, 1]) torch.Size([40000, 1])


In [15]:
# Model hyper-parameters and construction (a fresh model for this fold).

INPUT_DIM = 1     # features per encoder time step
OUTPUT_DIM = 1    # features per decoder time step
ENC_EMB_DIM = 8   # forwarded as emb_dim to Encoder
DEC_EMB_DIM = 8   # forwarded as emb_dim to Decoder
HID_DIM = 64

N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Keyword arguments keep the long positional lists from being mixed up.
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with U(-0.1, 0.1) samples.

    All parameters of ``m`` and of its sub-modules are re-drawn in place,
    whatever their role (weights and biases alike).
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.1, b=0.1)
        
# Re-initialise the freshly built model. NOTE(review): `apply` visits every
# sub-module and `init_weights` walks all parameters below the module it is
# given, so nested parameters appear to be re-drawn once per ancestor; the
# final values still come from the same uniform range.
model.apply(init_weights)

# Adam with its default settings.
optimizer = optim.Adam(model.parameters())

# Mean squared error between predicted and target values.
criterion = nn.MSELoss()

def count_parameters(model):
    """Return how many scalar values ``model`` holds in trainable parameters.

    Parameters with ``requires_grad`` set to ``False`` are left out.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size using the count_parameters helper defined above.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 100,929 trainable parameters


In [16]:
def epoch_time(start_time, end_time):
    """Break the interval between two timestamps into minutes and seconds.

    Both parts are truncated with ``int``: the minutes are the whole minutes
    in the interval and the seconds are what is left after removing them.

    Returns:
        ``(elapsed_mins, elapsed_secs)`` as a tuple of ints.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

# Release cached GPU blocks left over from earlier cells before training.
torch.cuda.empty_cache()

N_EPOCHS = 15
# Maximum gradient norm handed to train().
CLIP = 1

# NOTE(review): best_valid_loss is initialised in the data-loading cell, not
# here. Re-running only this cell therefore keeps comparing against the best
# value from the previous run.
for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One optimisation pass over the training loader, then a pass over the
    # validation loader without teacher forcing.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves; the file name is
    # derived from the current FOLD and CITY.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'fold{}-city{}-model.pt'.format(FOLD,CITY))
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5e} ')
    print(f'\t Val. Loss: {valid_loss:.5e} ')

train total epoch_loss 0.014085469427300268 avarage epoch_loss 0.00010590578516767119
vali total epoch_loss 2.0395440074594262e-05 avarage epoch_loss 1.5334917349318993e-07
Epoch: 01 | Time: 6m 33s
	Train Loss: 1.05906e-04 
	 Val. Loss: 1.53349e-07 
train total epoch_loss 0.0003825760829840874 avarage epoch_loss 2.8765119021359955e-06
vali total epoch_loss 1.5152333737944446e-05 avarage epoch_loss 1.1392732133792816e-07
Epoch: 02 | Time: 6m 34s
	Train Loss: 2.87651e-06 
	 Val. Loss: 1.13927e-07 
train total epoch_loss 0.0001818792622998444 avarage epoch_loss 1.36751325037477e-06
vali total epoch_loss 1.2166082129283495e-05 avarage epoch_loss 9.147430172393605e-08
Epoch: 03 | Time: 6m 33s
	Train Loss: 1.36751e-06 
	 Val. Loss: 9.14743e-08 
train total epoch_loss 9.619569823371421e-05 avarage epoch_loss 7.232759265692798e-07
vali total epoch_loss 1.0591287047390097e-05 avarage epoch_loss 7.963373719842178e-08
Epoch: 04 | Time: 6m 33s
	Train Loss: 7.23276e-07 
	 Val. Loss: 7.96337e-08 
tr