In [11]:
import sys
import argparse
import os
import torch
from torch.autograd import Variable
from tqdm import tqdm
from time import time
import shutil

In [12]:
from torch.utils.tensorboard import SummaryWriter

In [13]:
from configs.config_utils import CONFIG, read_to_dict, mount_external_config
from net_utils.utils import load_dataloader
from adl_scripts.MLP_Regressor import MLP_Regressor

In [30]:
def parse_args():
    '''PARAMETERS'''
    parser = argparse.ArgumentParser('Pose2Room.')
    parser.add_argument('--config', type=str, default='configs/config_files/p2rnet_train.yaml',
                        help='configure file for training or testing.')
    parser.add_argument('--mode', type=str, default='train', help='train, test or demo.')
    parser.add_argument('--demo_path', type=str, default='demo', help='Please specify the demo path.')
    
    args, unknown = parser.parse_known_args()
    return args

In [24]:
def train_epoch(dataloader, optimizer, model, loss_func):
    model.train()
    
    device = torch.device("cuda")
    current_loss = 0.0
    for batch_data in dataloader:
        inputs, targets = batch_data['adl_input'], batch_data['adl_output']
        inputs = Variable(inputs.to(device))
        targets = Variable(targets.to(device))
        
        optimizer.zero_grad()
        
        predictions = model(inputs)
        
        loss = loss_func(predictions, targets)
        
        loss.backward()
        optimizer.step()    
        
        current_loss += loss.item()
    
    return current_loss

In [25]:
def validate_epoch(dataloader, model, loss_func):
    model.eval()
    device = torch.device("cuda")
    current_loss = 0.0
    with torch.no_grad():
        for batch_data in dataloader:
            inputs, targets = batch_data['adl_input'], batch_data['adl_output']
            inputs = inputs.to(device)
            targets = targets.to(device)
            predictions = model(inputs)
            loss = loss_func(predictions, targets)
            
            current_loss += loss.item()
            
    return current_loss

In [31]:
writer = SummaryWriter()

args = parse_args()
config = read_to_dict('/home/gogebakan/workspace/Pose2Room/configs/config_files/p2rnet_train.yaml')
# initiate device environments
os.environ["CUDA_VISIBLE_DEVICES"] = config['device']['gpu_ids']
from net_utils.utils import initiate_environment, get_sha
config = initiate_environment(config)

# initialize config
cfg = CONFIG(args, config)
cfg.update_config(args.__dict__)
'''Configuration'''
cfg.log_string('Loading configurations.')
cfg.log_string("git:\n  {}\n".format(get_sha()))
cfg.log_string(cfg.config)
cfg.write_config()

'''Mount external config data'''
dataset = 'MLP'
cfg = mount_external_config(cfg)
train_loader = load_dataloader(cfg, mode='train', dataset=dataset).dataloader
validation_loader = load_dataloader(cfg, mode='val', dataset=dataset).dataloader

Not using distributed mode


In [35]:
device = torch.device("cuda")
epochs = 50
val_epoch = 3
nearest_k_frames = 10
input_size = 8 + 2*nearest_k_frames*256
output_size = 1024
layer_sizes = [2048]
model = MLP_Regressor(input_size=input_size, output_size=output_size, layer_sizes=layer_sizes)

model = model.to(device)
model.train()

l2_loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

best_loss = 10*5
patience = 2
early_stop_counter = 0

for epoch in range(epochs):
    #train
    start = time()
    train_loss = train_epoch(train_loader, optimizer, model, l2_loss)
    end = time()
    epoch_time = end-start
    writer.add_scalar("Loss/train_epoch", train_loss, epoch)
    
    # validation
    if epoch % val_epoch == 0:
        validation_loss = validate_epoch(validation_loader, model, l2_loss)
        print(f'Epoch {epoch}\tTraining loss:\t{train_loss:.5f}\t{epoch_time:.2f}s\tValidation loss:\t{validation_loss:.5f}\t{epoch_time:.2f}s')
        
        writer.add_scalar("Loss/validation_epoch", validation_loss, epoch)
        if validation_loss > best_loss:
            early_stop_counter += 1
            if early_stop_counter == patience:
                print(f'Early stopping with best validation_loss: {best_loss}')
                break
        else:
            best_loss = validation_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), f'saved_models/checkpoint.pt')
                
    else:
        print(f'Epoch {epoch}\tTraining loss:\t{train_loss:.5f}\t{epoch_time:.2f}s')
    
writer.flush()
writer.close()

Epoch 0	Training loss:	46.16708	1.63s	Validation loss:	53.07045	1.63s
Epoch 1	Training loss:	9.02832	1.54s
Epoch 2	Training loss:	5.90941	1.54s
Epoch 3	Training loss:	5.35951	1.54s	Validation loss:	22.31247	1.54s
Epoch 4	Training loss:	5.04022	1.50s
Epoch 5	Training loss:	4.82734	1.55s
Epoch 6	Training loss:	4.70199	1.49s	Validation loss:	20.28049	1.49s
Epoch 7	Training loss:	4.57219	1.50s
Epoch 8	Training loss:	4.43189	1.50s
Epoch 9	Training loss:	4.37441	1.51s	Validation loss:	19.64477	1.51s
Epoch 10	Training loss:	4.29454	1.55s
Epoch 11	Training loss:	4.22552	1.49s
Epoch 12	Training loss:	4.09942	1.57s	Validation loss:	19.13207	1.57s
Epoch 13	Training loss:	4.02178	1.64s
Epoch 14	Training loss:	3.92822	1.50s
Epoch 15	Training loss:	3.87207	1.55s	Validation loss:	18.23363	1.55s
Epoch 16	Training loss:	3.76613	1.55s
Epoch 17	Training loss:	3.75881	1.58s
Epoch 18	Training loss:	3.65466	1.55s	Validation loss:	17.72648	1.55s
Epoch 19	Training loss:	3.56955	1.60s
Epoch 20	Training loss:	3