In [1]:
import database
from dataset import Dataset
from ae_test import AutoencoderTest
import torch.optim as optim
import torch
from tqdm import tqdm_notebook
import torch.nn.functional as F

In [2]:
def distance(data, eps=1e-8):
    """Return the mean-normalised pairwise Euclidean distances between rows.

    For every unordered pair of rows ``(i, j)`` with ``i < j`` this computes
    ``sqrt(||data[i] - data[j]||**2 + eps)`` and then divides the whole vector
    by its own mean, so the returned distances always average to 1.

    The pairs are produced directly in condensed form by ``F.pdist`` instead
    of filling a full ``n x n`` matrix row by row and masking its upper
    triangle afterwards; results may differ from the row-by-row version only
    by floating-point rounding.

    Args:
        data: 2-D tensor of shape ``(n, d)``, one point per row. Integer
            tensors are converted to the default floating dtype.
        eps: Constant added to each squared distance before the square root.

    Returns:
        1-D CPU tensor of length ``n * (n - 1) / 2`` with the normalised
        distances in row-major upper-triangle order:
        ``(0, 1), (0, 2), ..., (n - 2, n - 1)``. Empty if ``n < 2``.

    Raises:
        ValueError: If ``data`` is not 2-D.
    """
    if data.dim() != 2:
        raise ValueError(
            "distance() expects a 2-D tensor of shape (n, d), got shape {}".format(tuple(data.shape))
        )
    if not data.is_floating_point():
        # pdist and sqrt need a floating dtype.
        data = data.to(torch.get_default_dtype())
    if data.shape[0] < 2:
        # No pairs exist; avoid taking the mean of an empty tensor.
        return data.new_empty(0).cpu()

    # Condensed pairwise distances, already in the order the upper-triangle
    # mask of the full matrix would have produced.
    pair_dist = F.pdist(data, p=2)
    # Re-apply eps inside the square root to keep the sqrt(d^2 + eps) definition.
    pair_dist = (pair_dist.pow(2) + eps).sqrt().cpu()
    return pair_dist / pair_dist.mean()

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Mini-batch size handed to Dataset.load().
BATCH_SIZE = 64
# Learning rate of the Adam optimiser used in experiment().
LEARNING_RATE = 1e-3

In [4]:
# Database handle shared by every Dataset that experiment() creates.
db = database.DEFAULT_DB()
# NOTE(review): setUp() presumably opens the connection / prepares the handle
# before first use - confirm against the `database` module.
db.setUp()

In [None]:
def experiment(train_dataset_name, test_dataset_name, col=None):
    """Train an autoencoder on one dataset and score it on another.

    A fresh ``AutoencoderTest`` is trained on ``train_dataset_name`` with Adam
    for just over 10000 optimiser steps. The test set is loaded with the
    training set's normaliser and passed through the model, and the score is
    the mean squared error between the mean-normalised pairwise distances of
    the test inputs and those of the model outputs (see ``distance``).

    Args:
        train_dataset_name: Name of the dataset used for training.
        test_dataset_name: Name of the dataset used for evaluation.
        col: Optional column selection applied to both datasets through
            ``Dataset.set_columns``; ``None`` keeps the datasets' own columns.

    Returns:
        0-dim CPU tensor holding the MSE between the two distance vectors.

    Raises:
        ValueError: If the training dataloader yields no batches.
    """
    # load training data
    dataset = Dataset(db, train_dataset_name)
    if col is not None:
        dataset.set_columns(col)
    # NOTE(review): the meaning of the two trailing `1` arguments is defined by
    # Dataset.load and is not visible here - confirm before changing them.
    dataset.load(BATCH_SIZE, 1, 1)

    batches_per_epoch = len(dataset.dataloader)
    if batches_per_epoch == 0:
        raise ValueError(
            "training dataset '{}' produced no batches".format(train_dataset_name)
        )

    # Pick the epoch count so every run performs just over `target_steps`
    # optimiser steps, whatever the dataset size.
    target_steps = 10000
    max_epoch = target_steps // batches_per_epoch + 1
    input_dim = len(dataset.columns)

    # define network
    model = AutoencoderTest(input_dim=input_dim, lw=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # training
    for epoch in tqdm_notebook(range(max_epoch)):
        for batch in dataset.dataloader:
            # Each batch row holds the inputs first, followed by the targets.
            input_data = batch[:, :input_dim].to(device)
            output_data = batch[:, input_dim:].to(device)
            optimizer.zero_grad()
            # model.loss returns a triple; only its second element is optimised.
            _, loss, _ = model.loss(input_data, output_data, dataset.mean)
            loss.backward()
            optimizer.step()

    # load test_data, normalised with the statistics of the training set
    test_dataset = Dataset(db, test_dataset_name)
    if col is not None:
        test_dataset.set_columns(col)
    test_dataset.load(BATCH_SIZE, 1, 1, normalizer=dataset.normalizer)
    test_data = test_dataset.dataloader.dataset[:, :input_dim]

    # inference: switch off training-only layer behaviour and skip building an
    # autograd graph for the whole test set, which is never back-propagated.
    model.eval()
    with torch.no_grad():
        # NOTE(review): the forward output is presumably the latent code -
        # confirm in AutoencoderTest.
        test_z = model(test_data.to(device))

    # Compare the pairwise-distance structure of the inputs and the outputs.
    xd = distance(data=test_data.detach().cpu())
    zd = distance(data=test_z.detach().cpu())
    loss_n = F.mse_loss(xd, zd)
    return loss_n

In [None]:
arr11 = []
arr22 = []
arr12 = []
arr21 = []
arr01 = []
arr02 = []

In [None]:

# (result list, training dataset, test dataset) for each combination, in the
# order they are run within one repetition.
_RUNS = (
    (arr11, 'mrt_in_hour_eng_abbr1108', 'mrt_in_hour_eng_abbr1108'),
    (arr22, 'mrt_in_hour_eng_abbr0801', 'mrt_in_hour_eng_abbr0801'),
    (arr12, 'mrt_in_hour_eng_abbr1108', 'mrt_in_hour_eng_abbr0801'),
    (arr21, 'mrt_in_hour_eng_abbr0801', 'mrt_in_hour_eng_abbr1108'),
    (arr01, 'mrt_in_hour_eng_abbr', 'mrt_in_hour_eng_abbr1108'),
    (arr02, 'mrt_in_hour_eng_abbr', 'mrt_in_hour_eng_abbr0801'),
)

# Repeat every combination 10 times; each call trains a new model from scratch
# and appends its score to the matching result list.
for i in tqdm_notebook(range(10)):
    for _scores, _train_name, _test_name in _RUNS:
        _scores.append(experiment(_train_name, _test_name))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr1108
input_dim: 108


HBox(children=(IntProgress(value=0, max=112), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr1108


HBox(children=(IntProgress(value=0, max=5755), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5755), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr0801
input_dim: 108


HBox(children=(IntProgress(value=0, max=197), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr0801


HBox(children=(IntProgress(value=0, max=3218), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3218), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr1108
input_dim: 108


HBox(children=(IntProgress(value=0, max=112), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr0801


HBox(children=(IntProgress(value=0, max=3218), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3218), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr0801
input_dim: 108


HBox(children=(IntProgress(value=0, max=197), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr1108


HBox(children=(IntProgress(value=0, max=5755), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5755), HTML(value='')))

new dataset: mrt_in_hour_eng_abbr
input_dim: 108


HBox(children=(IntProgress(value=0, max=71), HTML(value='')))

In [None]:
# Disabled: the same six train/test combinations on the
# plastics_and_chemicals_en datasets, restricted to the columns in cur_col.
# cur_col = [
#     'PP','LDPE','HDPE','EG_contract','EG_OUCC','PVC','SM_GPPC','VCM','Butadiene','Nylons',
#     'SM_US_contract','WTI_Futures','SM_US','DubaiCrudeOil','BrentOilFutures','BrentOil','WTI','Ethyl_acetate','Isopropanol','Singapore_RSS3'
# ]
# # experiment('plastics_and_chemicals_en_0713', 'plastics_and_chemicals_en_0713', cur_col)
# experiment('plastics_and_chemicals_en1318', 'plastics_and_chemicals_en1318', cur_col)


# experiment('plastics_and_chemicals_en_0713', 'plastics_and_chemicals_en1318', cur_col)
# experiment('plastics_and_chemicals_en1318', 'plastics_and_chemicals_en_0713', cur_col)

# experiment('plastics_and_chemicals_en', 'plastics_and_chemicals_en1318', cur_col)
# experiment('plastics_and_chemicals_en', 'plastics_and_chemicals_en_0713', cur_col)

In [None]:
# Disabled: the same six train/test combinations on the bike_sharing datasets
# (2011, 2012 and the combined set), restricted to the columns in cur_col.
# cur_col = [
#     'temp','hum','windspeed','casual','registered','cnt'
# ]
# experiment('bike_sharing_2011', 'bike_sharing_2011', cur_col)
# experiment('bike_sharing_2012', 'bike_sharing_2012', cur_col)
           
# experiment('bike_sharing_2011', 'bike_sharing_2012', cur_col)
# experiment('bike_sharing_2012', 'bike_sharing_2011', cur_col)
           
# experiment('bike_sharing', 'bike_sharing_2012', cur_col)
# experiment('bike_sharing', 'bike_sharing_2011', cur_col)