In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torch_data
import torch.optim as optim

In [2]:
# Import local dependencies
import sys
sys.path.insert(0, "../src")
from collection import Collection

In [3]:
#%load_ext blackcellmagic

In [4]:
class Dataset(torch_data.Dataset):
    def __init__(self, data_list, pos_list):
        self.data = data_list
        self.pos = pos_list

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.pos[index]

In [5]:
def list_of_tensors(data):
    tensor_list = []
    for pl in data:
        tensor_list.append(torch.tensor(pl, dtype=torch.float64))
    return tensor_list


def load_test_data(posting_length_to_use):
    # Load data
    test_collection = Collection("../test_data/test_collection")
    posting_lists = []
    for _, pl in enumerate(test_collection):
        if len(pl[0]) >= posting_length_to_use:
            posting_lists.append(np.array(pl[0], dtype=np.int))
    #posting_lists.sort(key=lambda x: np.shape(x)[0], reverse=True)

    data = list_of_tensors(posting_lists)
    return data

In [6]:
class Network(nn.Module):
    def __init__(self, num_hidden, hidden_size):
        """
        num_hidden: number of hidden layers
        hidden_size: a list of the sizes (num of neurons) of the hidden layers
        """
        super(Network, self).__init__()
        self.num_hidden = num_hidden
        self.fc = []    # fc layers
        self.relu = []  # ReLU activations
        input_size = 1  # size of the previous layer (input of current layer)
        for fc_idx in range(num_hidden):
            # add fc layer and ReLU activation
            self.fc.append(nn.Linear(input_size, hidden_size[fc_idx]))
            self.relu.append(nn.ReLU())
            # input of next layer should be output of this layer
            input_size = hidden_size[fc_idx]
        # the layer should always have 1-dim output
        self.last = nn.Linear(input_size, 1)

    def forward(self, x):
        out = x
        for fc_idx in range(self.num_hidden):
            out = self.fc[fc_idx](out)
            out = self.relu[fc_idx](out)
        out = self.last(out)
        return out

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
torch.set_default_tensor_type(torch.DoubleTensor)

cpu


In [8]:
data = load_test_data(128)
print(len(data))
pl = data[0]
print(pl.shape)

3836
torch.Size([2021])


In [9]:
len_pl = len(pl)
pos_list = torch.arange(len_pl, dtype=torch.float64).to(device)
data_list = pl
ds = [[Dataset(data_list, pos_list)]]
#ds = [[Dataset(pos_list, data_list)]]

In [20]:
def load_data(csv_source):
    """
    load data in csv file
    """
    data = []
    pos = []
    with open(csv_source) as ds:
        for line in ds:
            dp = line.rstrip().split(',')
            # each data/index should be a 1x1 PyTorch tensor
            data.append(torch.tensor([float(dp[0])]))
            pos.append(torch.tensor([float(dp[1])]))
    return data, pos

In [21]:
data, pos = load_data("random.csv")
ds = [[Dataset(data, pos)]]

In [22]:
# max of index (position), used to determine next-stage model
max_pos = len(data)
#max_pos = len(pl)
#max_pos = pl[-1]

# model is a 2-dim list, entry i, j is the model for stage i, model j
models = []

# num_model is a tuple, entry i is the number of models for stage i
#num_model = [1]
num_model = (1, 10)
#num_model = (1, 10, 10)
#num_model = (1, 10, 10)

# model_params is a tuple, entry i is the params of models in stage i
# each entry specifies (num of hidden layers, size of each hidden layer)
#model_params = ((2, [2000, 67]), (2, [1000, 500]))
#model_params = [(3, [4, 8, 8])]
model_params = ((2, [4, 8]), (2, [4, 8]))
#model_params = [(1, [4])]
#model_params = [(2, [4, 8]), (2, [4, 8]),(2, [4, 8])]
#print(model_params[0][0])
# number of stages
num_stage = len(num_model)

In [23]:
batch_size = 64
epochs = 5000

In [24]:
for stage_idx in range(num_stage):
    models.append([])
    
    # if it's the last stage, we don't need to prepare datasets for the
    # next stage. or, we need to initialize datasets for the next stage
    if stage_idx != num_stage - 1:
        next_data = [[] for i in range(num_model[stage_idx + 1])]
        next_pos = [[] for i in range(num_model[stage_idx + 1])]

    for model_idx in range(num_model[stage_idx]):

        # initialize a model
        model = Network(model_params[stage_idx][0], model_params[stage_idx][1]).to(
            device
        )
        # use MSE loss for training
        criterion = nn.MSELoss()
        #criterion = nn.L1Loss()
        # use Adam algo for training
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        #optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        # load dataset
        data_gen = torch_data.DataLoader(
            ds[stage_idx][model_idx], batch_size=batch_size, shuffle=False
        )
        # we will stop training when loss stops decreasing
        last_loss = float("inf")

        print(
            "Stage={}, Model={}, {} data points".format(
                stage_idx, model_idx, len(ds[stage_idx][model_idx])
            )
        )
        for epoch in range(epochs + 1):

            # train model
            for local_data, local_pos in data_gen:
                local_data, local_pos = local_data.to(device), local_pos.to(device)
                # feedforward
                outputs = model(local_data)
                # calc loss
                loss = criterion(outputs, local_pos)
                # back propagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
            # calculate and print loss for this model at this epoch
            with torch.set_grad_enabled(False):
                loss_tot = 0.0
                for local_data, local_pos in data_gen:
                    local_data, local_pos = local_data.to(device), local_pos.to(device)
                    outputs = model(local_data)
                    loss = criterion(outputs, local_pos)
                    loss_tot += loss.item()
                   

                print("Loss:", loss_tot / len(ds[stage_idx][model_idx]))
                #print("Loss:{}, last_loss={}, loss_tot={}, epoch={}"
                #      .format(loss_tot / len(ds[stage_idx][model_idx]), last_loss, loss_tot, epoch))
                # if lost stops decreasing, just stop training

#                 if last_loss == 0:
#                     break
                    
#                 if last_loss < 0.001:
#                     break
                    
#                if (loss_tot / len(ds[stage_idx][model_idx]) < 100) or epoch > 2000:
                if (last_loss - loss_tot) / last_loss < 0.001:
                    break
                else:
                    last_loss = loss_tot

 
        # append the model we just trained to model tree
        models[stage_idx].append(model)

        # prepare datasets for the next stage. only when we're not at the last stage
        if stage_idx != num_stage - 1:
            
            # load the datapoints in current set one by one. we need to assign each of them to a
            # model in the next stage
            data_gen = torch_data.DataLoader(
                ds[stage_idx][model_idx], batch_size=1, shuffle=False
            )
            
            for local_data, local_pos in data_gen:
                local_data, local_pos = local_data.to(device), local_pos.to(device)
                
                # calculate which model in the next stage to assign to
                # model_idx = output * num_model_next_stage / max_position
                
                output = model(local_data)
                
                model_sel = int(output.item() * num_model[stage_idx + 1] / max_pos)
                
                if model_sel >= num_model[stage_idx + 1]:
                    model_sel = num_model[stage_idx + 1] - 1
                elif model_sel <= 0:
                    model_sel = 0
                # append this datapoint to corresponding dataset
                next_data[model_sel].append(local_data)
                next_pos[model_sel].append(local_pos)

    # create the Dataset objects for the next stage
    if stage_idx != num_stage - 1:
        ds.append([])
        for next_model_idx in range(num_model[stage_idx + 1]):
            ds[stage_idx + 1].append(
                Dataset(next_data[next_model_idx], next_pos[next_model_idx])
            )

Stage=0, Model=0, 100000 data points
Loss: 81.91110741190667
Loss: 79.11101674089831
Loss: 77.1803571456556
Loss: 81.78773118638419
Stage=1, Model=0, 10066 data points
Loss: 518.732087817916
Loss: 27.021529306656213
Loss: 26.91156672758241
Loss: 26.910463223642182
Stage=1, Model=1, 9886 data points
Loss: 3572097.717160348
Loss: 3470537.310600406
Loss: 3373711.377673097
Loss: 3279849.190243229
Loss: 3188373.187698886
Loss: 3099004.5220522434
Loss: 3011576.6705322918
Loss: 2925975.825762341
Loss: 2842116.860895909
Loss: 2759932.0625771126
Loss: 2679365.2447879845
Loss: 2600368.405628545
Loss: 2522899.69415057
Loss: 2446922.099301396
Loss: 2372402.559278876
Loss: 2299311.32707851
Loss: 2227621.4984267238
Loss: 2157308.6463310504
Loss: 2088350.5279647997
Loss: 2020726.8422087042
Loss: 1954419.0238060467
Loss: 1889410.064837544
Loss: 1825684.3572442362
Loss: 1763227.5520851463
Loss: 1702026.4325049256
Loss: 1642068.7982441776
Loss: 1583343.3601010654
Loss: 1525839.6431422466
Loss: 1469547.8

In [25]:
# testing. load datapoints one by one
test_ds = Dataset(data_list, pos_list)
#test_ds = Dataset(pos_list, data_list)
test_gen = torch_data.DataLoader(test_ds, batch_size=1, shuffle=False)
err_tot = 0
for local_data, local_pos in test_gen:
    local_data, local_pos = local_data.to(device), local_pos.to(device)
    model_sel = 0
    for stage_idx in range(num_stage):
        model = models[stage_idx][model_sel]
        output = model(local_data)
        # if it's not the last stage, the output determines which model
        # in the next stage to use
        if stage_idx != num_stage - 1:
            model_sel = int(output.item() * num_model[stage_idx + 1] / max_pos)
            if model_sel >= num_model[stage_idx + 1]:
                model_sel = num_model[stage_idx + 1] - 1
            elif model_sel <= 0:
                model_sel = 0
        # if it's the last layer, the output is the position (index)
        else:
            err_tot += abs(int(output.item()) - int(local_pos.item()))

print("Final Loss:", float(err_tot) / len(test_ds))

Final Loss: 626.5601187530925
