In [1]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/4

# Build the Dataset. We are going to generate a simple data set and then we will read it.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. 

In [1]:
import collections, gc
import logging
import math
import os

import igraph as ig
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset

from scripts.utils.hierarchy import *
from scripts.utils.processing import *
from scripts.utils.data_reading import *

# Emit INFO-level records so the progress messages logged by the training loop are shown.
logging.basicConfig(level=logging.INFO )

In [2]:
# Length of each node vector w_n: passed to hierarchy_vectors and reused as the
# model's output width (num_classes). Chosen as roughly log2(number of classes).
n = 16 # wn vector size  --> ~log_{2}(num_classes)

In [3]:
# Number of CUDA devices visible to this process.
num_gpus = torch.cuda.device_count()

In [4]:
# Use the GPU only when CUDA is available and at least one device is visible;
# otherwise fall back to the CPU.
device = torch.device("cuda" if (torch.cuda.is_available() and num_gpus > 0) else "cpu")

In [5]:
# Display the selected device.
device

device(type='cuda')

In [6]:
# N, T_leaves & PI_parents have to be present globally (N is the list of all the labels);
# update_non_leaf_wn below reads N, p2c_table, c2p_table and node2vec as globals.
# (original note) one_hot_labels is kept because it is accessed for every document <1082>
# lookup_table is a project helper; presumably it parses the hierarchy file into
# parent->children / child->parents tables and id maps -- TODO confirm.
p2c_table, c2p_table, node2id, id2node, PI_parents, T_leaves, N = lookup_table("swiki/data/cat_hier.txt", subset = False)
graph_obj = hierarchy2graph(p2c_table, node2id)
# node2vec and w_pi_vec are later indexed by label id inside DatasetSWIKI.__getitem__.
node2vec, w_pi_vec = hierarchy_vectors(graph_obj, id2node, p2c_table, c2p_table, n, device, True)

65333it [00:00, 247599.88it/s]


In [7]:
# order_mapping is used by DatasetSWIKI to turn a label id into an index into binary_yin.
order_mapping = generate_order_mapping(N, False)
# rev_order_mapping is used at evaluation time to look up the decoded integer prediction.
rev_order_mapping = generate_order_mapping(N, True)
# Per-label target vectors fed to the loss (see the y_in shape printed by the sanity-check cell).
binary_yin = generate_binary_yin(N, device)

100%|██████████| 50312/50312 [00:00<00:00, 153526.58it/s]


In [8]:
# Seed PyTorch's CPU generator and every CUDA generator in one call.
# Seeding only the CUDA generator leaves CPU-side randomness unseeded, e.g. the
# DataLoader shuffle and the nn.Linear initialisation that happens before
# .to(device), so runs would not be reproducible.
torch.manual_seed(123);

In [9]:
def update_non_leaf_wn(label_id):
    '''
    Recompute the vector of a non-leaf node from its parent and its children.

    The result is the mean of the parent's vector and all child vectors:
    (w_pi + sum of child vectors) / (number of children + 1).

    Reads the globals N, p2c_table, c2p_table and node2vec; the value is
    returned only, node2vec itself is not written to.

    Args:
        label_id: id of the node to update. It must be contained in N. The
            function is meant for non-leaf nodes only (this is not checked).

    Returns:
        The averaged vector for label_id.

    Raises:
        AssertionError: if label_id is not in N.
    '''

    assert label_id in N, "{} is not a node".format(label_id)

    child_ids = p2c_table[label_id]

    # Only the first parent listed for the node is used.
    w_pi = node2vec[c2p_table[label_id][0]]

    # Start from 0.0 so that a node without children contributes nothing.
    sum_wc = sum((node2vec[idx] for idx in child_ids), 0.0)

    return 1/(len(child_ids) + 1) * (w_pi + sum_wc)

In [10]:
class DatasetSWIKI(Dataset):
    '''
    Map-style dataset over the documents and labels returned by lower_dim.

    Every item is the tuple (document, label, w_n, y_in, l2_reg):
      * document -- the row self.data[index] as a float32 tensor on `device`
      * label    -- the float32 label tensor for that row
      * w_n      -- node vector looked up in node2vec for the label
      * y_in     -- row of binary_yin selected through order_mapping
      * l2_reg   -- Euclidean distance between w_n and the label's w_pi vector

    Relies on the module-level globals device, node2vec, w_pi_vec, binary_yin
    and order_mapping.
    '''

    def __init__(self, file_path, reduce = True, n_components = 128):
        '''
        Args:
            file_path: path handed to lower_dim.
            reduce: forwarded to lower_dim.
            n_components: forwarded to lower_dim.
        '''
        self.reduce = reduce
        self.n_components = n_components

        self.data, self.labels = lower_dim(file_path, reduce, n_components)
        self.labels = torch.as_tensor(self.labels, device = device, dtype = torch.float32)

        # Shared lookup structures built once at module level.
        self.w_vec = node2vec
        self.w_pi_vec = w_pi_vec
        self.y_in = binary_yin

    def __len__(self):
        '''Number of documents (rows of self.data).'''
        return self.data.shape[0]

    def __getitem__(self, index):
        '''Return (document, label, w_n, y_in, l2_reg) for the row at `index`.'''
        # assumes self.data[index] exposes .todense() (e.g. a scipy sparse row) -- TODO confirm for reduce=True
        document = torch.as_tensor(self.data[index].todense(), device = device, dtype = torch.float32)

        label = self.labels[index]

        # .item() copies the value from the device to the host, so do it once.
        # The label is stored as float32; convert to int so it is a valid
        # index/key for list-, tensor- and dict-like lookups alike.
        label_id = int(label.item())

        w_n = self.w_vec[label_id]
        w_pi = self.w_pi_vec[label_id]
        y_in = self.y_in[order_mapping[label_id]]

        l2_reg = torch.sqrt(torch.sum((w_n - w_pi) ** 2)).to(device)

        return document, label, w_n, y_in, l2_reg

In [11]:
# Forwarded to lower_dim through DatasetSWIKI; with reduce=False below it is
# presumably unused -- TODO confirm against lower_dim.
n_components = 1200

In [12]:
# NOTE(review): the training split is read from "valid_remapped.txt" -- confirm
# this is intentional and not meant to point at a train file.
train_data = DatasetSWIKI("swiki/data/valid_remapped.txt", reduce=False, n_components = n_components)
valid_data = DatasetSWIKI("swiki/data/valid_small_remapped.txt", reduce=False, n_components = n_components)

In [13]:
# Number of training documents.
len(train_data)

252757

In [14]:
# Shape of a single row from each split; the second entry is the feature count used as input_size.
train_data.data[0].shape, valid_data.data[0].shape

((1, 2085161), (1, 2085161))

In [15]:
# Per the outputs recorded in this notebook, 251 divides the 252757 training
# documents exactly (1007 batches), so every training batch is full.
batch_size = 251

In [16]:
# Shuffle only the training split; validation batches keep the dataset order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [17]:
# Pull a single batch to sanity-check shapes, dtypes and devices before training.
train_iter = iter(train_loader)

# Use the built-in next(): the Python-2 style `.next()` method does not exist
# on current DataLoader iterators, which only implement `__next__`.
doc, labs, w_n, y_in, l2 = next(train_iter)

print('docs shape on batch size = {}, {}, {}'.format(doc.shape, doc.dtype, doc.device))
print('label shape on batch size = {}, {}, {}'.format(labs.shape, labs.dtype, labs.device))
print('w_n shape on batch size = {}, {}, {}'.format(w_n.shape, w_n.dtype, w_n.device))
print('y_in shape on batch size = {}, {}, {}'.format(y_in.shape, y_in.dtype, y_in.device))
print('l2-reg shape on batch size = {}, {}, {}'.format(l2.shape, l2.dtype, l2.device))

docs shape on batch size = torch.Size([128, 1, 2085161]), torch.float32, cuda:0
label shape on batch size = torch.Size([128]), torch.float32, cuda:0
w_n shape on batch size = torch.Size([128, 16]), torch.float32, cuda:0
y_in shape on batch size = torch.Size([128, 16]), torch.float32, cuda:0
l2-reg shape on batch size = torch.Size([128]), torch.float32, cuda:0


In [17]:
# Hyper Parameters
# input_size = train_data.data.shape[1] #128 n_components
# Feature count of one document row (2085161 according to the shapes printed above).
input_size = train_data.data[0].shape[1] #2085161

# The model predicts an n-dimensional code per document rather than one logit per class.
num_classes = n #50312 --> n (16)
num_epochs = 10 # TRAIN IT FOR A LOT OF EPOCHS in case of lbfgs (2nd order method) else less is more
learning_rate = 0.0001 #1e-4, 0.0005

In [18]:
# Display the cuDNN version PyTorch is using.
torch.backends.cudnn.version()

7402

In [19]:
# Turn on the cuDNN benchmark flag (cuDNN auto-tuning of algorithms).
torch.backends.cudnn.benchmark = True

In [20]:
# Model
class LogisticRegression(nn.Module):
    """
    Single bias-free linear layer whose output is multiplied by the label vector.

    The layer maps `input_size` features to `num_classes` outputs; both sizes
    are read from module-level globals when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(input_size, num_classes, bias=False)

    def forward(self, x, wn):
        """Return the element-wise product of the negated projection of `x` with `wn`."""
        projected = self.linear1(x)
        return -projected * wn

In [21]:
# Build the model (it reads the globals input_size / num_classes) and move it to the selected device.
model = LogisticRegression().to(device)

In [22]:
# Soft-margin loss averaged over all elements; the training loop applies it to
# the model output and the y_in targets.
criterion = nn.SoftMarginLoss(reduction='mean').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)

In [23]:
# Number of batches per epoch; used as the denominator in the progress log.
total_step = len(train_loader)

In [24]:
total_step

1007

In [25]:
# Release cached, currently unused GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [26]:
# The checkpoint file name encodes batch size, learning rate and input width,
# so runs with different settings do not overwrite each other.
checkpoint_path = "batch_{}_{}_{}_train_valid_model.pt".format(batch_size, learning_rate, input_size)

In [27]:
# Restore model and optimizer state from a previous run when a checkpoint exists.
if os.path.exists(checkpoint_path):
    # map_location remaps the stored tensors onto the current `device`, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine
    # (and vice versa).
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # NOTE(review): the training cell re-initialises `losses` and its own
    # counters, so the values restored below are not used to resume the
    # bookkeeping -- only the model/optimizer state carries over.
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    losses = checkpoint['losses']
    step = checkpoint['step']

In [None]:
# Training the Model
import matplotlib.pyplot as plt
%matplotlib inline

losses = []
epoch = 0
for step in range(num_epochs):  
    for document, _, labels, y_ins, l2_reg in tqdm(train_loader):

        document = document.reshape(batch_size,-1)
        
        optimizer.param_groups[0]['weight_decay'] = torch.mean(l2_reg)
        optimizer.zero_grad()
        
        w_xi = model.forward(document, labels)
        loss1 = criterion(w_xi, y_ins)
        
        if (epoch+1) % 100 == 0: 
            logging.info('Epoch [{}/{}], step:[{}/{}], loss: {:.6f}'.format(step+1, num_epochs, epoch+1, total_step, loss1.item()))
            torch.cuda.empty_cache()


        losses.append(loss1.item())
        loss1.backward()
        optimizer.step()
        epoch += 1
#         if type(optimizer) != torch.optim.LBFGS:
#         else:
#             def closure():               
#                 optimizer.zero_grad()
#                 w_xi = model(document, labels)
#                 loss1 = criterion(w_xi, y_ins) + l2_reg
                
#                 if (step+1) % 100 == 0: 
#                     print ('Epoch [{}/{}], step:[{}/{}], loss: {:.6f}'.format(epoch+1, num_epochs, step+1, total_step, loss1.item()))
#                     torch.cuda.empty_cache()
                
#                 losses.append(loss1.item())
#                 loss1.backward()
#                 return loss1
#             optimizer.step(closure)

    torch.save({
    'epoch': step,
    'step': epoch,
    'losses': losses,
    'node2vec': node2vec,
    'learning_rate': learning_rate,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss1}, checkpoint_path)
    plt.plot(losses);


 10%|▉         | 99/1007 [04:03<36:59,  2.44s/it]INFO:root:Epoch [1/10], step:[100/1007], loss: 0.684101
 20%|█▉        | 199/1007 [08:07<32:49,  2.44s/it]INFO:root:Epoch [1/10], step:[200/1007], loss: 0.679254
 25%|██▌       | 254/1007 [10:22<30:43,  2.45s/it]

In [None]:
# TODO: also persist node2vec on its own (e.g. as a pickle file) so that the original classes can be recovered from the predicted vectors.

In [None]:
# Plot the per-batch training loss collected in `losses` by the training cell.
import matplotlib.pyplot as plt
%matplotlib inline

# The trailing semicolon suppresses the textual repr of the returned line objects.
plt.plot(losses);

In [None]:
def converter(input_, mode_in="binary", mode_out="decimal"):
    """
    Convert an array of binary digits (most significant bit first) to an int.

    The digits are read directly from the array instead of from its printed
    representation, so the conversion also works for arrays that NumPy would
    wrap over several lines or abbreviate with "...", and for bool/float
    arrays holding only 0/1 values.

    Args:
        input_: array-like of 0/1 values; it is flattened before conversion.
        mode_in: input encoding; only "binary" is supported.
        mode_out: output encoding; only "decimal" is supported.

    Returns:
        The integer whose base-2 representation is given by the digits.

    Raises:
        ValueError: for an unsupported mode combination, an empty input, or
            values other than 0 and 1.
    """
    if mode_in != "binary" or mode_out != "decimal":
        raise ValueError(
            "unsupported conversion: {} -> {}".format(mode_in, mode_out))

    bits = np.asarray(input_).ravel()
    if bits.size == 0:
        raise ValueError("cannot convert an empty bit array")
    if not np.isin(bits, (0, 1)).all():
        raise ValueError("input_ must contain only 0/1 values")

    return int("".join(str(int(bit)) for bit in bits), 2)

In [None]:
# Decode and print the prediction for the first validation document (inspection only).
with torch.no_grad():
    # DatasetSWIKI.__getitem__ yields exactly five values:
    # (document, label, w_n, y_in, l2_reg). Unpacking six raises ValueError.
    for document, all_label, w_n, _, _ in valid_data:

        # change document size: right-pad with zeros when the document has
        # fewer features than the model expects.
        # NOTE(review): a document wider than input_size would make the
        # padding length negative and fail -- confirm this cannot happen.
        if document.shape[1] != input_size:
            docx = document.view(-1)
            zeros = torch.zeros((input_size-docx.shape[0],), device=device, dtype=torch.float32)
            document = torch.cat((docx, zeros),0).view(-1,)

        output = model(document, w_n)

        print((output))
        # Threshold every output at zero to obtain the predicted bit pattern.
        trans_out = torch.where(torch.tanh(output)>=0, torch.tensor(1, device=device), torch.tensor(0, device=device)).view(-1,)
        o = trans_out.cpu().numpy()
        print(trans_out)
        # Bits -> integer, then look the integer up in the reverse order mapping.
        dec_o = converter(input_ = o)
        print(rev_order_mapping[dec_o])
        print(int(all_label.item()))
        # todo: multilabel prediction
        # todo: see how much it differs from the original (root node) or something else
        # todo: compare against the y_in target and accumulate correct/total for an accuracy figure

        break

In [None]:
# Save the model checkpoint: only the model's state_dict is written here
# (no optimizer state or loss history, unlike the per-epoch training checkpoint).
torch.save(model.state_dict(), 'test_valid_model.ckpt')

In [None]:
# Fit a 32-component UMAP model on the raw training matrix and time the fit.
import umap # fit should get a sparse matrix
%time trans = umap.UMAP(n_neighbors=5, random_state=42, n_components=32, verbose=True).fit(train_data.data)
# Display the fitted embedding (presumably one row per training document -- TODO confirm).
trans.embedding_