In [1]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/4

# Build the Dataset: load the SWIKI documents and look up the hierarchy vectors for each label.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. 

In [2]:
import collections, gc
import logging
import math
import os

import igraph as ig
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset

from scripts.utils.hierarchy import *
from scripts.utils.processing import *
from scripts.utils.data_reading import *

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO )

In [3]:
# Size of each node weight vector (w_n); chosen as roughly log2(num_classes).
n = 16 # wn vector size  --> ~log_{2}(num_classes)

In [4]:
# Number of CUDA devices PyTorch can see.
num_gpus = torch.cuda.device_count()

In [5]:
# Use CUDA when PyTorch reports it available and at least one GPU is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if (torch.cuda.is_available() and num_gpus > 0) else "cpu")

In [6]:
# Echo the selected device as a sanity check.
device

device(type='cuda')

In [7]:
# Load the category hierarchy and derive the lookup tables used throughout the notebook.
# p2c_table, c2p_table, T_leaves, N and node2vec are read as module-level globals by
# too_hot_mapping and update_non_leaf_wn, so this cell must run before those are called.
# NOTE(review): presumably N is the collection of all label ids, T_leaves the leaf ids and
# PI_parents the parent ids -- confirm against lookup_table.
p2c_table, c2p_table, node2id, id2node, PI_parents, T_leaves, N = lookup_table("swiki/data/cat_hier.txt", subset = False)
# Build a graph object from the parent->children table.
graph_obj = hierarchy2graph(p2c_table, node2id)
# One n-dimensional vector per node, created on `device`.
node2vec = hierarchy_vectors(graph_obj, id2node, p2c_table, n, device)

65333it [00:00, 239454.71it/s]


In [8]:
# Both objects are read as globals by too_hot_mapping:
#   order_mapping[label] gives an integer position for a label, and
#   binary_yin[position - 1] gives the y_in target for that label.
# NOTE(review): the exact encoding stored in binary_yin is defined in generate_binary_yin -- confirm there.
order_mapping = generate_order_mapping(N)
binary_yin = generate_binary_yin(N, device)

100%|██████████| 50312/50312 [00:00<00:00, 210219.94it/s]


In [9]:
# Seed every RNG this notebook draws from. torch.manual_seed seeds the CPU generator
# (used for DataLoader shuffling and nn.Linear initialisation) as well as all CUDA
# devices; torch.cuda.manual_seed alone only seeded the current GPU.
# NOTE(review): node2vec is built in an earlier cell, so any randomness inside
# hierarchy_vectors is not covered by this seed -- move seeding earlier if that matters.
torch.manual_seed(123)
np.random.seed(123)

In [10]:
def too_hot_mapping(label):
    """Look up the hierarchy tensors that belong to one label id.

    Reads the module-level globals node2vec, c2p_table, T_leaves, order_mapping
    and binary_yin, so those must be defined before this is called.

    Args:
        label: label id used as a key into node2vec / c2p_table.

    Returns:
        A tuple (w_n, w_pi, y_in) where
          * w_n  is node2vec[label],
          * w_pi is the node2vec entry of the first parent listed in c2p_table[label],
          * y_in is binary_yin[order_mapping[label] - 1] when label is in T_leaves,
            otherwise an empty list.

    Raises:
        KeyError: if label has no parent entry in c2p_table. Lookup errors from
            node2vec / order_mapping / binary_yin propagate unchanged instead of
            being swallowed, which previously surfaced as an UnboundLocalError.
    """
    w_n = node2vec[label]

    if label not in c2p_table:
        raise KeyError("label {} has no parent entry in c2p_table".format(label))
    w_pi = node2vec[c2p_table[label][0]]

    # Only leaf labels have a y_in target; other labels keep the empty placeholder.
    y_in = []
    if label in T_leaves:
        y_in = binary_yin[order_mapping[label] - 1]

    return w_n, w_pi, y_in

In [11]:
def update_non_leaf_wn(label_id):
    '''
    Compute the averaged vector for a non-leaf node from its parent and children.

    Reads the module-level globals N, p2c_table, c2p_table and node2vec and returns
    1/(number of children + 1) * (w_pi + sum of the children's vectors), where w_pi
    is the vector of the first parent listed in c2p_table[label_id]. The result is
    returned only; node2vec is not written to.

    accepts label ids only which are non-leaf nodes (the id needs an entry in both
    p2c_table and c2p_table). The node's own current vector is not needed for the
    formula, so it is no longer looked up.

    Raises AssertionError when label_id is not in N.
    '''

    assert label_id in N, "{} is not a node".format(label_id)

    C_ids = p2c_table[label_id]
    w_pi = node2vec[c2p_table[label_id][0]]

    # Start from a float so the first addition creates a fresh accumulator
    # rather than modifying a node2vec entry.
    sum_wc = 0.0
    for idx in C_ids:
        sum_wc += node2vec[idx]

    return 1/(len(C_ids) + 1) * (w_pi + sum_wc)

In [12]:
class DatasetSWIKI(Dataset):
    """Dataset yielding one document together with its label's hierarchy tensors.

    Relies on the module-level globals `device` and `too_hot_mapping`.

    Args:
        file_path: path handed to lower_dim, which returns (data, labels).
        reduce: forwarded to lower_dim. When True, rows of self.data are converted
            to tensors as-is; when False, each row is densified with .todense()
            inside __getitem__.
        n_components: forwarded to lower_dim.
    """

    def __init__(self, file_path, reduce = True, n_components = 128):
        self.reduce = reduce
        self.n_components = n_components
        self.data, self.labels = lower_dim(file_path, reduce, n_components)

    def __len__(self):
        """Number of documents (first dimension of self.data)."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return (document, label, w_n, w_pi, y_in, l2_reg) for one document.

        document and label are float32 tensors on `device`; w_n, w_pi and y_in come
        from too_hot_mapping; l2_reg is 0.5 * ||w_n - w_pi||^2 wrapped in a Parameter.
        """
        row = self.data[index] if self.reduce else self.data[index].todense()
        document = torch.as_tensor(row, device = device, dtype = torch.float32)
        label = torch.as_tensor(self.labels[index], device = device, dtype = torch.float32)

        w_n, w_pi, y_in = too_hot_mapping(int(label))

        # Squared L2 distance between the node vector and its parent's vector.
        # Summing the squares directly avoids the earlier sqrt-then-square round trip.
        # The Parameter wrapper is kept so the returned type is unchanged for callers.
        l2_reg = nn.parameter.Parameter((0.5 * torch.sum((w_n - w_pi) ** 2)).to(device))

        return document, label, w_n, w_pi, y_in, l2_reg

In [13]:
# Component count forwarded to DatasetSWIKI / lower_dim.
# NOTE(review): the datasets below are built with reduce=False, so this value is presumably unused -- confirm in lower_dim.
n_components = 1200

In [14]:
# Training and validation splits; reduce=False keeps the original (unreduced) feature rows.
train_data = DatasetSWIKI("swiki/data/train_split_remapped.txt", reduce=False, n_components = n_components)
valid_data = DatasetSWIKI("swiki/data/valid_remapped.txt", reduce=False, n_components = n_components)

In [15]:
# Number of training documents (a 1-D container holding one row object per document).
train_data.data.shape

(590035,)

In [16]:
# 590035 training documents = 199 * 2965, so with this batch size every training batch is full.
batch_size = 199

In [17]:
# Shuffle only the training data; keep validation order fixed.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [18]:
# Pull a single batch and report the shape, dtype and device of each field.
train_iter = iter(train_loader)

# Use the built-in next(): the iterator's .next() method was a Python 2 style alias
# that current DataLoader iterators no longer provide.
doc, labs, w_n, w_pi, y_in, l2 = next(train_iter)

for field_name, field in (('docs', doc), ('label', labs), ('w_n', w_n),
                          ('w_pi', w_pi), ('y_in', y_in), ('l2-reg', l2)):
    print('{} shape on batch size = {}, {}, {}'.format(field_name, field.shape, field.dtype, field.device))

docs shape on batch size = torch.Size([199, 1, 2085164]), torch.float32, cuda:0
label shape on batch size = torch.Size([199]), torch.float32, cuda:0
w_n shape on batch size = torch.Size([199, 16]), torch.float32, cuda:0
w_pi shape on batch size = torch.Size([199, 16]), torch.float32, cuda:0
y_in shape on batch size = torch.Size([199, 16]), torch.float32, cuda:0
l2-reg shape on batch size = torch.Size([199]), torch.float32, cuda:0


In [19]:
# Shape of a single stored document row: (1, number_of_features).
train_data.data[0].shape

(1, 2085164)

In [20]:
# Hyper-parameters
# Feature width of one document row (2085164 for the unreduced data).
# For reduced data this would instead be train_data.data.shape[1] (e.g. 128 components).
input_size = train_data.data[0].shape[1] #2085164 

# Output width of the linear layer: the n-dimensional node vector (16), not the 50312 raw labels.
num_classes = n #50312 --> n (16)
num_epochs = 10 # use many more epochs with L-BFGS (a second-order method); otherwise fewer is better
learning_rate = 1e-4 # candidate values noted by the author: 1e-4, 0.0005

In [None]:
# Report the cuDNN version PyTorch is using (informational only).
torch.backends.cudnn.version()

In [21]:
# Let cuDNN auto-tune its algorithm choice.
# NOTE(review): the model below only contains nn.Linear, so this likely has no effect -- confirm before relying on it.
torch.backends.cudnn.benchmark = True

In [22]:
# Model
class LogisticRegression(nn.Module):
    """Bias-free linear projection whose negated output is multiplied element-wise by a node vector.

    Args:
        in_features: width of each input row. Defaults to the module-level `input_size`.
        out_features: width of the projection. Defaults to the module-level `num_classes`.

    Calling LogisticRegression() with no arguments behaves exactly as before; the
    optional arguments only make the sizes overridable without touching the globals.
    """

    def __init__(self, in_features=None, out_features=None):
        super(LogisticRegression, self).__init__()
        # Resolve defaults lazily so the globals are only required when no size is given.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = num_classes
        self.linear1 = nn.Linear(in_features, out_features, bias=False)

    def forward(self, x, wn):
        """Return -(linear1(x)) * wn, computed element-wise.

        Args:
            x: input rows, last dimension equal to in_features.
            wn: tensor broadcastable against linear1(x).
        """
        x1 = self.linear1(x)
        return torch.mul(-x1, wn)

In [23]:
# Instantiate the model and move its parameters to the selected device.
model = LogisticRegression().to(device)

In [24]:
# Soft-margin loss averaged over all elements, optimised with Adam.
# NOTE(review): nn.SoftMarginLoss is documented for targets of 1 or -1 -- confirm that binary_yin uses that encoding.
criterion = nn.SoftMarginLoss(reduction='mean').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
# Number of batches per epoch; used in the training progress message.
total_step = len(train_loader)

In [26]:
# Echo the number of batches per epoch.
total_step

2965

In [27]:
# Release cached GPU memory that PyTorch's allocator is holding but not using, before training starts.
torch.cuda.empty_cache()

In [30]:
# Checkpoint file name includes the batch size, so runs with different batch sizes do not overwrite each other.
checkpoint_path = "batch_{}_train_valid_model.pt".format(batch_size)

In [31]:
# Restore model and optimizer state from a previous run when a checkpoint file exists.
# NOTE: the training cell re-initialises `losses` and restarts `epoch` from 0, so the
# bookkeeping values loaded here are informational; only the model/optimizer state carries over.
if os.path.exists(checkpoint_path):
    # map_location remaps stored tensors onto the current device, so a checkpoint
    # written on a GPU still loads on a CPU-only machine (and vice versa).
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    losses = checkpoint['losses']
    step = checkpoint['step']

In [None]:
# Training the Model
# Per-step loss values, plotted after training.
losses = []

for epoch in range(num_epochs):
    # Each batch is (document, label, w_n, w_pi, y_in, l2_reg); the model consumes
    # the w_n field, which is bound to the name `labels` here.
    for step, (document, _, labels, _, y_ins, l2_reg) in enumerate(train_loader):

        # Flatten (batch, 1, features) to (batch, features). The real batch dimension is
        # used instead of the global batch_size so a final, smaller batch cannot break the reshape.
        document = document.reshape(document.shape[0], -1)

        optimizer.zero_grad()
        w_xi = model(document, labels)
        loss1 = criterion(w_xi, y_ins) + torch.mean(l2_reg)
        loss_value = loss1.item()

        if (step+1) % 100 == 0:
            print ('Epoch [{}/{}], step:[{}/{}], loss: {:.6f}'.format(epoch+1, num_epochs, step+1, total_step, loss_value))
            torch.cuda.empty_cache()

        losses.append(loss_value)
        loss1.backward()
        optimizer.step()

    # Checkpoint once per epoch. The loss is detached so the saved tensor does not
    # carry the autograd graph of the last batch.
    torch.save({
        'epoch': epoch,
        'step': step,
        'losses': losses,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss1.detach()}, checkpoint_path)

Epoch [1/10], step:[100/2965], loss: 0.684947
Epoch [1/10], step:[200/2965], loss: 0.681258
Epoch [1/10], step:[300/2965], loss: 0.678774
Epoch [1/10], step:[400/2965], loss: 0.673325
Epoch [1/10], step:[500/2965], loss: 0.671472
Epoch [1/10], step:[600/2965], loss: 0.667838
Epoch [1/10], step:[700/2965], loss: 0.671404
Epoch [1/10], step:[800/2965], loss: 0.667230
Epoch [1/10], step:[900/2965], loss: 0.670128
Epoch [1/10], step:[1000/2965], loss: 0.663459
Epoch [1/10], step:[1100/2965], loss: 0.662757
Epoch [1/10], step:[1200/2965], loss: 0.663248
Epoch [1/10], step:[1300/2965], loss: 0.668458
Epoch [1/10], step:[1400/2965], loss: 0.669680
Epoch [1/10], step:[1500/2965], loss: 0.670203
Epoch [1/10], step:[1600/2965], loss: 0.666799


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the per-step training loss collected in `losses`; the trailing ';' hides the Line2D repr.
plt.plot(losses);

In [None]:
# Collect the per-document L2 terms of the inspected batch as plain Python floats.
# .item() copies each value to the host, so the list can be sorted and plotted even
# when `l2` lives on the GPU (0-d CUDA tensors cannot be converted by matplotlib).
umm = [v.item() for v in l2]

In [None]:
# Plot the collected L2 terms in ascending order to show their spread.
plt.plot(sorted(umm));

In [None]:
# Evaluate on the validation split.
# Each batch from the loader is (document, label, w_n, w_pi, y_in, l2_reg), matching
# what DatasetSWIKI.__getitem__ returns.
# NOTE(review): a document counts as correct when the sign of every output component
# agrees with the corresponding y_in component. y_in > 0 is treated as the positive
# class, which works for both {0, 1} and {-1, +1} encodings -- confirm this matches the
# intended decoding of binary_yin.
correct = 0
total = 0
with torch.no_grad():
    for document, _, w_n, _, y_ins, _ in valid_loader:
        # Flatten (batch, 1, features) to (batch, features), as in training.
        document = document.reshape(document.shape[0], -1)
        outputs = model(document, w_n)

        predicted = outputs > 0
        target = y_ins > 0
        correct += (predicted == target).all(dim=1).sum().item()
        total += target.size(0)

# Guard against an empty validation set to avoid a ZeroDivisionError.
accuracy = 100 * correct / total if total else 0.0
print('Accuracy of the model on the validation documents: {} %'.format(accuracy))

In [None]:
# Save the model checkpoint
# Save only the model weights (state_dict); this file is separate from the per-epoch training checkpoint.
torch.save(model.state_dict(), 'test_valid_model.ckpt')

In [None]:
import umap # fit should get a sparse matrix
# Fit a 32-component UMAP embedding on the training documents and time the fit.
# NOTE(review): train_data.data has shape (590035,), i.e. a 1-D container of row objects
# rather than a single 2-D sparse matrix; it may need to be stacked first -- confirm.
%time trans = umap.UMAP(n_neighbors=5, random_state=42, n_components=32, verbose=True).fit(train_data.data)
# Display the fitted embedding.
trans.embedding_