In [None]:
# wn_tensors = wn_tensors.to(device)
# binary_yin = binary_yin.to(device)

In [None]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/4

# Build the Dataset. We are going to generate a simple data set and then we will read it.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. 

In [None]:
import logging
import numpy as np
import igraph as ig
import collections, gc

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset

from scripts.utils.hierarchy import *
from scripts.utils.processing import *
from scripts.utils.data_reading import *

logging.basicConfig(level=logging.INFO )

In [None]:
n = 16 # wn vector size  --> ~log_{2}(num_classes)

In [None]:
# The hierarchy tables are kept as notebook-level globals: the helper functions
# defined further down read N, T_leaves, the parent/child tables and node2vec
# directly instead of taking them as arguments.
# (original note: N, T_leaves & PI_parents have to be present globally — list of
#  all the labels; kept around because they are accessed for each document <1082>)
(
    p2c_table, c2p_table, node2id, id2node,
    PI_parents, T_leaves, N,
) = lookup_table("swiki/data/cat_hier.txt", subset = False)

# Build the graph of the hierarchy, then one n-dimensional vector per node.
graph_obj = hierarchy2graph(p2c_table, node2id)
node2vec = hierarchy_vectors(graph_obj, id2node, p2c_table, n)

In [None]:
order_mapping = generate_order_mapping(N)
binary_yin = generate_binary_yin(N)

In [None]:
num_gpus = torch.cuda.device_count()

In [None]:
device = torch.device("cuda" if (torch.cuda.is_available() and num_gpus > 0) else "cpu")

In [None]:
device

In [None]:
# Seed the default generators so that shuffling and weight initialisation are
# repeatable. torch.manual_seed seeds the CPU generator as well as the CUDA
# devices, whereas torch.cuda.manual_seed alone leaves CPU-side randomness
# (DataLoader shuffling, nn.Linear initialisation on CPU) unseeded.
torch.manual_seed(123)

In [None]:
def too_hot_mapping(label):
    """Collect the tensors associated with a single label.

    Reads the notebook-level globals ``order_mapping``, ``node2vec``,
    ``T_leaves``, ``binary_yin``, ``c2p_table`` and ``device``.

    Args:
        label: key used to index ``order_mapping`` and ``node2vec``.

    Returns:
        Tuple ``(w_n, w_pi, y_in)``; each element is the result of
        ``list2tensor`` on the collected list, moved to ``device``:

        * ``w_n``  -- ``node2vec[label]`` as a float tensor.
        * ``y_in`` -- ``binary_yin[order_mapping[label] - 1]``; only filled
          when ``label`` is in ``T_leaves``.
        * ``w_pi`` -- vector of the first parent listed in
          ``c2p_table[label]``; only filled for labels in ``T_leaves`` that
          have an entry in ``c2p_table``.

    A failed lookup (unknown label, missing vector, index out of range) is
    logged and whatever was collected up to that point is returned, so one bad
    label does not abort iteration. Any other error propagates.
    """
    w_n, w_pi, y_in = [], [], []

    try:
        int_rep = order_mapping[label]
        w_n.append(torch.from_numpy(node2vec[label]).float())
        if label in T_leaves:
            # order_mapping values are used as 1-based positions into binary_yin
            y_in.append(binary_yin[int_rep - 1])
            if label in c2p_table:
                pi_n = c2p_table[label][0]
                w_pi.append(torch.from_numpy(node2vec[pi_n]).float())
    except LookupError as err:
        # Only lookup failures are tolerated; a bare `except:` would also hide
        # programming errors and swallow KeyboardInterrupt.
        logging.warning(
            "too_hot_mapping: lookup failed for label %r (%s: %s)",
            label, type(err).__name__, err,
        )

    w_n = list2tensor(w_n).to(device)
    w_pi = list2tensor(w_pi).to(device)
    y_in = list2tensor(y_in).to(device)

    return w_n, w_pi, y_in

In [None]:
def update_non_leaf_wn(label_id):
    '''
    Recompute the vector of a non-leaf node from its neighbours in the hierarchy.

    accepts label ids only which are non-leaf nodes

    The result is ``(w_pi + sum of the child vectors) / (number of children + 1)``
    where ``w_pi`` is the vector of the first parent listed in
    ``c2p_table[label_id]`` and the children come from ``p2c_table[label_id]``.
    Reads the notebook-level globals ``N``, ``p2c_table``, ``c2p_table`` and
    ``node2vec``; none of them is modified.

    Raises:
        AssertionError: if ``label_id`` is not in ``N``.
        KeyError: if ``label_id`` has no entry in ``p2c_table`` / ``c2p_table``
            or a referenced node has no vector in ``node2vec``.
    '''

    assert label_id in N, "{} is not a node".format(label_id)

    child_ids = p2c_table[label_id]
    w_pi = node2vec[c2p_table[label_id][0]]

    # Starting the sum at 0.0 keeps the result equal to w_pi when there are no
    # children and never adds in place into a vector stored in node2vec.
    sum_wc = sum((node2vec[idx] for idx in child_ids), 0.0)

    return 1 / (len(child_ids) + 1) * (w_pi + sum_wc)

In [None]:
class DatasetSWIKI(Dataset):
    """Dataset over the documents and labels returned by ``lower_dim``.

    Each item is ``(document, label, w_n, w_pi, y_in)`` where the last three
    values come from ``too_hot_mapping(label)``. The most recent ``w_n`` seen
    for every label is cached in ``self.w_n``.
    """

    def __init__(self, file_path, reduce = True, n_components = 128):
        """Load ``file_path`` through ``lower_dim(file_path, reduce, n_components)``."""
        self.reduce = reduce
        self.n_components = n_components
        # label -> w_n, filled lazily by __getitem__ and overwritten by update_wn
        self.w_n = {}
        self.data, self.labels = lower_dim(file_path, reduce, n_components)

    def __len__(self):
        """Number of rows in ``self.data``."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return ``(document, label, w_n, w_pi, y_in)`` for row ``index``."""
        row = self.data[index]
        if not self.reduce:
            # presumably a sparse row when no reduction was applied -- confirm
            # against lower_dim; it is densified before conversion
            row = row.todense()
        document = torch.from_numpy(row).to(device)

        label = self.labels[index]
        w_n, w_pi, y_in = too_hot_mapping(label)

        # remember the vector handed out for this label
        self.w_n[label] = w_n

        return document, label, w_n, w_pi, y_in

    def update_wn(self, label_id):
        """Replace the cached vector of ``label_id`` with ``update_non_leaf_wn(label_id)``."""
        self.w_n[label_id] = update_non_leaf_wn(label_id)

In [None]:
n_components = 128

In [None]:
train_data = DatasetSWIKI("swiki/data/train_remapped_small.txt", reduce=True, n_components = n_components)
# valid_data = DatasetSWIKI("swiki/data/valid_remapped.txt", reduce=True, n_components = n_components)

In [None]:
len(train_data), train_data.w_n

In [None]:
batch_size = 32

In [None]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True)
# valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [None]:
# Pull one batch to inspect the shapes produced by the DataLoader.
train_iter = iter(train_loader)

# Iterators are advanced with the built-in next(); the `.next()` method is not
# part of the Python 3 iterator protocol and is absent from current DataLoader
# iterators.
doc, labs, w_n, w_pi, y_in = next(train_iter)

print('docs shape on batch size = {}'.format(doc.shape))
print('label shape on batch size = {}'.format(labs.shape))
print('w_n shape on batch size = {}'.format(w_n.shape))
print('w_pi shape on batch size = {}'.format(w_pi.shape))
print('y_in shape on batch size = {}'.format(y_in.shape))

In [None]:
# Hyper Parameters 
input_size = train_data.data.shape[1] #2085164 -> 128

num_classes = n #50312 --> n (16)
num_epochs = 30 # TRAIN IT FOR A LOT OF EPOCHS in case of lbfgs (2nd order method) else less is more
learning_rate = 0.0002 #1e-4, 0.0005

In [None]:
# Model
class LogisticRegression(nn.Module):
    """Bias-free linear layer whose output is multiplied element-wise by ``wn``.

    Args:
        in_features: size of each input row. Defaults to the notebook-level
            ``input_size`` when omitted.
        out_features: size of each output row. Defaults to the notebook-level
            ``num_classes`` when omitted.
    """
    def __init__(self, in_features=None, out_features=None):
        super(LogisticRegression, self).__init__()
        # Fall back to the notebook globals so `LogisticRegression()` keeps
        # working; explicit sizes make the class usable without them.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = num_classes
        self.linear1 = nn.Linear(in_features, out_features, bias=False)

    def forward(self, x, wn):
        """Return ``linear1(x) * wn`` (``wn`` must broadcast against the projection)."""
        x1 = self.linear1(x)
        return x1 * wn

In [None]:
model = LogisticRegression().to(device)

In [None]:
model

In [None]:
criterion = nn.SoftMarginLoss(reduction='mean') 
optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate)

In [None]:
total_step = len(train_loader)

In [None]:
torch.cuda.empty_cache()

In [None]:
# torch.cuda.memory_cached()-torch.cuda.memory_allocated()

In [None]:
# Preview of the regularisation term on the sample batch drawn earlier.
# `labels` and `pis` only exist once the training loop further down has run, so
# referring to them here fails on a fresh kernel; `w_n` and `w_pi` are the
# corresponding tensors of the sample batch.
0.5*torch.sqrt(torch.sum((w_n-w_pi)**2))**2

In [None]:
# Training the Model
LOG_EVERY = 40  # report progress every LOG_EVERY batches

losses = []  # loss of every optimisation step (one entry per batch)
l2 = []      # regularisation term of every batch

for epoch in range(num_epochs):
    for i, (document, all_labels, labels, pis, y_ins) in enumerate(train_loader):

        document = document.float().to(device)

        labels = labels.float().to(device).view(-1, n)
        pis = pis.view(-1, n)
        y_ins = y_ins.view(-1, n)

        # 0.5 * ||labels - pis||^2. It is built from batch data only, so it is a
        # constant with respect to the model weights: it shifts the reported
        # loss but contributes no gradient. It is therefore kept as a plain
        # detached tensor instead of being wrapped in a new nn.Parameter for
        # every batch.
        l2_reg = (0.5 * torch.sum((labels - pis) ** 2)).detach()
        l2.append(l2_reg)

        def closure():
            """Re-evaluate the loss on the current batch and backpropagate it."""
            optimizer.zero_grad()
            w_xi = model(document, labels)
            loss1 = criterion(w_xi, y_ins) + l2_reg
            loss1.backward()
            return loss1

        # step(closure) is accepted by every torch.optim optimizer and required
        # by LBFGS, which may call the closure several times per step. The value
        # returned is the loss of the first evaluation, so exactly one loss is
        # recorded and printed per batch whatever the optimizer.
        loss1 = optimizer.step(closure)
        losses.append(loss1.item())

        if (i+1) % LOG_EVERY == 0:
            print ('Epoch [{}/{}], step:[{}/{}], loss: {:.6f}'.format(epoch+1, num_epochs, i+1, total_step, loss1.item()))
            torch.cuda.empty_cache()
            
# optimise LR!!

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(losses);

In [None]:
# Turn every stored regularisation term into a plain Python float so the values
# can be sorted and plotted whatever device the tensors live on (matplotlib
# cannot convert CUDA tensors).
umm = [v.item() for v in l2]

In [None]:
plt.plot(sorted(umm));

In [None]:
# Evaluation sketch: counts predictions that match the labels of `valid_data`.
# NOTE(review): as written this cell does not fit the rest of the notebook:
#   * `valid_data` is never created -- its construction is commented out in the
#     cell that builds `train_data`.
#   * DatasetSWIKI.__getitem__ returns five values, the loop below unpacks three.
#   * the model's forward() takes (x, wn) but is called with a single argument.
#   * `umm` is rebound here although it already names the list plotted above.
#   * the final message mentions "10000 test images", which looks copied from
#     another example -- confirm the intended wording.
# TODO: decide how predictions should be scored for this model before relying
# on the numbers printed by this cell.
with torch.no_grad():
    correct = 0
    total = 0
    for documents, _, labels in valid_data:
        docs = Variable(torch.from_numpy(documents)).float()
        outputs = model(docs)
        # per-column count of outputs above the 0.0001 threshold
        print(torch.sum(torch.where(outputs>0.0001, torch.tensor(1), torch.tensor(0)), dim=0))
        # per-column count of positive label entries
        print(torch.sum(torch.where(labels>0, torch.tensor(1), torch.tensor(0)), dim=0))

        # index of the largest output along dim 1 is taken as the prediction
        umm, predicted = torch.max(outputs.data, 1)
        print(umm.shape)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

In [None]:
# Save the model checkpoint
torch.save(model.state_dict(), 'train_valid_model.ckpt')

In [None]:
import umap # fit should get a sparse matrix
%time trans = umap.UMAP(n_neighbors=5, random_state=42, n_components=32, verbose=True).fit(train_data.data)
trans.embedding_