In [1]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/4

# Build the Dataset. We are going to generate a simple data set and then we will read it.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. 

In [2]:
import logging
import numpy as np
import collections, gc

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from sklearn import preprocessing, metrics
from torch.utils.data import DataLoader, Dataset


# from scripts.utils.logger import Logger
from scripts.utils.data_reading import lower_dim, rr_reader
from scripts.utils.processing import lookup_table, generate_label_vector

logging.basicConfig(level=logging.INFO )

In [3]:
# N, T_leaves & PI_parents have to be present globally! (list of all the labels)
# one_hot_labels because I will keep accessing it for each document <1082>
p2c_table, c2p_table, _, _, PI_parents, T_leaves, N = lookup_table("swiki/data/cat_hier.txt", subset = False)

65333it [00:00, 235908.39it/s]


In [4]:
num_gpus = torch.cuda.device_count()

In [5]:
num_gpus

1

In [6]:
device = torch.device("cpu" if (torch.cuda.is_available() and num_gpus > 0) else "cpu")

In [7]:
device

device(type='cpu')

In [8]:
torch.cuda.manual_seed(123)

In [9]:
order_label_mapping = generate_label_vector(N)

In [10]:
def too_hot_mapping(label_tuple):

    # order_label_mapping HAS TO BE A GLOBAL OBJECT
    # y_in = {-1, +1}
    
    doc_labels = list(map(int, list(label_tuple)))
    
    temp_y_in = torch.ones((len(N),), device = device)*-1
    
    temp_wn = torch.zeros((len(N),), device = device, requires_grad=True)
    temp_w_pi_n = torch.zeros((len(N),), device = device)
    
    try:
        for label in doc_labels:
            int_rep = order_label_mapping[label]
            temp_wn[int_rep-1] += 1
            if label in T_leaves:
                temp_y_in[int_rep-1] += 2
                if label in c2p_table:
                    temp_w_pi_n[order_label_mapping[c2p_table[label][0]]-1] += 1 
                    # todo: think of a way to return w_n and w_pi_n and then compute MSE exclusively
                    # for them in training
    except:
        print("wait whaat?")
    return temp_wn, temp_y_in, temp_w_pi_n

In [11]:
class DatasetSWIKI(Dataset):
    
    def __init__(self, file_path, reduce = True, n_components = 128):
        self.reduce = reduce
        self.n_components = n_components
        self.data, self.labels = lower_dim(file_path, reduce, n_components)
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        
        if self.reduce:
            document = torch.from_numpy(self.data[index]).to(device)
        else:
            document = torch.from_numpy(self.data[index].todense()).to(device)
        
        label = self.labels[index]
        
        label_vector, y_in, pi = too_hot_mapping(label)        
        
        return document, label, label_vector, y_in, pi

In [12]:
n_components = 128

In [13]:
train_data = DatasetSWIKI("swiki/data/train_split_remapped.txt", reduce=True, n_components = n_components)
valid_data = DatasetSWIKI("swiki/data/valid_remapped.txt", reduce=True, n_components = n_components)

INFO:root:Elapsed time: 3.0min 40.45sec
INFO:root:Elapsed time: 2.0min 46.69sec


In [14]:
batch_size = 32

In [15]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = False)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [16]:
train_iter = iter(train_loader)

doc, labbbs, labs, y, pi = train_iter.next()

print('docs shape on batch size = {}'.format(doc.shape))
print('labels shape on batch size = {}'.format(labs.shape))
print('y_in shape on batch size = {}'.format(y.shape))
print('out shape on batch size = {}'.format(pi.shape))

docs shape on batch size = torch.Size([32, 128])
labels shape on batch size = torch.Size([32, 50312])
y_in shape on batch size = torch.Size([32, 50312])
out shape on batch size = torch.Size([32, 50312])


In [17]:
pi

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [18]:
208516400/10940
# 2085164/547

19060.0

In [19]:
# Hyper Parameters 
input_size = train_data.data.shape[1] #2085164 -> 128

num_classes = len(N) #50312
num_epochs = 10
learning_rate = 0.001

In [28]:
# Model
class LogisticRegression(nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear1 = nn.Linear(input_size, batch_size, False)
        
    def forward(self, x, y):
        x1 = self.linear1(x)  
        x1 = F.softmax(x1, dim=0)
        return x1.mm(y)

In [29]:
model = LogisticRegression().to(device)

In [30]:
model

LogisticRegression(
  (linear1): Linear(in_features=128, out_features=32, bias=False)
)

In [31]:
criterion = nn.SoftMarginLoss(reduction='mean') 
L2Loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [32]:
total_step = len(train_loader)

In [33]:
torch.cuda.empty_cache()

In [34]:
torch.cuda.memory_cached()-torch.cuda.memory_allocated()

0

In [None]:
# Training the Model
losses = []

for epoch in range(num_epochs):
    train_iter = iter(train_loader)
    for i, (document, _, labels, y_in, pi) in enumerate(train_iter):
        
        document = Variable(document).float().to(device) # batch size 100
        labels = Variable(labels).float().to(device)

        if type(optimizer) != torch.optim.LBFGS:
            
            optimizer.zero_grad()

            w_xi = model(document, labels)
            loss1 = criterion(w_xi, y_in)
            loss2 = L2Loss(labels, pi)
            loss_full = torch.sqrt(loss2) + loss1

            if (i+1) % 100 == 0: 
                print ('Epoch [{}/{}], step:[{}/{}], loss: {:.6f}'.format(epoch+1, num_epochs, i+1, total_step, loss_full.item()))
                torch.cuda.empty_cache()
 
            losses.append(loss_full.item())
            loss_full.backward()

            optimizer.step()

        else:
            
            def closure():
                optimizer.zero_grad()

                w_xi = model(document, labels)
                loss1 = criterion(w_xi, y_in)
                loss2 = L2Loss(labels, pi)
                loss_full = torch.sqrt(loss2) + loss1
                if (i+1) % 100 == 0: 
                    print ('Epoch [{}/{}], step:[{}/{}], loss: {:.6f}'.format(epoch+1, num_epochs, i+1, total_step, loss_full.item()))
                    torch.cuda.empty_cache()

                losses.append(loss_full.item())
                loss_full.backward()
                return loss_full

            optimizer.step(closure)

Epoch [1/10], step:[100/9995], loss: 0.701836
Epoch [1/10], step:[200/9995], loss: 0.701126
Epoch [1/10], step:[300/9995], loss: 0.701698


In [None]:
labels.t().mm(document)

In [None]:
losses[-469]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(losses);

In [None]:
with torch.no_grad():
    correct = 0
    total = 0
    for documents, _, labels in valid_data:
        docs = Variable(torch.from_numpy(documents)).float()
        outputs = model(docs)
        print(torch.sum(torch.where(outputs>0.0001, torch.tensor(1), torch.tensor(0)), dim=0))
        print(torch.sum(torch.where(labels>0, torch.tensor(1), torch.tensor(0)), dim=0))

        umm, predicted = torch.max(outputs.data, 1)
        print(umm.shape)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

In [None]:
# Save the model checkpoint
torch.save(model.state_dict(), 'train_valid_model.ckpt')

In [None]:
import torchvision.utils as vutils
from tensorboardX import SummaryWriter
writer = SummaryWriter()

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')

fig = plt.figure()

c1 = plt.Circle((0.2, 0.5), 0.2, color='r')
c2 = plt.Circle((0.8, 0.5), 0.2, color='r')

ax = plt.gca()
ax.add_patch(c1)
ax.add_patch(c2)
plt.axis('scaled')


# from tensorboardX import SummaryWriter
# writer = SummaryWriter()
writer.add_figure('matplotlib', fig)
writer.close()

In [None]:
# #         Forward + Backward + Optimize
#         def closure():
#             optimizer.zero_grad()
#             outputs = model(document)
#             loss = criterion(outputs, torch.max(labels, 1)[0])
# #             print('loss:', loss.item())
#             loss.backward()
#             return loss
#         optimizer.step(closure)


In [None]:
import umap # fit should get a sparse matrix
%time trans = umap.UMAP(n_neighbors=5, random_state=42, n_components=32, verbose=True).fit(train_data.data)
trans.embedding_

In [None]:
167593,441685 160318:1 227881:1 255720:1 265934:1 432905:2 515946:1 538188:1 586136:1 610561:1 692683:1 
                                        735075:1 828325:1 874107:1 898766:1 1087064:1 1354716:1 1432746:1 
                                        1454292:1 1463839:1 1626714:1 1715083:1 1839104:1 1864180:1 2023750:1 
