In [1]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/4

# Build the Dataset. We are going to generate a simple data set and then we will read it.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. 

In [2]:
import logging
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from sklearn import preprocessing, metrics
from torch.utils.data import DataLoader, Dataset


# from scripts.utils.logger import Logger
from scripts.utils.data_reading import lower_dim, rr_reader
from scripts.utils.processing import lookup_table, generate_label_vector

logging.basicConfig(level=logging.INFO )

In [3]:
# Load the category hierarchy once and keep the results as module-level globals.
# N (the list of all labels), T_leaves & PI_parents have to be present globally:
# min_w() and too_hot_mapping() below read N, T_leaves and c2p_table directly
# rather than receiving them from their callers.
# Original note: one_hot_labels because I will keep accessing it for each document <1082>
p2c_table, c2p_table, _, _, PI_parents, T_leaves, N = lookup_table("swiki/data/cat_hier.txt", subset = False)

65333it [00:00, 210191.83it/s]


In [4]:
# Largest element of N, the list of all labels.
max(N)

2445705

In [5]:
# Lookup from a raw label to its position in the dense label vectors.
# min_w() and too_hot_mapping() subtract 1 from the looked-up value before using
# it as a tensor index, so the positions are presumably 1-based -- TODO confirm
# against generate_label_vector.
order_label_mapping = generate_label_vector(N)

In [6]:
# Spot-check the mapping for label 1.
order_label_mapping[1]

1

In [7]:
# Spot-check: first entry stored in c2p_table for label 2445705 (the value of max(N)).
c2p_table[2445705][0]

2230930

In [8]:
def min_w(N = N, c2p_table = c2p_table):
    """Record a Frobenius-norm term for every leaf label and their running sum.

    Two indicator vectors of length ``len(N)`` are shared by the whole loop:
    one receives a 1 at each visited leaf's index, the other a 1 at the index
    of ``c2p_table[leaf][0]`` whenever the leaf has an entry in ``c2p_table``.
    The norm of their difference is stored per leaf.

    Reads the globals ``T_leaves`` and ``order_label_mapping``.

    Args:
        N: collection of all labels; only its length is used.
        c2p_table: lookup from a label to a sequence whose first element is
            marked in the second indicator vector (the label's parent, judging
            by the table's name).

    Returns:
        ``(optim_dict, sum_n)``: the norm recorded for each leaf, and the sum
        of those norms (the plain int ``0`` when ``T_leaves`` is empty).
    """
    n_labels = len(N)
    leaf_marks = torch.zeros((n_labels,))
    parent_marks = torch.zeros((n_labels,))

    optim_dict = {}
    sum_n = 0

    for leaf in T_leaves:
        # Each leaf contributes once, even if T_leaves repeats it.
        if leaf in optim_dict:
            continue

        if leaf in c2p_table:
            parent = c2p_table[leaf][0]
            parent_marks[order_label_mapping[parent]-1] = 1
        leaf_marks[order_label_mapping[leaf]-1] = 1

        # NOTE(review): the indicator vectors are never reset, so this norm
        # covers every leaf/parent marked so far, not just the current pair --
        # confirm that the accumulation is intended.
        distance = torch.norm(leaf_marks - parent_marks, 'fro', keepdim=True)
        optim_dict[leaf] = distance
        sum_n += distance

    return optim_dict, sum_n

In [None]:
# Run min_w() with its default arguments (the global N and c2p_table).
optim, sumn = min_w()

In [None]:
# Display the summed norm returned by min_w().
sumn # GLOBAL

In [9]:
def too_hot_mapping(label_tuple):
    """Encode one document's labels as dense vectors over all labels.

    Reads the globals ``N``, ``order_label_mapping``, ``T_leaves`` and
    ``c2p_table`` (order_label_mapping HAS TO BE A GLOBAL OBJECT).

    Args:
        label_tuple: iterable with the document's labels; every item must be
            convertible with ``int`` (a failing conversion raises).

    Returns:
        ``(temp, temp_y_in, res)`` where

        * ``temp`` counts how often each label occurs, indexed by
          ``order_label_mapping[label] - 1``;
        * ``temp_y_in`` is ``+1`` at the document's leaf labels and ``-1``
          everywhere else, i.e. y_in = {-1, +1};
        * ``res`` is the sum of the Frobenius norms taken after each leaf label
          that has a ``c2p_table`` entry, divided by the number of labels. It
          stays the float ``0.0`` when no such label is found.

        Encoding is best-effort: if a lookup fails part-way, a warning is
        logged and whatever was filled in so far is returned.
    """
    doc_labels = list(map(int, list(label_tuple)))

    temp = torch.zeros((len(N),))
    temp_y_in = torch.ones((len(N),))*-1

    temp_wn = torch.zeros((len(N),))
    temp_w_pi_n = torch.zeros((len(N),))
    res = 0.0

    try:
        for label in doc_labels:
            int_rep = order_label_mapping[label]
            temp[int_rep-1] += 1
            if label in T_leaves:
                # Assign instead of "+= 2": a label repeated within one document
                # must still yield +1, never 3.
                temp_y_in[int_rep-1] = 1
                temp_wn[int_rep-1] = 1
                if label in c2p_table:
                    temp_w_pi_n[order_label_mapping[c2p_table[label][0]]-1] = 1
                    # NOTE(review): temp_wn / temp_w_pi_n accumulate over the
                    # document's labels, so later norms include earlier labels --
                    # confirm this is intended.
                    res += torch.norm(temp_wn - temp_w_pi_n, 'fro', keepdim=True)
        res = res/len(doc_labels)
    except Exception as err:
        # Was a bare "except:", which also trapped KeyboardInterrupt/SystemExit
        # and hid the cause. Stay best-effort, but report what went wrong.
        logging.warning("too_hot_mapping: could not fully encode labels %r (%r)", label_tuple, err)
    return temp, temp_y_in, res

In [10]:
class DatasetSWIKI(Dataset):
    """Map-style dataset over the documents and labels returned by ``lower_dim``.

    Each item is ``(document, label, label_vector, y_in, res)``: the document
    row as a tensor, its raw label entry, and the three values produced by
    ``too_hot_mapping`` for that label entry.
    """

    def __init__(self, file_path, reduce = True, n_components = 128):
        """Load ``file_path`` through ``lower_dim`` and remember the settings.

        Args:
            file_path: path handed to ``lower_dim``.
            reduce: forwarded to ``lower_dim``; also selects how rows are
                converted in ``__getitem__``.
            n_components: forwarded to ``lower_dim``.
        """
        self.reduce = reduce
        self.n_components = n_components
        self.data, self.labels = lower_dim(file_path, reduce, n_components)

    def __len__(self):
        """Number of documents, i.e. rows of ``self.data``."""
        n_documents = self.data.shape[0]
        return n_documents

    def __getitem__(self, index):
        """Return the document tensor and label encodings at ``index``."""
        row = self.data[index]
        # Without reduction the row needs .todense() before conversion.
        document = torch.from_numpy(row if self.reduce else row.todense())

        label = self.labels[index]

        # label vector := w_n
        label_vector, y_in, res = too_hot_mapping(label)
        return document, label, label_vector, y_in, res

In [11]:
# Number of components passed to DatasetSWIKI (and on to lower_dim) for both splits.
n_components = 128

In [12]:
# Load the training and validation splits with identical settings.
# Slow step: the recorded run logged roughly 4 and 3.5 minutes for the two calls.
train_data = DatasetSWIKI("swiki/data/train_split_remapped.txt", reduce=True, n_components = n_components)
valid_data = DatasetSWIKI("swiki/data/valid_remapped.txt", reduce=True, n_components = n_components)

INFO:root:Elapsed time: 4.0min 18.15sec
INFO:root:Elapsed time: 3.0min 22.34sec


In [13]:
# Mini-batch size for both DataLoaders.
# NOTE(review): LogisticRegression also reads this global as its hidden-layer
# width, so changing the batch size changes the model architecture.
batch_size = 100

In [14]:
# Shuffle only the training split; the validation split keeps its file order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [15]:
# Pull a single batch to sanity-check the shapes the loader produces.
train_iter = iter(train_loader)

# Use the built-in next(): the iterator's .next() method is not part of the
# Python 3 iterator protocol and is absent from recent PyTorch releases.
doc, labbbs, labs, y, res = next(train_iter)

print('docs shape on batch size = {}'.format(doc.shape))
print('labels shape on batch size = {}'.format(labs.shape))
print('y_in shape on batch size = {}'.format(y.shape))
print('res shape on batch size = {}'.format(res.shape))

docs shape on batch size = torch.Size([100, 128])
labels shape on batch size = torch.Size([100, 50312])
y_in shape on batch size = torch.Size([100, 50312])
res shape on batch size = torch.Size([100])


In [16]:
# NOTE(review): scratch arithmetic. 2085164 matches the raw feature count noted
# in the hyper-parameter cell ("#2085164 -> 128"); what 10940 / 547 stand for is
# not visible in this notebook -- document or delete.
208516400/10940
# 2085164/547

19060.0

In [17]:
# Hyper Parameters
input_size = train_data.data.shape[1] #2085164 -> 128

# One model output per label. too_hot_mapping builds its vectors with len(N)
# entries and the training loss multiplies them element-wise with the model
# output, so derive the size from N instead of hard-coding it
# (50312 for the hierarchy loaded in the recorded run).
num_classes = len(N)
num_epochs = 5
learning_rate = 0.001

In [18]:
# Model
class LogisticRegression(nn.Module):
    """Two stacked bias-free linear layers with no non-linearity in between.

    Args:
        in_features: size of each input document vector. Defaults to the
            global ``input_size``.
        hidden_features: width of the intermediate layer. Defaults to the
            global ``batch_size`` (the value the notebook has always used);
            pass it explicitly to decouple the model from the loader's batch size.
        out_features: number of outputs. Defaults to the global ``num_classes``.

    Calling ``LogisticRegression()`` with no arguments builds exactly the same
    layers as before.
    """

    def __init__(self, in_features=None, hidden_features=None, out_features=None):
        super(LogisticRegression, self).__init__()
        # Resolve defaults at call time so later edits to the globals are honoured.
        if in_features is None:
            in_features = input_size
        if hidden_features is None:
            hidden_features = batch_size
        if out_features is None:
            out_features = num_classes
        self.linear1 = nn.Linear(in_features, hidden_features, False)
        self.linear2 = nn.Linear(hidden_features, out_features, False)

    def forward(self, x):
        """Return the raw (un-squashed) scores for a batch ``x``."""
        x = self.linear1(x)
        x = self.linear2(x)
        return x

In [19]:
# Build the model from the global hyper-parameters (input_size, batch_size, num_classes).
model = LogisticRegression()

In [20]:
# Display the module summary (layer names and sizes).
model

LogisticRegression(
  (linear1): Linear(in_features=128, out_features=100, bias=False)
  (linear2): Linear(in_features=100, out_features=50312, bias=False)
)

In [21]:
# SoftMarginLoss expects targets of +1 / -1, which is the encoding too_hot_mapping
# returns as y_in.
criterion = nn.SoftMarginLoss(reduction='mean')  
# LBFGS re-evaluates the objective itself, so optimizer.step() has to be given a
# closure that recomputes the loss (see the training cell).
optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate)

In [22]:
# Number of mini-batches per epoch; only used in the training progress message.
total_step = len(train_loader)

In [None]:
# Training the Model
for epoch in range(num_epochs):
    # Iterate the DataLoader itself so every epoch gets a fresh (reshuffled)
    # pass over the data. A single iter(train_loader) created before the epoch
    # loop is exhausted after the first epoch, leaving epochs 2..N empty.
    for i, (document, _, labels, y_in, res) in enumerate(train_loader):

        # Variable() wrappers are unnecessary: plain tensors carry autograd state.
        document = document.float() # batch size 100
        labels = labels.float()

        def closure():
            """Recompute the batch loss and gradients; LBFGS may call this several times per step."""
            optimizer.zero_grad()
            x_i = model(document)
            w_xi = labels*(x_i)
            # res is built from the label indicators only, so it shifts the
            # reported loss but contributes no gradient to the model.
            loss = torch.mean(res) + criterion(w_xi, y_in)
            print('Epoch [{}/{}], step:[{}/{}], loss: {:.4f}'.format(epoch+1, num_epochs, i+1, total_step, loss.item()))
            loss.backward()
            return loss
        optimizer.step(closure)


Epoch [1/5], step:[1/3199], loss: 2.3529
Epoch [1/5], step:[1/3199], loss: 2.3529
Epoch [1/5], step:[2/3199], loss: 2.3515
Epoch [1/5], step:[2/3199], loss: 2.3515
Epoch [1/5], step:[3/3199], loss: 2.3974
Epoch [1/5], step:[3/3199], loss: 2.3974
Epoch [1/5], step:[4/3199], loss: 2.3747
Epoch [1/5], step:[4/3199], loss: 2.3747
Epoch [1/5], step:[5/3199], loss: 2.3560
Epoch [1/5], step:[5/3199], loss: 2.3560
Epoch [1/5], step:[6/3199], loss: 2.3692


In [None]:
# w_xi only ever exists as a local inside the training closure, so rebuild it
# here for the last training batch (document / labels left over from the loop)
# before counting its positive entries per document.
with torch.no_grad():
    w_xi = labels * model(document)
torch.sum(torch.where(w_xi>0, torch.tensor(1), torch.tensor(0)), dim=1)

In [None]:
# Number of positive (+1) targets per document in the last training batch.
(y_in > 0).sum(dim=1)

In [None]:
# Number of labels present per document in the last training batch.
print((labels > 0).sum(dim=1))

In [None]:
# NOTE(review): scratch arithmetic. 50312 is the num_classes value; where 50128
# comes from is not visible in this notebook -- document or delete.
50128-50312

In [None]:
# Evaluate on the validation split.
# The dataset yields 5-tuples (document, label, label_vector, y_in, res) and the
# documents are already tensors, so iterate the batched valid_loader and unpack
# all five fields; only the documents and the label vectors are needed here.
with torch.no_grad():
    correct = 0
    total = 0
    for documents, _, labels, _, _ in valid_loader:
        outputs = model(documents.float())
        # Per-document diagnostics: predicted-positive count vs. true label count.
        print(torch.sum(torch.where(outputs>0.0001, torch.tensor(1), torch.tensor(0)), dim=1))
        print(torch.sum(torch.where(labels>0, torch.tensor(1), torch.tensor(0)), dim=1))

        umm, predicted = torch.max(outputs, 1)
        print(umm.shape)
        total += labels.size(0)
        # labels is a multi-hot vector per document, so a prediction counts as
        # correct when the top-scoring class is one of the document's labels.
        hits = labels.gather(1, predicted.unsqueeze(1)) > 0
        correct += hits.sum().item()

    accuracy = 100 * correct / total if total else float('nan')
    print('Top-1 accuracy of the model on the validation documents: {} %'.format(accuracy))

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')

In [None]:
# NOTE(review): scratch arithmetic. 50312 is the num_classes value; the origin
# of 95894672 is not visible in this notebook -- document or delete.
95894672/50312

In [None]:
# TensorBoard setup: `writer` is used by the matplotlib cell below
# (add_figure / close).
# NOTE(review): `vutils` is not referenced in any visible cell, and these imports
# would normally sit in the import cell at the top of the notebook.
import torchvision.utils as vutils
from tensorboardX import SummaryWriter
writer = SummaryWriter()

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')

# Draw two red circles of radius 0.2, centred at (0.2, 0.5) and (0.8, 0.5),
# on a fresh figure with equally scaled axes.
fig = plt.figure()
ax = fig.gca()

c1, c2 = (plt.Circle((centre_x, 0.5), 0.2, color='r') for centre_x in (0.2, 0.8))
ax.add_patch(c1)
ax.add_patch(c2)
ax.axis('scaled')


# `writer` is the SummaryWriter created in the TensorBoard setup cell:
# from tensorboardX import SummaryWriter
# writer = SummaryWriter()
writer.add_figure('matplotlib', fig)
writer.close()

In [None]:
# #         Forward + Backward + Optimize
#         def closure():
#             optimizer.zero_grad()
#             outputs = model(document)
#             loss = criterion(outputs, torch.max(labels, 1)[0])
# #             print('loss:', loss.item())
#             loss.backward()
#             return loss
#         optimizer.step(closure)


In [None]:
import umap # fit should get a sparse matrix
%time trans = umap.UMAP(n_neighbors=5, random_state=42, n_components=32, verbose=True).fit(train_data.data)
trans.embedding_

In [None]:
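# Example raw record, kept for reference (data, not executable code).
# It appears to be comma-separated label ids followed by feature_id:count pairs
# -- TODO confirm against the data files.
# 167593,441685 160318:1 227881:1 255720:1 265934:1 432905:2 515946:1 538188:1 586136:1 610561:1 692683:1
#     735075:1 828325:1 874107:1 898766:1 1087064:1 1354716:1 1432746:1
#     1454292:1 1463839:1 1626714:1 1715083:1 1839104:1 1864180:1 2023750:1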
