In [1]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/4

# Build the Dataset. We are going to generate a simple data set and then we will read it.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. I hope it can be useful for someone who is starting programming using Pytorch.

In [2]:
import logging
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from sklearn import preprocessing, metrics
from torch.utils.data import DataLoader, Dataset


from scripts.utils.logger import Logger
from scripts.utils.data_reading import lower_dim, rr_reader
from scripts.utils.processing import lookup_table, generate_label_vector

logging.basicConfig(level=logging.INFO )

In [3]:
# N and one_hot_labels have to be present globally! (list of all the labels)
# one_hot_labels because I will keep accessing it for each document <1082>
_, _, _, _, _, _, N = lookup_table("swiki/data/cat_hier.txt", subset = False)

In [4]:
len(N)

50312

In [5]:
order_label_mapping = generate_label_vector(N)

In [6]:
order_label_mapping[1]

1

In [7]:
def too_hot_mapping(label_tuple):

    # order_label_mapping HAS TO BE A GLOBAL OBJECT

    doc_labels = list(map(int, list(label_tuple)))
    temp = torch.zeros((len(N),))
    
    try:
        for label in doc_labels:
            int_rep = order_label_mapping[label]
            temp[int_rep-1] += 1 
    except:
        print("wait whaat?")
    return temp

In [8]:
class DatasetSWIKI(Dataset):
    
    def __init__(self, file_path, reduce = True, n_components = 128):
        self.reduce = reduce
        self.n_components = n_components
        self.data, self.labels = lower_dim(file_path, reduce, n_components)
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        
        if self.reduce:
            document = torch.from_numpy(self.data[index])
        else:
            document = torch.from_numpy(self.data[index].todense())
        
        label = self.labels[index]
        label_vector = too_hot_mapping(label)
        
        return document, label, label_vector

In [9]:
n_components = 128

In [10]:
train_data = DatasetSWIKI("swiki/data/train_split_remapped.txt", reduce=True, n_components = n_components)
valid_data = DatasetSWIKI("swiki/data/valid_remapped.txt", reduce=True, n_components = n_components)

INFO:root:Time elapsed: 4min 4.2secs
INFO:root:Time elapsed: 3min 18.7secs


In [11]:
train_data.data[0].shape

(128,)

In [12]:
batch_size = 100

In [13]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [14]:
train_iter = iter(train_loader)

doc, _, labs = train_iter.next()

print('images shape on batch size = {}'.format(doc.size()))
print('labels shape on batch size = {}'.format(labs.size()))

images shape on batch size = torch.Size([100, 128])
labels shape on batch size = torch.Size([100, 50312])


In [15]:
208516400/10940
# 2085164/547

19060.0

In [16]:
# Hyper Parameters 
input_size = train_data.data.shape[1] #2085164 -> 128

num_classes = 50312 #50312
num_epochs = 5
learning_rate = 0.01

In [17]:
# Model
class LogisticRegression(nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
#         self.linear1 = nn.Linear(input_size, batch_size)
#         self.linear2 = nn.Linear(batch_size, num_classes)
        self.linear2 = nn.Linear(input_size, num_classes)

    
    def forward(self, x):
#         x = self.linear1(x)
        out = self.linear2(x)
        out = F.sigmoid(out)
        return out

In [18]:
model = LogisticRegression()

In [19]:
model

LogisticRegression(
  (linear2): Linear(in_features=128, out_features=50312, bias=True)
)

In [20]:
criterion = nn.BCELoss()  
optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate)

In [21]:
logger = Logger('./logs')

In [22]:
train_iter = iter(train_loader)
iter_per_epoch = len(train_loader)
total_step = 50000

In [26]:
# Training the Model

# for epoch in range(num_epochs):
#     for i, (document, labels, _) in enumerate(train_iter):
#     for i, (document, _, labels) in enumerate(train_data):

for i in range(total_step):
    
        # Reset the data_iter
        if (i+1) % iter_per_epoch == 0:
            train_iter = iter(train_loader)

        document, _, labels = train_iter.next()        
        document = Variable(document).float() # batch size 100
        labels = Variable(labels).float()

        # forward pass
#         outputs = model(document)
#         print(outputs.shape)
#         print(labels.shape)
#         loss = criterion(outputs, labels)

        # backward + optimize
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
        def closure():
            optimizer.zero_grad()
            outputs = model(document)
            loss = criterion(outputs, labels)
            loss.backward()
            if (i+1) % 10 == 0:
                print ('Step [{}/{}], Loss: {:.4f}'.format(i+1, total_step, loss.item()))            
            return loss
        optimizer.step(closure)
        
#         if (i+1) % 10 == 0:
#             print ('Step [{}/{}], Loss: {:.4f}'.format(i+1, total_step, loss.item()))

Step [680/50000], Loss: 0.7101
Step [680/50000], Loss: 0.7101
Step [690/50000], Loss: 0.7132
Step [690/50000], Loss: 0.7132
Step [700/50000], Loss: 0.7090
Step [700/50000], Loss: 0.7090
Step [710/50000], Loss: 0.7076
Step [710/50000], Loss: 0.7076
Step [720/50000], Loss: 0.7161
Step [720/50000], Loss: 0.7161
Step [730/50000], Loss: 0.7101
Step [730/50000], Loss: 0.7101
Step [740/50000], Loss: 0.7087
Step [740/50000], Loss: 0.7087
Step [750/50000], Loss: 0.7108
Step [750/50000], Loss: 0.7108
Step [760/50000], Loss: 0.7073
Step [760/50000], Loss: 0.7073
Step [770/50000], Loss: 0.7119
Step [770/50000], Loss: 0.7119
Step [780/50000], Loss: 0.7127
Step [780/50000], Loss: 0.7127
Step [790/50000], Loss: 0.7076
Step [790/50000], Loss: 0.7076
Step [800/50000], Loss: 0.7111
Step [800/50000], Loss: 0.7111
Step [810/50000], Loss: 0.7087
Step [810/50000], Loss: 0.7087
Step [820/50000], Loss: 0.7082
Step [820/50000], Loss: 0.7082
Step [830/50000], Loss: 0.7113
Step [830/50000], Loss: 0.7113
Step [84

KeyboardInterrupt: 

In [None]:
print(torch.sum(torch.where(outputs>0.0000001, torch.tensor(1), torch.tensor(0)), dim=0))

In [None]:
print(torch.sum(torch.where(labels>0, torch.tensor(1), torch.tensor(0)), dim=0))

In [None]:
50128-50312

In [None]:
with torch.no_grad():
    correct = 0
    total = 0
    for documents, _, labels in valid_data:
        docs = Variable(torch.from_numpy(documents)).float()
        outputs = model(docs)
        print(torch.sum(torch.where(outputs>0.0001, torch.tensor(1), torch.tensor(0)), dim=0))
        print(torch.sum(torch.where(labels>0, torch.tensor(1), torch.tensor(0)), dim=0))

        umm, predicted = torch.max(outputs.data, 1)
        print(umm.shape)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')

In [None]:
95894672/50312

In [None]:
import torchvision.utils as vutils
from tensorboardX import SummaryWriter
writer = SummaryWriter()

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')

fig = plt.figure()

c1 = plt.Circle((0.2, 0.5), 0.2, color='r')
c2 = plt.Circle((0.8, 0.5), 0.2, color='r')

ax = plt.gca()
ax.add_patch(c1)
ax.add_patch(c2)
plt.axis('scaled')


# from tensorboardX import SummaryWriter
# writer = SummaryWriter()
writer.add_figure('matplotlib', fig)
writer.close()

In [None]:
# #         Forward + Backward + Optimize
#         def closure():
#             optimizer.zero_grad()
#             outputs = model(document)
#             loss = criterion(outputs, torch.max(labels, 1)[0])
# #             print('loss:', loss.item())
#             loss.backward()
#             return loss
#         optimizer.step(closure)


In [None]:
import umap # fit should get a sparse matrix
%time trans = umap.UMAP(n_neighbors=5, random_state=42, n_components=32, verbose=True).fit(train_data.data)
trans.embedding_