In [1]:
# https://www.kaggle.com/pinocookie/pytorch-dataset-and-dataloader/data
# Build the Dataset. We are going to generate a simple data set and then we will read it.
# Build the DataLoader.
# Build the model.
# Define the loss function and the optimizer.
# Train the model.
# Generate predictions.
# Plot the results. I hope it can be useful for someone who is starting programming using Pytorch.

In [2]:
import os
from random import sample
import numpy as np
import pandas as pd
import igraph as ig

from pathlib import Path
from tqdm import tqdm 
from scipy.sparse import *

import pickle
import warnings
warnings.simplefilter('ignore')

In [3]:
import logging
from collections import OrderedDict, Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
logging.basicConfig(level=logging.INFO )

In [4]:
from sklearn import preprocessing, metrics

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from joblib import Memory
from sklearn.datasets import load_svmlight_file

In [7]:
from scripts.utils import lookup_table, generate_label_vector

In [8]:
# N and one_hot_labels have to be present globally! (list of all the labels)
# one_hot_labels because I will keep accessing it for each document 1082
_, _, _, _, _, _, N = lookup_table("swiki/data/cat_hier.txt", subset = False)

In [9]:
max(N)

2445705

In [10]:
order_label_mapping = generate_label_vector(N)

In [11]:
order_label_mapping[1]

1

In [12]:
mem = Memory("./mycache_getdata")
@mem.cache
def get_data(filename):
    
    fname = str(Path(filename))
    fe, ex = os.path.splitext(fname) 

    try:
        data = load_svmlight_file(fname, multilabel=True)
    except:
        # Required: if the input data isn't in the correct libsvm format
        outfile = str(Path("{}_small{}".format(fe, ex)))
#         outfile = str(Path("{}_remapped{}".format(fe, ex)))
        if not os.path.isfile(outfile):
            logging.info("Remapping data to LibSVM format...")
            f = preprocess_libsvm(fname, outfile)
        else:
            logging.info("Using already remapped data...")
            f = outfile
        data = load_svmlight_file(f, multilabel=True)

    return data[0], data[1]

In [13]:
def preprocess_libsvm(input_file, output_file):
    # converts file to the required libsvm format.
    # this is very brute force but can be made faster [IMPROVE]

    file = open(output_file, "w+")
    with open(input_file, "r") as f:
#         head = [next(f) for x in range(500)] # retrieve only `n` docs
        for i, line in enumerate(tqdm(f)): # change to f/head depending on your needs
            instance = line.strip().split()
            labels = instance[0]
            doc_dict = OrderedDict()
            temp_dict = {}
            temp_string = ''

            for pair in instance[1:]:
                feat = pair.split(":")
                if int(feat[0]) not in temp_dict:
                    temp_dict[int(feat[0])] = int(feat[1])

            for key in sorted(temp_dict.keys()):
                doc_dict[key] = temp_dict[key]

            for feat, tf in doc_dict.items():
                temp_string = temp_string + "{}:{} ".format(feat, tf)        
            file.write("{} {}\n".format(labels, temp_string))
        file.close()

    return output_file

In [14]:
def too_hot_mapping(label_tuple):

    # order_label_mapping HAS TO BE A GLOBAL OBJECT

    doc_labels = list(map(int, list(label_tuple)))
    temp = torch.zeros((len(N),))
    
    try:
        for label in doc_labels:
            int_rep = order_label_mapping[label]
            temp[int_rep-1] += 1
    except:
        print("wait whaat?")
    return temp

In [15]:
class DatasetSWIKI(Dataset):
    
    def __init__(self, file_path, transform = None):
        self.data, self.labels = get_data(file_path)
        self.transform = transform
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        
        document = torch.from_numpy(self.data[index].todense())
        label = self.labels[index]
        label_vector = too_hot_mapping(label)
        
        if self.transform is not None:
            document = self.transform(document)
        
        return document, label, label_vector

In [16]:
train_data = DatasetSWIKI("swiki/data/train_split_remapped.txt", transform=None)
valid_data = DatasetSWIKI("swiki/data/valid_remapped.txt", transform=None)

In [17]:
train_data.__getitem__(12)

(tensor([[0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),
 (11078.0,
  93043.0,
  154610.0,
  196360.0,
  216803.0,
  260119.0,
  285874.0,
  303303.0,
  308993.0,
  396769.0),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]))

In [38]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [39]:
train_iter = iter(train_loader)
train_iter

<torch.utils.data.dataloader._DataLoaderIter at 0x2a963f2e6d8>

In [40]:
documents, _, labels = train_iter.next()

print('images shape on batch size = {}'.format(documents.size()))
print('labels shape on batch size = {}'.format(labels.size()))

images shape on batch size = torch.Size([100, 1, 2085164])
labels shape on batch size = torch.Size([100, 50312])


In [37]:
# Hyper Parameters 
input_size = 100 

num_classes = 50312
num_epochs = 5
batch_size = 100
learning_rate = 0.001

In [41]:
# Model
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)
    
    def forward(self, x):
        out = self.linear(x)
        return out

In [42]:
model = LogisticRegression(input_size, num_classes)

In [43]:
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [44]:
# Training the Model

for epoch in range(num_epochs):
    for i, (document, _, labels) in enumerate(train_iter):
#     for i, (document, _, labels) in enumerate(train_loader):
        document = Variable(document).float()
        labels = Variable(labels).type(torch.LongTensor)
        
        optimizer.zero_grad()
        outputs = model(document)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # Forward + Backward + Optimize
#         def closure():
#             optimizer.zero_grad()
#             outputs = model(document)
#             loss = criterion(outputs, labels)
#             print('loss:', loss.item())
#             loss.backward()
#             return loss
#         optimizer.step(closure)


        if (i+1) % 100 == 0:
            print ('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' 
                   % (epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))

RuntimeError: size mismatch, m1: [100 x 2085164], m2: [100 x 50312] at c:\a\w\1\s\windows\pytorch\aten\src\th\generic/THTensorMath.cpp:940

In [None]:
len(train_iter.next())