In [1]:
import math
import logging
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset


from scripts.utils.hierarchy import *
from scripts.utils.processing import *
from scripts.utils.label_utils import *
from scripts.utils.data_reading import *

logging.basicConfig(level=logging.INFO )

In [2]:
# Number of CUDA devices PyTorch reports as visible.
num_gpus = torch.cuda.device_count()

In [3]:
# Use the GPU only when CUDA is available and at least one device was counted; otherwise use the CPU.
device = torch.device("cuda" if (torch.cuda.is_available() and num_gpus > 0) else "cpu")

In [4]:
# Display the selected device.
device

device(type='cpu')

In [5]:
# Passed to LIBSVM_Reader as its third argument and reused later as the model's input size.
n_components = 300

In [6]:
class DatasetIterator:
    """Row-level access to the documents read by LIBSVM_Reader.

    Construction loads the category hierarchy and the document table; records
    are then looked up by the value of their ``doc_id`` column.
    """

    def __init__(self, datafile, catfile):
        """Load the hierarchy from `catfile` and the document table from `datafile`."""
        self.datafile = datafile
        hierarchy = HierarchyUtils(catfile, False, False)
        self.cat = hierarchy
        # Requested with neighbours=True; DatasetModule indexes the result as wn[0] / wn[1].
        self.wn = hierarchy.generate_vectors(device = device, neighbours = True)
        reader = LIBSVM_Reader(self.datafile, True, n_components)
        self.lib_data = reader
        self.df = reader.data_df

    def read_df(self, idx):
        """Return ``(doc_vector, doc_labels, row_label)`` for the first row whose doc_id equals `idx`.

        Raises IndexError when no row matches.
        """
        frame = self.df
        matching_rows = frame.index[frame["doc_id"] == idx]
        row = matching_rows[0]
        vector = frame.at[row, "doc_vector"]
        labels = frame.at[row, "doc_labels"]
        return vector, labels, row

    def __getitem__(self, _id):
        """Look up the record whose doc_id equals `_id` (see read_df)."""
        return self.read_df(_id)

    def __iter__(self):
        """Yield the record of every doc_id, in table order."""
        yield from map(self.__getitem__, self.df["doc_id"])

class DatasetModule(Dataset):
    """Map-style dataset pairing each document with its encoded labels.

    Wraps a DatasetIterator and, per item, returns the document vector, the raw
    labels, the row label and the encoded ``(label_vector, yin)`` pair.
    """

    def __init__(self, root_location, cat_file):
        """Load the data at `root_location` together with the hierarchy in `cat_file`."""
        self.iter = DatasetIterator(root_location, cat_file)
        self.lmbda = self.lambda_param()

    def lambda_param(self):
        """Return the sum over all nodes of ``0.5 * ||w_n - w_pi||_2 ** 2``.

        ``w_n`` is a node's vector (``wn[0]``) and ``w_pi`` is the mean of the
        vectors stored for the same node in ``wn[1]``. Returns ``0.0`` when
        there are no nodes.
        """
        node_vectors, neighbour_vectors = self.iter.wn[0], self.iter.wn[1]
        total = 0.0
        for n_node, w_n in node_vectors.items():
            w_pi = torch.mean(list2tensor(neighbour_vectors[n_node]), dim=0)
            # Accumulate every node's term so the result does not depend on
            # which node happens to be iterated last.
            total = total + 0.5 * torch.norm(w_n - w_pi, 2) ** 2
        return total

    def encode_labels(self, labels):
        """Encode `labels` as ``(label_vector, yin)``.

        ``label_vector`` is the mean of the labels' node vectors; ``yin`` has
        one entry per key of ``node2id``, +1 at the given labels and -1
        elsewhere. An empty `labels` raises ZeroDivisionError.
        """
        node2id = self.iter.cat.node2id
        yin = torch.full((len(node2id),), -1.0, dtype=torch.float32)
        label_vector = 0
        for l in labels:
            yin[node2id[l]] = 1.
            label_vector += self.iter.wn[0][l]
        label_vector = label_vector/len(labels)
        return label_vector, yin

    def __len__(self):
        """Number of rows in the document table."""
        return len(self.iter.df["doc_id"])

    def __load(self, idx):
        """Fetch the record for `idx` and append its encoded labels."""
        doc_vec, doc_labels, _id = self.iter[idx]
        return doc_vec, doc_labels, _id, self.encode_labels(doc_labels)

    def __getitem__(self, idx):
        """Return ``(doc_vector, doc_labels, row_label, (label_vector, yin))``.

        NOTE(review): `idx` is matched against the ``doc_id`` column, whereas a
        DataLoader passes positions in ``range(len(self))``. This presumably
        relies on doc_ids being exactly 0..len-1 — confirm against LIBSVM_Reader.
        """
        return self.__load(idx)

In [7]:
# Training dataset: the rcv1.train file plus the topic hierarchy (paths are relative to the working directory).
train_data = DatasetModule("rcv1.tar/RCV1_1/rcv1.train.ltc.svm", "rcv1.tar/RCV1_1/rcv1.topic.hierarchy")

35it [00:00, 34869.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 284.98it/s]
100%|█████████████████████████████████████████████████████████████████████████| 23149/23149 [00:00<00:00, 35445.94it/s]


In [8]:
# Test dataset: the rcv1.test file with the same topic hierarchy (paths are relative to the working directory).
test_data = DatasetModule("rcv1.tar/RCV1_1/rcv1.test.ltc.svm", "rcv1.tar/RCV1_1/rcv1.topic.hierarchy")

35it [00:00, 69772.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 398.70it/s]
100%|███████████████████████████████████████████████████████████████████████| 781265/781265 [00:21<00:00, 35590.49it/s]


In [None]:
# Mini-batch size passed to the training DataLoader.
batch_size = 64

In [None]:
# Shuffled mini-batches for training. Pinned host memory only speeds up
# host-to-GPU copies, so request it only when CUDA is actually available.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle = True, pin_memory=torch.cuda.is_available())
# valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle = False)

In [None]:
# Pull a single batch to inspect its structure. The built-in next() is used
# because iterators are not guaranteed to expose a .next() method.
train_iter = iter(train_loader)
doc_vec, doc_labels, _id, wn = next(train_iter)

In [None]:
# Show the shapes of the sampled batch: the document vectors, then the two
# parts of the encoded labels (wn[0] and wn[1]), each followed by a separator line.
print(doc_vec.shape)
print("*"*50)
print(wn[0].shape)
print("-"*50)
print(wn[1].shape)
print("_"*50)

In [None]:
# Hyper Parameters 
# Model input width; equals the n_components value given to LIBSVM_Reader.
input_size = n_components
# input_size = train_data.data[0].shape[1] #2085164

# One model output per entry of N_all_nodes.
# NOTE(review): encode_labels sizes yin by len(node2id) — confirm both counts agree.
num_classes = len(train_data.iter.cat.N_all_nodes) #50312 --> n (16)
num_epochs = 15 # use many more epochs with L-BFGS (a second-order method); otherwise fewer is enough
learning_rate = 0.001 # alternatives noted: 1e-4, 0.0005

In [None]:
# Display the CUDA version reported by this PyTorch build.
torch.version.cuda

In [None]:
# Display the cuDNN version reported by PyTorch.
torch.backends.cudnn.version()

In [None]:
# Turn on cuDNN's benchmark mode.
# NOTE(review): the model defined below is a single linear layer, so this likely has no effect — confirm.
torch.backends.cudnn.benchmark = True

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def gather_outputs(data, model):
    """Run `model` over `data` and collect thresholded predictions.

    Args:
        data: iterable of ``(doc_vec, doc_labels, _id, label_vecs)`` items,
            where ``label_vecs`` is indexed as ``(label_vector, yin)``. Must
            support ``len()`` for the periodic progress report.
        model: callable invoked as ``model(doc_vec, label_vector, yin)``.

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays with one row per item. A prediction
        is +1 where the model output is at or above that item's mean output and
        -1 elsewhere.

    NOTE(review): `yin` (the ground truth) is passed to the model during
    evaluation — confirm this is intended.
    """
    y_true = []
    y_pred = []
    logging.info("Evaluating ...")
    with torch.no_grad():
        for index, (doc_vec, doc_labels, _id, label_vecs) in enumerate(data):
            label_vec = label_vecs[0].squeeze()
            yin = label_vecs[1]
            output = model(doc_vec, label_vec, yin)

            # Report raw and thresholded outputs plus a running F1 every 500 items.
            report = (index+1) % 500 == 0
            if report:
                print(output)

            threshold = torch.mean(output).item()

            # Binarise in one step. Two successive in-place masked assignments
            # would re-test the freshly written 1s against the threshold and
            # flip them to -1 whenever the threshold exceeds 1.
            output = torch.where(
                output >= threshold,
                torch.ones_like(output),
                -torch.ones_like(output),
            )

            y_pred.append(output.cpu().view(-1).numpy())
            y_true.append(yin.numpy())

            if report:
                print(output, yin)
                test_f1 = f1_score(np.asarray(y_true), np.array(y_pred), average="micro")
                print("Index: {}/{} F1: {}".format(index+1, len(data), test_f1))

    # The final score is left to the caller, which computes it from the returned arrays.
    return np.asarray(y_true), np.array(y_pred)

In [None]:
# Model
class LogisticRegression(nn.Module):
    """Bias-free linear layer whose forward pass returns an element-wise logistic loss.

    Args:
        in_features: input width; defaults to the notebook-level ``input_size``.
        out_features: output width; defaults to the notebook-level ``num_classes``.
    """

    def __init__(self, in_features=None, out_features=None):
        super(LogisticRegression, self).__init__()
        # Fall back to the notebook globals so LogisticRegression() keeps working.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = num_classes
        self.linear1 = nn.Linear(in_features, out_features, False)
        # Kept for backward compatibility; forward() no longer calls it.
        self.exp = torch.exp

    def forward(self, x, labels, yin):
        """Return ``log(1 + exp(-y * yin))`` where ``y = linear1(x) * labels``.

        Evaluated with softplus, which equals ``log(1 + exp(.))`` but does not
        overflow to inf for large arguments.
        """
        x1 = self.linear1(x)
        y  = x1.mul(labels)
        return F.softplus(-y*yin)

In [None]:
# Instantiate the model with the sizes defined above.
# NOTE(review): nothing in the visible cells moves the model or the batches to `device` — confirm CPU-only training is intended.
model = LogisticRegression()

In [None]:
# NOTE(review): the model's forward pass already returns a logistic loss, and the
# training loop passes label_vec as the target — confirm BCEWithLogitsLoss is the intended criterion.
criterion = nn.BCEWithLogitsLoss()
# weight_decay is documented as a float; lmbda comes out of tensor arithmetic, so convert it explicitly.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=float(train_data.lmbda))

In [None]:
# Number of mini-batches per epoch.
total_step = len(train_loader)

In [None]:
# Display the number of mini-batches per epoch.
total_step

In [None]:
# Release cached GPU memory held by PyTorch's caching allocator before training.
torch.cuda.empty_cache()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# History containers: test-set F1 values and per-step training losses.
monitor = {key: [] for key in ("test_f1", "loss")}

In [None]:
# Train for num_epochs passes over the training loader, recording the loss of every step.
for epoch in range(num_epochs):
    for doc_vec, doc_labels, _id, label_vecs in tqdm(train_loader):

        # label_vecs is the (label_vector, yin) pair built by DatasetModule.encode_labels.
        label_vec = label_vecs[0].squeeze()
        yin = label_vecs[1]

        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        output = model(doc_vec, label_vec, yin)

        # NOTE(review): `output` is already a logistic loss and `label_vec` is not a
        # 0/1 target — confirm this criterion call is what was intended.
        loss = criterion(output, label_vec)
        loss.backward()

        optimizer.step()

        monitor["loss"].append(loss.item())

In [None]:
# Per-step training loss recorded by the training loop.
plt.plot(monitor["loss"])
plt.xlabel("Training step")
plt.ylabel("Loss")
plt.title("Training loss");

In [None]:
# Evaluate on the test set and record the micro-averaged F1.
# Note: test_data is iterated directly (one document at a time), not through a DataLoader.
y_true, y_pred = gather_outputs(test_data, model)
test_f1 = f1_score(y_true, y_pred, average="micro")
logging.info("Test F1: {}".format(test_f1))
monitor["test_f1"].append(test_f1)

In [None]:
# A single recorded score draws nothing with a bare line, so add point markers.
plt.plot(monitor["test_f1"], marker="o")
plt.xlabel("Evaluation run")
plt.ylabel("Micro F1")
plt.title("Test F1");