In [1]:
import os
import re
import pandas as pd 
import numpy as np 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.autograd import  Variable 
from torch.utils.data import DataLoader, Dataset
from AttentionTransformer.TrainClassificationTransformer import * 
from AttentionTransformer.ClassificationDataset import *
from AttentionTransformer.utilities import count_model_parameters
from AttentionTransformer.Encoder import * 
from AttentionTransformer.Decoder import *
import pickle 
from tqdm import tqdm_notebook, tqdm, trange, tnrange 
import torch.optim as optim

In [2]:
def load_pickle(filepath):
    with open(filepath, 'rb') as fp:
        return pickle.load(fp)

In [3]:
data = load_pickle('../data/tokenized_questions_classes_subclasses_dict.pkl')

In [4]:
len(data)

15447

In [5]:
class ClassificationDatasetDict(Dataset):

    def __init__(self, dataDict, seq_len):

        super(ClassificationDatasetDict, self).__init__()

        self.dataDict = dataDict 
        self.seq_len = seq_len 

    def pad_sequences(self, seq):

        if len(seq) > self.seq_len:

            seq = seq[:self.seq_len]

        op = torch.tensor(seq)
        tnsr = torch.zeros((self.seq_len))
        tnsr[:op.size(0)] = op
        return tnsr.float()

    def __len__(self):
        return len(self.dataDict)

    def __getitem__(self, ix):

        ixDict = self.dataDict[ix]
        seq = self.pad_sequences(ixDict['question-tokens'])
        cls_ = ixDict['question-class']
        subcls_ = ixDict['question-subclass']

        return {
            "source_seq": seq, 
            "class": cls_, 
            "subclass": subcls_
        }

In [6]:
[i for i in range(2)]

[0, 1]

In [12]:
class TwoHeadClassificationTransformer(nn.Module):


    def __init__(
        self, vocab_size, pad_id, CLS_label_id, num_class_heads, lst_num_cat_in_classes, emb_dim = 512, dim_model = 512, dim_inner = 2048,
        layers = 6, heads = 8, dim_key = 64, dim_value = 64, dropout = 0.1, num_pos = 200
    ):

        super(TwoHeadClassificationTransformer, self).__init__()

        self.pad_id = pad_id 

        self.encoder = Encoder(
            vocab_size, emb_dim, layers, heads, dim_key, dim_value, dim_model, dim_inner, pad_id, dropout = dropout, num_pos = num_pos
        )

        self.decoder = Decoder(
            vocab_size, emb_dim, layers, heads, dim_key, dim_value, dim_model, dim_inner, pad_id, dropout = dropout, num_pos = num_pos
        )

        # self.target_word_projection = nn.Linear(dim_model, num_classes, bias = False)

        self.class_heads = []

        for ix in range(num_class_heads):

            self.class_heads.append(nn.Linear(dim_model, lst_num_cat_in_classes[ix]))

        for parameter in self.parameters():

            if parameter.dim() > 1:

                nn.init.xavier_uniform_(parameter)

        assert dim_model == emb_dim, f'Dimensions of all the moduel outputs must be the same'

        self.x_logit_scale = 1

        self.cls_label_id = CLS_label_id

    def get_pad_mask(self, sequence, pad_id):

        return (sequence != pad_id).unsqueeze(-2)

    def get_subsequent_mask(self, sequence):

        batch_size, seq_length = sequence.size() 

        subsequent_mask = (
            1 - torch.triu(
                torch.ones((1, seq_length, seq_length), device=sequence.device), diagonal = 1
            )
        ).bool()

        return subsequent_mask

    def make_target_seq(self, batch_size):

        trg_tnsr = torch.zeros((batch_size, 1))
        trg_tnsr[trg_tnsr == 0] = self.cls_label_id
        return trg_tnsr

    def forward(self, source_seq):

        

        b, l = source_seq.size()

        target_seq = self.make_target_seq(b).to(source_seq.device)

        source_mask = self.get_pad_mask(source_seq, self.pad_id)
        target_mask = self.get_pad_mask(target_seq, self.pad_id) & self.get_subsequent_mask(target_seq)

        encoder_output = self.encoder(source_seq, source_mask)
        decoder_output = self.decoder(target_seq, target_mask, encoder_output, source_mask)

        decoder_output = decoder_output.view(decoder_output.size(0), -1)

        # seq_logits = self.target_word_projection(decoder_output)

        class_seq_logits = [ch(decoder_output) for ch in self.class_heads]

        return class_seq_logits


In [20]:
params = nn.ModuleList([nn.Linear(4, 2) for i in range(2)])

In [21]:
len(params)

2

In [22]:
params[0]

Linear(in_features=4, out_features=2, bias=True)

In [8]:
ds = ClassificationDatasetDict(data, 100)

In [9]:
dl = DataLoader(ds, batch_size=16)

In [10]:
d = next(iter(dl))

In [13]:
model = TwoHeadClassificationTransformer(
    10000, 0, 2, 2, [6, 47]
)

In [14]:
count_model_parameters(model) // 1e6

54.0

In [15]:
model = model.to('cuda')

In [16]:
d['source_seq']

tensor([[2.0000e+00, 7.6200e+02, 1.0760e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.0000e+00, 1.2380e+03, 3.0040e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.0000e+00, 7.6200e+02, 6.7200e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [2.0000e+00, 6.2900e+02, 2.1880e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.0000e+00, 1.2380e+03, 4.3700e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.0000e+00, 1.2380e+03, 6.8760e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [17]:
o = model(d['source_seq'].cuda())

RuntimeError: Tensor for argument #3 'mat2' is on CPU, but expected it to be on GPU (while checking arguments for addmm)

In [37]:
len(o)

2

In [41]:
# o[0]

In [40]:
# o[1]