In [20]:
import os
import torch
import torch.nn as nn
import sys
import torch.utils.data as D
import matplotlib.pyplot as plt
import numpy as np

In [2]:
fp = '/home/t/Documents/genei/bert_sum/BertSum-master/bert_data/cnndm.test.0.bert.pt'

# initial attempt was to use concat dataset
the kernel kept dying  
this might be because we nee to load the dataset to get the length. we decided to try again with iterdataset instead

In [3]:
class IndividualFileDataset(D.Dataset):
    def __init__(self, fp):
        self.fp = fp
        self.docs = torch.load(self.fp)
        
    def __len__(self):
        return len(self.docs)
    
    def __getitem__(self, idx):
        s = self.docs[idx]
        tup = (
            s['src'],
#             s['segs'],
#             s['clss'],
            s['labels']
        )
        return tup

In [6]:
def get_file_paths(file_type):
    rootdir = '/home/t/Documents/genei/bert_sum/BertSum-master/bert_data'
    fps = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file_type in file:
                fps.append(os.path.join(subdir, file))
    return fps

In [7]:
print('Number of training examples: ', len(get_file_paths('train')))
print('Number of validation examples: ', len(get_file_paths('valid')))
print('Number of testing examples: ', len(get_file_paths('test')))

Number of training examples:  144
Number of validation examples:  7
Number of testing examples:  6


In [8]:
train_datasets = [IndividualFileDataset(fp) for fp in get_file_paths('train')]

KeyboardInterrupt: 

In [None]:
train_dataset = D.ConcatDataset(train_datasets)

In [None]:
train_dataset.__len__()

In [None]:
train_loader = D.DataLoader(train_dataset, batch_size=1)

In [None]:
for i in train_loader:
    pass

# trying with iterdataset

In [5]:
def get_file_paths(file_type):
    rootdir = '/home/t/Documents/genei/bert_sum/BertSum-master/bert_data'
    fps = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file_type in file:
                fps.append(os.path.join(subdir, file))
    return fps

In [6]:
class IndividualFileDataset(D.IterableDataset):
    def __init__(self, fp):
        super(IndividualFileDataset).__init__()
        self.fp = fp
        
    def __iter__(self):
        docs = torch.load(self.fp)
        for s in docs:
            tup = (
                s['src'],
                s['segs'],
                s['clss'],
                s['labels']
            )
            yield tup

In [7]:
train_dataset = D.ChainDataset([IndividualFileDataset(fp) for fp in get_file_paths('train')])

In [8]:
class Batch:
    def __init__(self, src, segs, clss, labels, mask_attn, mask_clss):
        self.src = src
        self.segs = segs
        self.clss = clss
        self.labels = labels
        self.mask_attn = mask_attn
        self.mask_clss = mask_clss

In [9]:
def _yang_pad(data, pad_id):
    width = max(len(d) for d in data)
    rtn_data = [d + [pad_id] * (width - len(d)) for d in data]
    return rtn_data

In [259]:
def collate_fn(batch):
    src = _yang_pad([s[0] for s in batch], 0)
    segs = _yang_pad([s[1] for s in batch], 0)
    clss = _yang_pad([s[2] for s in batch], -1)
    labels = _yang_pad([s[3] for s in batch], 0)
    
    # Ensure that masks initially specified as 0s and 1s
    # are converted to float32 tensors
    src = torch.Tensor(src).type(torch.long)
    segs = torch.Tensor(segs).type(torch.float32)
    clss = torch.Tensor(clss).type(torch.float32)
    labels = torch.Tensor(labels).type(torch.float32)
    
    # Self attention mask to deal with variable sentence lengths inside bert itself
    mask_attn = 1 - (src==0).type(torch.float32)
    
     # Self attention mask to deal with variable sentence length in fine tuning layers
    mask_clss = 1- (clss==-1).type(torch.float32)
    
    return Batch(src, segs, clss, labels, mask_attn, mask_clss)

In [260]:
train_loader = D.DataLoader(train_dataset, batch_size=256, collate_fn=collate_fn)

In [261]:
class Bert(nn.Module):
    "Wrapper for the pretrained Bert module"
    def __init__(self, temp_dir, load_pretrained_bert, bert_config):
        super(Bert, self).__init__()
        if load_pretrained_bert:
            self.model = BertModel.from_pretrained('bert-base-uncased',cache_dir=temp_dir)
        else:
            self.model = BertModel(bert_config)
    
    def forward(self, x, segs, mask_attn):
        x = x.type(torch.long)
        segs = segs.type(torch.long)
        encoded_layers, _=self.model(x, segs, attention_mask=mask_attn)
        final_vec = encoded[-1]
        return final_vec

In [262]:
class Classifier(nn.Module):
    "Simple logistic regression fine tuning layer"
    def __init__(self, hidden_size):
        super(Classifier, self).__init__()
        self.linear1 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, mask_clss):
        h = self.linear1(x).squeeze(-1) # squeeze(-1) removes last axis
        sent_scores = self.sigmoid(h) * mask_clss
        return sent_scores

In [263]:
class Summarizer(nn.Module):
    "State of the art extractive summarization"
    def __init__(self, language_model, finetune_model):
        
        super(Summarizer, self).__init__()
        self.language_model = language_model
        self.finetune_model = finetune_model
        
        self.to('cpu')
        
    def forward(self, x, segs, clss, mask_attn, mask_clss):
        # Pass input into language model
        final_vec = self.language_model(x, segs, mask_attn)
        
         # Select out only clss vectors
        encoded_clss_tokens = final_vec[torch.arange(final_vec.size(0)).unsqueeze(1), clss]
        
         # For each of the 768 bert indices, apply same value of mask
        encoded_clss_tokens *= mask_clss[:,:,None]
        
        # Put the clss tokens into fine tune layers
        sent_scores = self.finetune_model(encoded_clss_tokens, mask_clss).squeeze(-1)
        
        return sent_scores, mask_clss

In [264]:
for batch in train_loader:
    print(batch.mask_clss.shape)
    break
    

torch.Size([256, 36])


In [265]:
from pytorch_transformers import BertModel

In [266]:
language_model = Bert(temp_dir='../temp' , load_pretrained_bert=True, bert_config=None)

In [267]:
finetune_model = Classifier(512)

In [268]:
model = Summarizer(language_model, finetune_model)

In [269]:
criterion = nn.BCELoss()

In [270]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [271]:
for ep in range(100):
    for idx, batch in tqdm(enumerate(train_loader)):
        optimizer.zero_grad()
        loss = torch.Tensor([0.0]).to('cpu')
        output = model(batch.src, batch.segs, batch.clss, batch.mask_attn, batch.mask_clss)
        loss += criterion(output, batch.labels)
        loss.backward()
        optimizer.step()

0it [00:00, ?it/s]

RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 3221225472 bytes. Error code 12 (Cannot allocate memory)


In [247]:
x = torch.tensor([1,1234])

In [250]:
x.to('cpu')

tensor([   1, 1234])

In [252]:
x

tensor([   1, 1234])

In [254]:
x.type(torch.long)

tensor([   1, 1234])

In [256]:
import torch
from pytorch_transformers import *

# PyTorch-Transformers has a unified API
# for 7 transformer architectures and 30 pretrained weights.
#          Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased')]

# Let's encode some text in a sequence of hidden-states using each model:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

In [258]:
input_ids.type()

'torch.LongTensor'