In [3]:
s =  'This is a book \nNo'
s.strip()

'This is a book \nNo'

In [6]:
s = 's==ss==\n'
s.split('==')

['s', 'ss', '\n']

In [8]:
from transformers import BertModel, BertTokenizer, BertConfig

config = BertConfig.from_pretrained('bert-base-chinese', output_hidden_states=True)
bert = BertModel.from_pretrained('bert-base-chinese', config=config)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
import torch
from torch.autograd import Variable
tensored_targets = torch.zeros(10).long()
tensored_targets[torch.LongTensor([2,7])] = 1
print(tensored_targets)
print(torch.LongTensor([2,7]))
print(tensored_targets)
torch.cat(tensored_targets, 0)

tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 0])
tensor([2, 7])
tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 0])


TypeError: cat() received an invalid combination of arguments - got (Tensor, int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertConfig
from pathlib2 import Path
from tqdm import tqdm
import torch.nn.functional as F

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def collate_fn(batch):
    batched_data = []
    batched_targets = []
    paths = []
    
    for text, targets, path in batch:
        paths.append(path)
        
        tensor_targets = torch.LongTensor(targets)
        batched_targets.append(tensor_targets)
        
        bert_input = tokenizer.batch_encode_plus(text, pad_to_max_length=True, return_tensors='pt')
        batched_data.append(bert_input)
        
    return batched_data, batched_targets, paths
        

class DRCDdataset(Dataset):
    def __init__(self, data_path):
        super().__init__()
        self.files = list(Path(data_path).glob('*.txt'))
    
    def __getitem__(self, index):
        path = self.files[index]
        
        return self.read_DRCD_file(path)
    
    def __len__(self):
        return len(self.files)
    
    def read_DRCD_file(self, path):
        seperator = '=========='
        with Path(path).open('r', encoding='utf-8') as f:
            raw_text = f.read()
        paragraphs = [p for p in raw_text.strip().split(seperator) if len(p)>2]
        
        targets = []
        text = []
        for paragraph in paragraphs:
            sentences = [s for s in paragraph.split('\n') if len(s.split()) > 0]
            sentences_targets = [0 for s in sentences[:-1]]
            sentences_targets.append(1)
            targets.extend(sentences_targets)
        
        
            for sentence in sentences:
                text.append(sentence)
        
        return text, targets, path
        

class Model(nn.Module):
    def __init__(self, hidden_dim, hidden_layer, batch_size):
        super().__init__()
        
        self.config = BertConfig.from_pretrained('bert-base-chinese', output_hidden_states=True)
        self.bert = BertModel.from_pretrained('bert-base-chinese', config=self.config)
        self.hidden_dim = hidden_dim
        self.hidden_layer = hidden_layer
        self.batch_size = batch_size
        
        
        self.lstm = nn.LSTM(768, hidden_dim, hidden_layer, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2,2)
        
    def pad_document(self, d, max_document_length):
        d_length = d.size()[0]
        v = d.unsqueeze(0).unsqueeze(0)
        padded = F.pad(v, (0,0,0, max_document_length - d_length ))  # (1, 1, max_length, 768)
        shape = padded.size()
        return padded.view(shape[2], 1, shape[3])  # (max_length, 1, 768)
    
    def forward(self, batch):
        
        batched_cls_lhs = []
        doc_len = []
        for x in batch:
            _, cls_lhs, _ = self.bert(x['input_ids'], x['attention_mask'], return_dict=False)
            doc_len.append(cls_lhs.shape[0])
            batched_cls_lhs.append(cls_lhs)
        max_doc_len = max(doc_len)
        padded_doc = [self.pad_document(d,max_doc_len) for d in batched_cls_lhs]
        docs_tensor = torch.cat(padded_doc, 1)
        
        x, _ = self.lstm(docs_tensor)
        x = self.linear(x)

        return x
        

train_dataset = DRCDdataset(data_path=r'C:\Users\vince_wang\research\evaluate\8.24\DRCD\train')
train_dl = DataLoader(train_dataset, batch_size=3, collate_fn=collate_fn, shuffle=True)
val_dl = DataLoader(train_dataset, batch_size=5, collate_fn=collate_fn, shuffle=True)
model = Model(300,1,1)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
with tqdm(desc='Testing', total=len(val_dl)) as pbar:
    model.eval()
    for i, (data, targets, path) in enumerate(val_dl):
        pbar.update()
        model.zero_grad()
        out = model(data)
        print("out : ",out)
        t=out
        print("out.data.cpu : ",out.data.cpu().numpy().argmax(axis=1))
        s = F.softmax(out, 1)
        print("softmax :", s)
        print("targets : ", targets)

Testing:   0%|                                                                                   | 0/1 [00:00<?, ?it/s]

out :  tensor([[[ 0.0302,  0.0638],
         [-0.1270,  0.0902]],

        [[-0.0028,  0.0966],
         [-0.0892,  0.0539]],

        [[-0.0163,  0.1791],
         [-0.0986,  0.0594]],

        [[-0.0553,  0.1464],
         [-0.0427,  0.1128]],

        [[-0.0456,  0.1178],
         [-0.0667,  0.1029]],

        [[-0.0443,  0.1871],
         [-0.1292,  0.1443]],

        [[-0.0733,  0.1796],
         [-0.1027,  0.1617]],

        [[-0.0267,  0.1852],
         [-0.1364,  0.1602]],

        [[-0.0454,  0.2020],
         [-0.0483,  0.1681]],

        [[-0.0804,  0.2077],
         [-0.1023,  0.1831]],

        [[-0.1548,  0.1890],
         [-0.1048,  0.1923]],

        [[-0.1001,  0.1974],
         [-0.0208,  0.2049]],

        [[-0.0884,  0.1881],
         [-0.0229,  0.1732]],

        [[-0.1029,  0.1737],
         [-0.0396,  0.1322]],

        [[-0.0223,  0.1419],
         [-0.0701,  0.0939]],

        [[-0.0460,  0.2277],
         [-0.0237,  0.0964]],

        [[-0.0913,  0.2396],
    

Testing: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.67s/it]

softmax : tensor([[[0.5392, 0.4934],
         [0.4608, 0.5066]],

        [[0.5216, 0.5107],
         [0.4784, 0.4893]],

        [[0.5206, 0.5299],
         [0.4794, 0.4701]],

        [[0.4968, 0.5084],
         [0.5032, 0.4916]],

        [[0.5053, 0.5037],
         [0.4947, 0.4963]],

        [[0.5212, 0.5107],
         [0.4788, 0.4893]],

        [[0.5074, 0.5045],
         [0.4926, 0.4955]],

        [[0.5274, 0.5063],
         [0.4726, 0.4937]],

        [[0.5007, 0.5085],
         [0.4993, 0.4915]],

        [[0.5055, 0.5062],
         [0.4945, 0.4938]],

        [[0.4875, 0.4992],
         [0.5125, 0.5008]],

        [[0.4802, 0.4981],
         [0.5198, 0.5019]],

        [[0.4836, 0.5037],
         [0.5164, 0.4963]],

        [[0.4842, 0.5104],
         [0.5158, 0.4896]],

        [[0.5119, 0.5120],
         [0.4881, 0.4880]],

        [[0.4944, 0.5328],
         [0.5056, 0.4672]],

        [[0.4819, 0.5491],
         [0.5181, 0.4509]],

        [[0.4722, 0.5422],
         [0




In [11]:
print(t.shape)
print(t.data.numpy().argmax(axis=1))

torch.Size([46, 2, 2])
[[0 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]]
