In [1]:
import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm
import pandas as pd
from torchtext.data import get_tokenizer
from sklearn import preprocessing
import math
from sklearn.metrics import f1_score,precision_score,recall_score
import jsonlines
import json
window_size=31
torch.cuda.empty_cache()


In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LABEL = data.Field(sequential=False,use_vocab=False,is_target=True)
TEXT = data.Field(fix_length=window_size,lower=True)



In [73]:
train = []
with jsonlines.open('maven/train.jsonl') as reader:
    for obj in reader:
        train += [obj]
        
test = []
with jsonlines.open('maven/test.jsonl') as reader:
    for obj in reader:
        test += [obj]
        
dev = []
with jsonlines.open('maven/valid.jsonl') as reader:
    for obj in reader:
        dev += [obj]

In [70]:
# train = []
# with jsonlines.open('DMCNN_MAVEN/train.jsonl') as reader:
#     for obj in reader:
#         train += [obj]
        
# test = []
# with jsonlines.open('DMCNN_MAVEN/test.jsonl') as reader:
#     for obj in reader:
#         test += [obj]
        
# dev = []
# with jsonlines.open('DMCNN_MAVEN/dev.jsonl') as reader:
#     for obj in reader:
#         dev += [obj]

In [71]:
# count = dict()
# for t in train:
#     pre_c = count.get(t['label'],0)
#     count[t['label']] = pre_c +1

In [72]:
count

{1: 810,
 2: 178,
 3: 3145,
 4: 662,
 5: 2728,
 6: 1236,
 7: 417,
 8: 153,
 9: 772,
 10: 2165,
 11: 981,
 12: 1176,
 13: 1001,
 14: 346,
 15: 777,
 16: 654,
 0: 2001,
 17: 283,
 18: 242,
 19: 2856,
 20: 1625,
 21: 1432,
 22: 114,
 23: 2920,
 24: 891,
 25: 453,
 26: 793,
 27: 275,
 28: 505,
 29: 1120,
 30: 678,
 31: 2628,
 32: 258,
 33: 459,
 34: 1024,
 35: 22,
 36: 1392,
 37: 903,
 38: 203,
 39: 626,
 40: 620,
 41: 911,
 42: 554,
 43: 764,
 44: 350,
 45: 1022,
 46: 1282,
 47: 103,
 48: 97,
 49: 225,
 50: 709,
 51: 520,
 52: 199,
 53: 504,
 54: 677,
 55: 350,
 56: 104,
 57: 124,
 58: 842,
 59: 251,
 60: 936,
 61: 1392,
 62: 115,
 63: 265,
 64: 155,
 65: 684,
 66: 191,
 67: 93,
 68: 561,
 69: 618,
 70: 817,
 71: 434,
 72: 224,
 73: 379,
 74: 101,
 75: 163,
 76: 91,
 77: 684,
 78: 410,
 79: 471,
 80: 55,
 81: 326,
 82: 1653,
 83: 57,
 84: 2460,
 85: 95,
 86: 132,
 87: 640,
 88: 4,
 89: 922,
 90: 199,
 91: 533,
 92: 48,
 93: 85,
 94: 403,
 95: 167,
 96: 704,
 97: 764,
 98: 282,
 99: 295,
 

In [74]:
train_dict = []
negative_c = 0
for t in train:
    content = t['content']
    events = t['events']
    negative_triggers = t['negative_triggers']
    for event in events:
        label = event['type_id']
        for mention in event['mention']:
            sentence = content[mention['sent_id']]
            
            trigger_first_token_index = mention['offset'][0]
            left_index = max(0,trigger_first_token_index - int((window_size-1)/2))
            right_index = trigger_first_token_index+int((window_size-1)/2) +1 #超过也没关系
            
            left_tokens = sentence['tokens'][left_index:trigger_first_token_index]
            right_tokens = sentence['tokens'][trigger_first_token_index+1:right_index]
            
            token_half_len = int((window_size-1)/2)
            left_pad_num=0
            right_pad_num=0
            if len(left_tokens)!=token_half_len:#需要padding
                left_pad_num = token_half_len - len(left_tokens)
            
            if len(right_tokens)!=token_half_len:#需要padding
                right_pad_num = token_half_len - len(right_tokens)    
            
            text_windows = [TEXT.pad_token] * left_pad_num + left_tokens \
                            + [sentence['tokens'][trigger_first_token_index]] \
                            + right_tokens + [TEXT.pad_token] * right_pad_num
                    
            if len(text_windows) != window_size:
                print('error')
            
            train_dict += [{
                'text':text_windows,
                'label':label
            }]
    for negative_trigger in negative_triggers:
        if negative_c >700:
            break
        sentence = content[negative_trigger['sent_id']]
        
        trigger_first_token_index = negative_trigger['offset'][0]
        left_index = max(0,trigger_first_token_index - int((window_size-1)/2))
        right_index = trigger_first_token_index+int((window_size-1)/2) +1 #超过也没关系

        left_tokens = sentence['tokens'][left_index:trigger_first_token_index]
        right_tokens = sentence['tokens'][trigger_first_token_index+1:right_index]

        token_half_len = int((window_size-1)/2)
        left_pad_num=0
        right_pad_num=0
        if len(left_tokens)!=token_half_len:#需要padding
            left_pad_num = token_half_len - len(left_tokens)

        if len(right_tokens)!=token_half_len:#需要padding
            right_pad_num = token_half_len - len(right_tokens)    

        text_windows = [TEXT.pad_token] * left_pad_num + left_tokens \
                        + [sentence['tokens'][trigger_first_token_index]] \
                        + right_tokens + [TEXT.pad_token] * right_pad_num

        if len(text_windows) != window_size:
            print('error')
        negative_c +=1
        train_dict += [{
            'text':text_windows,
            'label':0
        }]

    
with jsonlines.open('DMCNN_MAVEN/train.jsonl', mode='w') as writer:
    writer.write_all(train_dict)

In [76]:
dev_dict = []
negative_c = 0
for t in dev:
    content = t['content']
    events = t['events']
    negative_triggers = t['negative_triggers']
    for event in events:
        label = event['type_id']
        for mention in event['mention']:
            sentence = content[mention['sent_id']]
            
            trigger_first_token_index = mention['offset'][0]
            left_index = max(0,trigger_first_token_index - int((window_size-1)/2))
            right_index = trigger_first_token_index+int((window_size-1)/2) +1 #超过也没关系
            
            left_tokens = sentence['tokens'][left_index:trigger_first_token_index]
            right_tokens = sentence['tokens'][trigger_first_token_index+1:right_index]
            
            token_half_len = int((window_size-1)/2)
            left_pad_num=0
            right_pad_num=0
            if len(left_tokens)!=token_half_len:#需要padding
                left_pad_num = token_half_len - len(left_tokens)
            
            if len(right_tokens)!=token_half_len:#需要padding
                right_pad_num = token_half_len - len(right_tokens)    
            
            text_windows = [TEXT.pad_token] * left_pad_num + left_tokens \
                            + [sentence['tokens'][trigger_first_token_index]] \
                            + right_tokens + [TEXT.pad_token] * right_pad_num
                    
            if len(text_windows) != window_size:
                print('error')
            
            dev_dict += [{
                'text':text_windows,
                'label':label
            }]
    for negative_trigger in negative_triggers:
        if negative_c >700:
            break
        sentence = content[negative_trigger['sent_id']]
        
        trigger_first_token_index = negative_trigger['offset'][0]
        left_index = max(0,trigger_first_token_index - int((window_size-1)/2))
        right_index = trigger_first_token_index+int((window_size-1)/2) +1 #超过也没关系

        left_tokens = sentence['tokens'][left_index:trigger_first_token_index]
        right_tokens = sentence['tokens'][trigger_first_token_index+1:right_index]

        token_half_len = int((window_size-1)/2)
        left_pad_num=0
        right_pad_num=0
        if len(left_tokens)!=token_half_len:#需要padding
            left_pad_num = token_half_len - len(left_tokens)

        if len(right_tokens)!=token_half_len:#需要padding
            right_pad_num = token_half_len - len(right_tokens)    

        text_windows = [TEXT.pad_token] * left_pad_num + left_tokens \
                        + [sentence['tokens'][trigger_first_token_index]] \
                        + right_tokens + [TEXT.pad_token] * right_pad_num

        if len(text_windows) != window_size:
            print('error')
        negative_c +=1
        dev_dict += [{
            'text':text_windows,
            'label':0
        }]
    
with jsonlines.open('DMCNN_MAVEN/dev.jsonl', mode='w') as writer:
    writer.write_all(dev_dict)

In [77]:
test_dict = []
for t in test:
    content = t['content']
    candidates = t['candidates']
    for candidate in candidates:
        sentence = content[candidate['sent_id']]

        trigger_first_token_index = candidate['offset'][0]
        left_index = max(0,trigger_first_token_index - int((window_size-1)/2))
        right_index = trigger_first_token_index+int((window_size-1)/2) +1 #超过也没关系

        left_tokens = sentence['tokens'][left_index:trigger_first_token_index]
        right_tokens = sentence['tokens'][trigger_first_token_index+1:right_index]

        token_half_len = int((window_size-1)/2)
        left_pad_num=0
        right_pad_num=0
        if len(left_tokens)!=token_half_len:#需要padding
            left_pad_num = token_half_len - len(left_tokens)

        if len(right_tokens)!=token_half_len:#需要padding
            right_pad_num = token_half_len - len(right_tokens)    

        text_windows = [TEXT.pad_token] * left_pad_num + left_tokens \
                        + [sentence['tokens'][trigger_first_token_index]] \
                        + right_tokens + [TEXT.pad_token] * right_pad_num

        if len(text_windows) != window_size:
            print('error')

        test_dict += [{
            'text':text_windows,
            'label':0
        }]


    
with jsonlines.open('DMCNN_MAVEN/test.jsonl', mode='w') as writer:
    writer.write_all(test_dict)

In [78]:
train,test,dev=data.TabularDataset.splits(
    path="DMCNN_MAVEN",train='train.jsonl',
    test='test.jsonl',validation='dev.jsonl',
    format='json',fields={
        'text': ('text', TEXT),
        'label': ('label', LABEL)
    }
)



In [79]:
TEXT.build_vocab(train,test,dev, vectors=GloVe())

In [80]:
pretrained_embeddings=TEXT.vocab.vectors

In [81]:
BATCH_SIZE = 128
train_iterator, test_iterator, dev_iterator = data.BucketIterator.splits(
    (train, test, dev), 
    batch_size = BATCH_SIZE, 
    sort=False,
    device = DEVICE,
    
)



In [82]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) # (batch_size,seq_len,embed,size)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [83]:
class CNN_Block(nn.Module):
    def __init__(self,window_size,out_channel,kernel_size,embedding_dim,drop_out):
        super(CNN_Block,self).__init__()
        self.cnn = nn.Conv2d(in_channels=1,out_channels=out_channel,kernel_size=(kernel_size,embedding_dim),padding=(kernel_size-1,0))
        self.relu = nn.ReLU()
        self.drop_out = nn.Dropout(p=drop_out)
        self.maxpool1d = nn.MaxPool1d(window_size+kernel_size-1)
    
    def forward(self,x):
        cnn_out = self.cnn(x).squeeze(dim=-1)
        relu_out = self.relu(cnn_out)
        drop_out_result = self.drop_out(relu_out)
        return self.maxpool1d(drop_out_result)

In [84]:
class DMCNN(nn.Module):
    def __init__(self,window_sizes,kernel_sizes,out_channel,pretrained_embeddings,drop_out,total_classes):
        super(DMCNN, self).__init__()
        self.window_size = window_size
        self.word_embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        self.position_encoding = PositionalEncoding(d_model = pretrained_embeddings.size(1),max_len=window_size)
        
        self.cnn_blocks = nn.ModuleList([
            CNN_Block(window_size,
                      out_channel,
                      kernel_size,
                      pretrained_embeddings.size(1),
                      drop_out) for kernel_size in kernel_sizes
        ])
        self.drop_out = nn.Dropout(p=drop_out)
        self.linear = nn.Linear(in_features=out_channel*len(kernel_sizes)+pretrained_embeddings.size(1),out_features=total_classes)
        self.softmax = nn.Softmax()
        
        
    def forward(self, x):
        
        word_embedding = self.word_embedding(x) # (batch_size,win_size,embed_size)
        position_encoding = self.position_encoding(word_embedding) # (batch_size,win_size,embed_size)
        we_pe = position_encoding.unsqueeze(dim=1) # (batch_size,1,win_size,embed_size)
        cnn_outs = []
        for cnn_block in self.cnn_blocks:
            cnn_outs += [cnn_block(we_pe)]
        total_cnn_outs = torch.cat(cnn_outs,dim=2) # (batch_size,out_channel,1)
        total_cnn_outs = total_cnn_outs.view(total_cnn_outs.size(0),-1) #(batch_size,out_channel * kernel number)
        
        #下面是论文中没有提到的trick，把cnn输出和前半段的embedding vector和在一起进linear层
        total_cnn_outs = torch.cat((total_cnn_outs, word_embedding[:, int((window_size-1)/2)]), dim=-1)
        
        #print(total_cnn_outs.shape)
        drop_out_result = self.drop_out(total_cnn_outs)
        y = self.linear(drop_out_result)        
        return y
    

In [85]:
model = DMCNN(31,[2,3,4,5],150,pretrained_embeddings,0.5,169)
model.to(DEVICE)



DMCNN(
  (word_embedding): Embedding(48971, 300)
  (position_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (cnn_blocks): ModuleList(
    (0): CNN_Block(
      (cnn): Conv2d(1, 150, kernel_size=(2, 300), stride=(1, 1), padding=(1, 0))
      (relu): ReLU()
      (drop_out): Dropout(p=0.5, inplace=False)
      (maxpool1d): MaxPool1d(kernel_size=32, stride=32, padding=0, dilation=1, ceil_mode=False)
    )
    (1): CNN_Block(
      (cnn): Conv2d(1, 150, kernel_size=(3, 300), stride=(1, 1), padding=(2, 0))
      (relu): ReLU()
      (drop_out): Dropout(p=0.5, inplace=False)
      (maxpool1d): MaxPool1d(kernel_size=33, stride=33, padding=0, dilation=1, ceil_mode=False)
    )
    (2): CNN_Block(
      (cnn): Conv2d(1, 150, kernel_size=(4, 300), stride=(1, 1), padding=(3, 0))
      (relu): ReLU()
      (drop_out): Dropout(p=0.5, inplace=False)
      (maxpool1d): MaxPool1d(kernel_size=34, stride=34, padding=0, dilation=1, ceil_mode=False)
    )
    (3): CNN_Bl

In [86]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [97]:
num_epochs = 1000
model.train()
for epoch in range(1, num_epochs + 1):
    correct = 0
    for batch_idx,batch in enumerate(train_iterator):
        
        output = model(batch.text.t())
        l = loss(output, batch.label)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        
        predicted = torch.max(output.data,1)[1]
        correct += (predicted == batch.label).sum()
        
    break
    print('epoch %d, loss: %f, correct: %f' % (epoch, l.item(),float(correct)/float((batch_idx+1)*128)))



torch.Size([31, 128])

In [100]:
batch.label

tensor([ 19,  28,  49,   6,  20, 118,  10,  84,  90,  12,  26,   4,   4,   3,
         26,  66,  13,  21,   3,  54,  87,  51,  28,  45,  97,  77,   9,   5,
         81,  50,  31,  24,  41,  45,   3,  14,  43,  19,   6,  45,  15,  53,
         77,  19, 102,  31,  45,  72, 131,  70, 103,  16,  52,   7, 126,  51,
          0, 126, 106,  10, 126,  34,   3,   3, 104, 102,  10,  99,  58,  15,
         77,  19,  40,  36,  20,  53,  54,  31,  84,  34,  39,  65,  31, 158,
         48,  34,  99,  94,  97,  19,  13,  19,  52,   3,  41,  23,  41,  70,
         84,   3,  87, 147,  37,  58,  84,  53,  37,   3,   3,  82,  24,  24,
         46,  34,  84,  24,  46,  23,  82,  84,  23, 122,  21,  33,   3,  86,
          4,  23], device='cuda:0')

In [88]:
model.train(False)
correct = 0
l = torch.tensor([],device=DEVICE)
o = torch.tensor([],device=DEVICE)
for batch_idx,batch in enumerate(dev_iterator):
    output = model(batch.text.t())
    l = torch.cat([l,batch.label])
    o = torch.cat([o,torch.max(output.data,1)[1]])
    #predicted = torch.max(output.data,1)[1]
    #correct += (predicted == batch.label).sum()

#print('correct: %f' % (float(correct)/float((batch_idx+1)*128)))



In [89]:
f1_score(o.cpu(),l.cpu(),average='macro')

0.002739647045593597

In [90]:
precision_score(o.cpu(),l.cpu(),average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


0.0017792928011289434

In [91]:
recall_score(o.cpu(),l.cpu(),average='macro')

0.005952380952380952