# Train the model
Adapted from the original `main.py`. Intergrated with AWS SageMaker.

## Install dependencies
(actually only tqdm, since other packages are pre-installed in aws pytorch environments)

In [1]:
!pip install -r requirements.txt

Collecting tqdm
  Downloading tqdm-4.61.1-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 5.2 MB/s  eta 0:00:01
[?25hCollecting tensorboard
  Downloading tensorboard-2.5.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 27.9 MB/s eta 0:00:01
[?25hCollecting tensorboard-data-server<0.7.0,>=0.6.0
  Downloading tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 60.7 MB/s eta 0:00:01
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Downloading google_auth_oauthlib-0.4.4-py2.py3-none-any.whl (18 kB)
Collecting google-auth<2,>=1.6.3
  Downloading google_auth-1.31.0-py2.py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 67.5 MB/s eta 0:00:01
[?25hCollecting tensorboard-plugin-wit>=1.6.0
  Downloading tensorboard_plugin_wit-1.8.0-py3-none-any.whl (781 kB)
[K     |████████████████████████████████| 781 kB 57.4 MB/s eta 0:00:01
Collecting grpc

In [2]:
from collections import defaultdict
import time
import json
import numpy as np
from random import choice
from tqdm import tqdm
import model
import torch
import torch.nn as nn
from torch.autograd import Variable
#import data_prepare
import os
import torch.utils.data as Data
import torch.nn.functional as F


Define a tensorboard logger

In [3]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir='./logs')

In [4]:
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"

In [4]:
# for macOS compatibility
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

CHAR_SIZE = 128
SENT_LENGTH = 4
HIDDEN_SIZE = 64
EPOCH_NUM = 210
BATCH_SIZE = 5096

In [5]:
device

device(type='cuda')

Helper functions:

In [6]:
def get_now_time():
    a = time.time()
    return time.ctime(a)


def seq_padding(X):
    L = [len(x) for x in X]
    ML = max(L)
    # print("ML",ML)
    return [x + [0] * (ML - len(x)) for x in X]


def seq_padding_vec(X):
    L = [len(x) for x in X]
    ML = max(L)
    # print("ML",ML)
    return [x + [[1, 0]] * (ML - len(x)) for x in X]

In [7]:
class DataGenerator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def pro_res(self):
        idxs = list(range(len(self.data)))
        # print(idxs)
        np.random.shuffle(idxs)
        T, S1, S2, K1, K2, O1, O2, = [], [], [], [], [], [], []
        for i in idxs:
            d = self.data[i]
            text = d['text']
            items = {}
            items = defaultdict(list)
            for sp in d['spo_list']:
                subjectid = text.find(sp[0])
                objectid = text.find(sp[2])
                if subjectid != -1 and objectid != -1:
                    key = (subjectid, subjectid+len(sp[0])) # key is the span(start, end) of the subject
                    # items is {(S_start, S_end): list of (O_start_pos, O_end_pos, predicate_id)}
                    items[key].append(
                        (objectid, objectid+len(sp[2]), predicate2id[sp[1]]))
            if items:
                # T is list of text tokens(ids)
                T.append([char2id.get(c, 1) for c in text])  # 1是unk，0是padding
         
                # s1: one-hot vector where start of subject is 1
                # s2: one-hot vector where end of subject is 1
                s1, s2 = [0] * len(text), [0] * len(text)
                for j in items:
                    s1[j[0]] = 1
                    s2[j[1]-1] = 1
                # print(items.keys())
                # k1, k2: randomly sampled (S_start, S_end) pair?
                k1, k2 = choice(list(items.keys()))
                # o1: zero vector, the start of each O is marked with its predicate ID
                # o2: zero vector, the end of each O is marked with its predicate ID
                o1, o2 = [0] * len(text), [0] * len(text)  # 0是unk类（共49+1个类）
                for j in items[(k1, k2)]:
                    o1[j[0]] = j[2]
                    o2[j[1]-1] = j[2]
                S1.append(s1)
                S2.append(s2)
                K1.append([k1])
                K2.append([k2-1])
                O1.append(o1)
                O2.append(o2)

        T = np.array(seq_padding(T))
        S1 = np.array(seq_padding(S1))
        S2 = np.array(seq_padding(S2))
        O1 = np.array(seq_padding(O1))
        O2 = np.array(seq_padding(O2))
        K1, K2 = np.array(K1), np.array(K2)
        return [T, S1, S2, K1, K2, O1, O2]


class MyDataset(Data.Dataset):
    """
        下载数据、初始化数据，都可以在这里完成
    """

    def __init__(self, _T, _S1, _S2, _K1, _K2, _O1, _O2):
        # xy = np.loadtxt('../dataSet/diabetes.csv.gz', delimiter=',', dtype=np.float32) # 使用numpy读取数据
        self.x_data = _T
        self.y1_data = _S1
        self.y2_data = _S2
        self.k1_data = _K1
        self.k2_data = _K2
        self.o1_data = _O1
        self.o2_data = _O2
        self.len = len(self.x_data)

    def __getitem__(self, index):
        return self.x_data[index], self.y1_data[index], self.y2_data[index], self.k1_data[index], self.k2_data[index], self.o1_data[index], self.o2_data[index]

    def __len__(self):
        return self.len


def collate_fn(data):
    t = np.array([item[0] for item in data], np.int32)
    s1 = np.array([item[1] for item in data], np.int32)
    s2 = np.array([item[2] for item in data], np.int32)
    k1 = np.array([item[3] for item in data], np.int32)

    k2 = np.array([item[4] for item in data], np.int32)
    o1 = np.array([item[5] for item in data], np.int32)
    o2 = np.array([item[6] for item in data], np.int32)
    return {
        'T': torch.LongTensor(t),  # targets_i
        'S1': torch.FloatTensor(s1),
        'S2': torch.FloatTensor(s2),
        'K1': torch.LongTensor(k1),
        'K2': torch.LongTensor(k2),
        'O1': torch.LongTensor(o1),
        'O2': torch.LongTensor(o2),
    }

In [8]:
def extract_items(text_in):
    R = []
    _s = [char2id.get(c, 1) for c in text_in]
    _s = np.array([_s])
    _k1, _k2, t, t_max, mask = s_m(torch.LongTensor(_s).to(device))
    _k1, _k2 = _k1[0, :, 0], _k2[0, :, 0]
    _kk1s = []
    for i, _kk1 in enumerate(_k1):
        if _kk1 > 0.5:
            _subject = ''
            for j, _kk2 in enumerate(_k2[i:]):
                if _kk2 > 0.5:
                    _subject = text_in[i: i+j+1]
                    break
            if _subject:
                _k1, _k2 = torch.LongTensor([[i]]), torch.LongTensor(
                    [[i+j]])  # np.array([i]), np.array([i+j])
                _o1, _o2 = po_m(t.to(device), t_max.to(
                    device), _k1.to(device), _k2.to(device))
                _o1, _o2 = _o1.cpu().data.numpy(), _o2.cpu().data.numpy()

                _o1, _o2 = np.argmax(_o1[0], 1), np.argmax(_o2[0], 1)

                for i, _oo1 in enumerate(_o1):
                    if _oo1 > 0:
                        for j, _oo2 in enumerate(_o2[i:]):
                            if _oo2 == _oo1:
                                _object = text_in[i: i+j+1]
                                _predicate = id2predicate[_oo1]
                                # print((_subject, _predicate, _object))
                                R.append((_subject, _predicate, _object))
                                break
        _kk1s.append(_kk1.data.cpu().numpy())
    _kk1s = np.array(_kk1s)
    return list(set(R))

def para_extract_items(loader_res):
    t_s = loader_res["T"].to(device)
    k1 = loader_res["K1"].to(device)
    k2 = loader_res["K2"].to(device)
    s1 = loader_res["S1"].to(device)
    s2 = loader_res["S2"].to(device)
    o1 = loader_res["O1"].to(device)
    o2 = loader_res["O2"].to(device)

    ps_1, ps_2, t, t_max, mask = s_m(t_s)

    t, t_max, k1, k2 = t.to(device), t_max.to(
        device), k1.to(device), k2.to(device)
    po_1, po_2 = po_m(t, t_max, k1, k2)

    ps_1 = ps_1.to(device)
    ps_2 = ps_2.to(device)
    po_1 = po_1.to(device)
    po_2 = po_2.to(device)

    s1 = torch.unsqueeze(s1, 2)
    s2 = torch.unsqueeze(s2, 2)
    
    
    
            
def para_evaluate():
    A, B, C = 1e-10, 1e-10, 1e-10
    cnt = 0
    s_m.eval()
    po_m.eval()
    with torch.no_grad():
        for step, loader_res in tqdm(iter(enumerate(loader))):
            R = set(para_extract_items(loader_res))
            T = None
            A += len(R & T)
            B += len(R)
            C += len(T)
            cnt += 1
    return 2 * A / (B + C), A / B, A / C
    

def evaluate():
    A, B, C = 1e-10, 1e-10, 1e-10
    cnt = 0
    s_m.eval()
    po_m.eval()
    for d in tqdm(iter(dev_data)):
        if cnt > 5000:
            break
        R = set(extract_items(d['text']))
        T = set([tuple(i) for i in d['spo_list']])
        A += len(R & T)
        B += len(R)
        C += len(T)
        # if cnt % 1000 == 0:
        #     print('iter: %d f1: %.4f, precision: %.4f, recall: %.4f\n' % (cnt, 2 * A / (B + C), A / B, A / C))
        cnt += 1
    return 2 * A / (B + C), A / B, A / C


# Download training data
Skip the downloading step if you have alreay done it.

In [9]:
#!wget https://dataset-bj.cdn.bcebos.com/qianyan/DuIE_2_0.zip

In [10]:
#!unzip -j DuIE_2_0.zip -d data

Transofm raw data to easier usable format

In [11]:
# !mkdir generated
# !python trans.py

## Load training data

In [12]:
train_path = 'generated/train_data_me.json'
dev_path = 'generated/dev_data_me.json'
generated_schema_path =  'generated/schemas_me.json'
generated_char_path = 'generated/all_chars_me.json'
train_data = json.load(open(train_path))
dev_data = json.load(open(dev_path))
id2predicate, predicate2id = json.load(open(generated_schema_path))
id2predicate = {int(i): j for i, j in id2predicate.items()}
id2char, char2id = json.load(open(generated_char_path))
num_classes = len(id2predicate)

In [13]:
dg = DataGenerator(train_data)
T, S1, S2, K1, K2, O1, O2 = dg.pro_res()
# print("len",len(T))

torch_dataset = MyDataset(T, S1, S2, K1, K2, O1, O2)

In [14]:
loader = Data.DataLoader(
    dataset=torch_dataset,      # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # random shuffle for training
    num_workers=64,
    collate_fn=collate_fn,      # subprocesses for loading data
)

In [None]:
dev_dg = DataGenerator(dev_data)
T_dev, S1_dev, S2_dev, K1_dev, K2_dev, O1_dev, O2_dev = dev_dg.pro_res()
dev_dataset = MyDataset(T_dev, S1_dev, S2_dev, K1_dev, K2_dev, O1_dev, O2_dev)
dev_loader = Data.DataLoader(
    dataset=dev_dataset,      # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # random shuffle for training
    num_workers=64,
    collate_fn=collate_fn,      # subprocesses for loading data
)

### Define model and loss
Data are parallimised  to multiple GPUs

In [12]:
s_m = model.s_model(len(char2id)+2, CHAR_SIZE, HIDDEN_SIZE)
po_m = model.po_model(len(char2id)+2, CHAR_SIZE, HIDDEN_SIZE, 49)

if torch.cuda.device_count() > 1:
    print('Using', torch.cuda.device_count(), "GPUs!")
    s_m = nn.DataParallel(s_m)
    po_m = nn.DataParallel(po_m)

s_m = s_m.to(device)
po_m = po_m.to(device)

### Load model if needed
Uncomment lines below to load pre-trained model

In [15]:
breakpoint_epoch = 197
model_dir = 'models_real'
s_m = torch.load(os.path.join(model_dir, "s_{}.pkl".format(breakpoint_epoch)), map_location=device)
po_m = torch.load(os.path.join(model_dir, "po_{}.pkl".format(breakpoint_epoch)), map_location=device)

In [16]:
s_m = nn.DataParallel(s_m.module)
po_m = nn.DataParallel(po_m.module)


### Define loss metrics

In [17]:
params = list(s_m.parameters())
params += list(po_m.parameters())
optimizer = torch.optim.Adam(params, lr=0.001)

loss = torch.nn.CrossEntropyLoss().to(device)
b_loss = torch.nn.BCEWithLogitsLoss().to(device)

## Training

In [None]:
best_f1 = 0
best_epoch = 0

starting_epoch = 197

try:
    breakpoint_epoch
except NameError:
    print("breakpoint epoch not defined, start training from epoch 0")
else:
    print("continue training from epoch", breakpoint_epoch)
    starting_epoch = breakpoint_epoch

for i in range(starting_epoch, EPOCH_NUM):
    epoch_start_time = time.time()
    s_m.train()
    po_m.train()
    for step, loader_res in tqdm(iter(enumerate(loader))):
        # print(get_now_time())
        t_s = loader_res["T"].to(device)
        k1 = loader_res["K1"].to(device)
        k2 = loader_res["K2"].to(device)
        s1 = loader_res["S1"].to(device)
        s2 = loader_res["S2"].to(device)
        o1 = loader_res["O1"].to(device)
        o2 = loader_res["O2"].to(device)

        ps_1, ps_2, t, t_max, mask = s_m(t_s)

        t, t_max, k1, k2 = t.to(device), t_max.to(
            device), k1.to(device), k2.to(device)
        po_1, po_2 = po_m(t, t_max, k1, k2)

        ps_1 = ps_1.to(device)
        ps_2 = ps_2.to(device)
        po_1 = po_1.to(device)
        po_2 = po_2.to(device)

        s1 = torch.unsqueeze(s1, 2)
        s2 = torch.unsqueeze(s2, 2)

        s1_loss = b_loss(ps_1, s1)
        s1_loss = torch.sum(s1_loss.mul(mask))/torch.sum(mask)
        s2_loss = b_loss(ps_2, s2)
        s2_loss = torch.sum(s2_loss.mul(mask))/torch.sum(mask)

        po_1 = po_1.permute(0, 2, 1)
        po_2 = po_2.permute(0, 2, 1)

        o1_loss = loss(po_1, o1)
        o1_loss = torch.sum(o1_loss.mul(mask[:, :, 0])) / torch.sum(mask)
        o2_loss = loss(po_2, o2)
        o2_loss = torch.sum(o2_loss.mul(mask[:, :, 0])) / torch.sum(mask)

        loss_sum = 2.5 * (s1_loss + s2_loss) + (o1_loss + o2_loss)

        # if step % 500 == 0:
        # 	torch.save(s_m, 'models_real/s_'+str(step)+"epoch_"+str(i)+'.pkl')
        # 	torch.save(po_m, 'models_real/po_'+str(step)+"epoch_"+str(i)+'.pkl')

        optimizer.zero_grad()

        loss_sum.backward()
        optimizer.step()


    torch.save(s_m, 'models_real/s_'+str(i)+'.pkl')
    torch.save(po_m, 'models_real/po_'+str(i)+'.pkl')
    f1, precision, recall = evaluate()

    print("epoch:", i, "loss:", loss_sum.data)
    epoch_end_time = time.time()
    epoch_time_elapsed = epoch_end_time - epoch_start_time
    print("epoch {} used {} seconds (with bsz={})".format(i, epoch_time_elapsed, BATCH_SIZE))
    writer.add_scalar('Loss/train', loss_sum.data, i)
    writer.add_scalar('f1', f1, i)
    writer.add_scalar('precision', precision, i)
    writer.add_scalar('recall', recall, i)

    if f1 >= best_f1:
        best_f1 = f1
        best_epoch = i

    print('f1: %.4f, precision: %.4f, recall: %.4f, bestf1: %.4f, bestepoch: %d \n ' % (
        f1, precision, recall, best_f1, best_epoch))

writer.flush()

continue training from epoch 197


  self.dropout, self.training, self.bidirectional, self.batch_first)
34it [02:36,  4.59s/it]
5001it [01:37, 51.54it/s]


epoch: 197 loss: tensor(0.0262, device='cuda:0')
epoch 197 used 255.26342701911926 seconds (with bsz=5096)
f1: 0.5739, precision: 0.7953, recall: 0.4489, bestf1: 0.5739, bestepoch: 197 
 


34it [02:15,  3.99s/it]
5001it [01:23, 59.85it/s]

epoch: 198 loss: tensor(0.0256, device='cuda:0')
epoch 198 used 222.451429605484 seconds (with bsz=5096)
f1: 0.5829, precision: 0.7959, recall: 0.4598, bestf1: 0.5829, bestepoch: 198 
 



34it [02:15,  3.99s/it]
5001it [01:25, 58.24it/s]

epoch: 199 loss: tensor(0.0276, device='cuda:0')
epoch 199 used 224.8002152442932 seconds (with bsz=5096)
f1: 0.5754, precision: 0.7856, recall: 0.4539, bestf1: 0.5829, bestepoch: 198 
 



34it [02:15,  4.00s/it]
5001it [01:23, 60.18it/s]

epoch: 200 loss: tensor(0.0270, device='cuda:0')
epoch 200 used 222.2125391960144 seconds (with bsz=5096)
f1: 0.5821, precision: 0.7888, recall: 0.4613, bestf1: 0.5829, bestepoch: 198 
 



34it [02:15,  3.98s/it]
2863it [00:47, 64.94it/s]

In [19]:
writer.close()

## Test the trained model on some texts
Extract plain model from Dataparalell

In [None]:
s_m = s_m.module
po_m = po_m.module

In [42]:
to_print = 20
for cnt, d in enumerate(dev_data):
    if cnt > to_print:
        break
    print('Text: ', d['text'])
    print('Predicted SPOs: ', extract_items(d['text']))
    print('Ground Truth SPOs: ', d['spo_list'])

Text:  《步步惊心》改编自著名作家桐华的同名清穿小说《甄嬛传》改编自流潋紫所著的同名小说电视剧《何以笙箫默》改编自顾漫同名小说《花千骨》改编自fresh果果同名小说《裸婚时代》是月影兰析创作的一部情感小说《琅琊榜》是根据海宴同名网络小说改编电视剧《宫锁心玉》，又名《宫》《雪豹》，该剧改编自网络小说《特战先驱》《我是特种兵》由红遍网络的小说《最后一颗子弹留给我》改编电视剧《来不及说我爱你》改编自匪我思存同名小说《来不及说我爱你》
Predicted SPOs:  [('步步惊心', '改编自', '最后一颗子弹留给我'), ('步步惊心', '作者', '顾漫'), ('步步惊心', '作者', '桐华'), ('步步惊心', '改编自', '特战先驱》《我是特种兵》由红遍网络的小说《最后一颗子弹留给我'), ('步步惊心', '改编自', '甄嬛传》改编自流潋紫所著的同名小说电视剧《何以笙箫默》改编自顾漫同名小说《花千骨'), ('步步惊心', '改编自', '花千骨'), ('步步惊心', '改编自', '裸婚时代》是月影兰析创作的一部情感小说《琅琊榜》是根据海宴同名网络小说改编电视剧《宫锁心玉》，又名《宫》《雪豹》，该剧改编自网络小说《特战先驱》《我是特种兵》由红遍网络的小说《最后一颗子弹留给我'), ('步步惊心', '改编自', '何以笙箫默》改编自顾漫同名小说《花千骨')]
Ground Truth SPOs:  [['何以笙箫默', '作者', '顾漫'], ['我是特种兵', '改编自', '最后一颗子弹留给我'], ['步步惊心', '作者', '桐华'], ['甄嬛传', '作者', '流潋紫'], ['花千骨', '作者', 'fresh果果'], ['裸婚时代', '作者', '月影兰析'], ['琅琊榜', '作者', '海宴'], ['雪豹', '改编自', '特战先驱'], ['来不及说我爱你', '改编自', '来不及说我爱你'], ['来不及说我爱你', '作者', '匪我思存']]
Text:  摩尔多瓦共和国（摩尔多瓦语：Republica Moldova，英语：Republic of Moldova），简称摩尔多瓦，是位于东南欧的内陆国，与罗马尼亚和乌克兰接壤，首都基希讷乌
Predicted SPOs:  []
Grou

In [None]:
# visualize the model
writer.add_graph(s_m, torch.from_numpy(x).float())
writer.add_graph(po_m, torch.fr)