In [10]:
import time
import sys
import argparse
import random
import copy
import torch
import gc
import pickle
import os
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from dataCDN import Data


In [12]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--embedding',  help='Embedding for words', default='None')
    parser.add_argument('--status', choices=['train', 'test'], help='update algorithm', default='train')
    parser.add_argument('--savedset', help='Dir of saved data setting', default="../data/CHIP-CDN/lettics/save_CHIP-CDN.dset")
    parser.add_argument('--train', default="../data/CHIP-CDN/CHIP-CDN_train.json")
    parser.add_argument('--dev', default="../data/CHIP-CDN/CHIP-CDN_dev.json" )
    parser.add_argument('--test', default="../data/CHIP-CDN/CHIP-CDN_test.json")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--extendalphabet', default="True")
    parser.add_argument('--seed',default=1023,type=int)

    args = parser.parse_args(args=[])

    def set_seed(seed_num=1023):
        random.seed(seed_num)
        torch.manual_seed(seed_num)
        np.random.seed(seed_num)
    
    seed_num = args.seed
    set_seed(seed_num)

    train_file = args.train
    dev_file = args.dev
    test_file = args.test

    if args.seg.lower() == "true":
        seg = True
    else:
        seg = False
    status = args.status.lower()

    save_data_name = args.savedset
    gpu = torch.cuda.is_available()

    char_emb = "gigaword_chn.all.a2b.uni.ite50.vec"
    gaz_file = "ctb.50d.vec"

    sys.stdout.flush()

In [13]:
data = Data()

In [14]:
def data_initialization(data, gaz_file, train_file, dev_file, test_file):
    
    data.build_alphabet_CDN(dev_file)
    # data.build_alphabet_CDN(test_file)
    data.build_alphabet_CDN(train_file) #utils/data.py
    
    data.build_gaz_file(gaz_file)   
    data.build_gaz_alphabet_CDN(dev_file,count=True)
    # data.build_gaz_alphabet_CDN(test_file,count=True)
    data.build_gaz_alphabet_CDN(train_file,count=True) 
    
    data.fix_alphabet()
    return data
data_initialization(data, gaz_file, train_file, dev_file, test_file)

../data/CHIP-CDN/CHIP-CDN_dev.json
1288
char_alphabet_size: 1288
../data/CHIP-CDN/CHIP-CDN_train.json
1686
char_alphabet_size: 1686
gaz_file: ctb.50d.vec
Load gaz file:  ctb.50d.vec  total size: 704368
../data/CHIP-CDN/CHIP-CDN_dev.json
gaz alphabet size: 5337
../data/CHIP-CDN/CHIP-CDN_train.json
gaz alphabet size: 8956


<dataCDN.Data at 0x7faf587299f0>

In [15]:
def data_initialization(data, gaz_file, train_file, dev_file, test_file):
    data.build_alphabet_CDN(train_file) #utils/data.py
    data.build_alphabet_CDN(dev_file)
    # data.build_alphabet_CDN(test_file)
    
    data.build_gaz_file(gaz_file) 
    data.build_gaz_alphabet_CDN(train_file,count=True) 
    data.build_gaz_alphabet_CDN(dev_file,count=True)
    # data.build_gaz_alphabet_CDN(test_file,count=True)
    data.fix_alphabet()
    return data
data_initialization(data, gaz_file, train_file, dev_file, test_file)

../data/CHIP-CDN/CHIP-CDN_train.json
1686
char_alphabet_size: 1686
../data/CHIP-CDN/CHIP-CDN_dev.json
1686
char_alphabet_size: 1686
gaz_file: ctb.50d.vec
Load gaz file:  ctb.50d.vec  total size: 704368
../data/CHIP-CDN/CHIP-CDN_train.json
gaz alphabet size: 8956
../data/CHIP-CDN/CHIP-CDN_dev.json
gaz alphabet size: 8956


<dataCDN.Data at 0x7faf587299f0>

In [16]:
data.build_word_pretrain_emb(char_emb)
data.build_gaz_pretrain_emb(gaz_file)
print(char_emb) # "data/gigaword_chn.all.a2b.uni.ite50.vec"
print(gaz_file)  #'data/ctb.50d.vec'

build word pretrain emb...
Embedding:
     pretrain word:11327, prefect match:1673, case_match:0, oov:12, oov%:0.0071174377224199285
build gaz pretrain emb...
Embedding:
     pretrain word:704368, prefect match:8954, case_match:0, oov:1, oov%:0.00011165698972755694
gigaword_chn.all.a2b.uni.ite50.vec
ctb.50d.vec


In [6]:
print('Dumping data')
with open(save_data_name, 'wb') as f:
    pickle.dump(data, f)
set_seed(seed_num)

Dumping data


In [7]:
if os.path.exists(save_data_name):
    print('Loading processed data')
    with open(save_data_name, 'rb') as fp:
        data = pickle.load(fp)
print(type(data))

Loading processed data
<class 'utils1.dataCDN.Data'>


In [8]:
print(data.char_alphabet)
print(data.char_alphabet.instance2index.items())

<utils1.alphabet.Alphabet object at 0x7fb90c2785e0>
dict_items([('</unk>', 1), ('左', 2), ('膝', 3), ('退', 4), ('变', 5), ('伴', 6), ('游', 7), ('离', 8), ('体', 9), ('骨', 10), ('关', 11), ('节', 12), ('病', 13), ('#', 14), ('糖', 15), ('尿', 16), ('反', 17), ('复', 18), ('低', 19), ('血', 20), (';', 21), ('质', 22), ('疏', 23), ('松', 24), ('高', 25), ('压', 26), ('冠', 27), ('心', 28), ('不', 29), ('稳', 30), ('定', 31), ('绞', 32), ('痛', 33), ('性', 34), ('症', 35), ('状', 36), ('动', 37), ('脉', 38), ('粥', 39), ('样', 40), ('硬', 41), ('化', 42), ('脏', 43), ('右', 44), ('乳', 45), ('腺', 46), ('癌', 47), ('I', 48), ('V', 49), ('期', 50), ('恶', 51), ('肿', 52), ('瘤', 53), ('头', 54), ('晕', 55), ('骶', 56), ('裂', 57), ('半', 58), ('大', 59), ('便', 60), ('失', 61), ('控', 62), ('椎', 63), ('板', 64), ('禁', 65), ('小', 66), ('腿', 67), ('皮', 68), ('肤', 69), ('软', 70), ('组', 71), ('织', 72), ('损', 73), ('伤', 74), ('下', 75), ('肢', 76), ('未', 77), ('特', 78), ('指', 79), ('的', 80), (',', 81), ('水', 82), ('平', 83), ('外', 84), ('阴', 85), ('上',

In [9]:
print(data.gaz)

<utils1.gazetteer.Gazetteer object at 0x7fb90c2786a0>


In [10]:
print(data.gaz_count)

{2: 75, 3: 232, 4: 48, 5: 49, 6: 122, 7: 3, 8: 680, 9: 31, 10: 0, 11: 30, 12: 1, 13: 46, 14: 145, 15: 2, 16: 190, 17: 223, 18: 1, 19: 206, 20: 33, 21: 261, 22: 16039, 23: 179, 24: 0, 25: 11, 26: 351, 27: 0, 28: 1, 29: 28, 30: 16, 31: 5, 32: 6, 33: 28, 34: 106, 35: 18, 36: 12, 37: 2, 38: 192, 39: 172, 40: 2, 41: 82, 42: 0, 43: 14, 44: 1, 45: 68, 46: 266, 47: 6, 48: 65, 49: 5, 50: 15, 51: 100, 52: 6, 53: 12, 54: 4, 55: 70, 56: 22, 57: 15, 58: 249, 59: 0, 60: 0, 61: 3, 62: 0, 63: 48, 64: 0, 65: 0, 66: 0, 67: 12, 68: 99, 69: 384, 70: 8, 71: 178, 72: 224, 73: 218, 74: 55, 75: 753, 76: 6, 77: 2, 78: 203, 79: 0, 80: 86, 81: 118, 82: 131, 83: 13, 84: 12, 85: 320, 86: 194, 87: 84, 88: 0, 89: 0, 90: 9, 91: 0, 92: 0, 93: 505, 94: 59, 95: 386, 96: 89, 97: 688, 98: 81, 99: 779, 100: 67, 101: 41, 102: 28, 103: 23, 104: 110, 105: 254, 106: 1891, 107: 8, 108: 3076, 109: 51, 110: 179, 111: 26, 112: 12, 113: 53, 114: 7, 115: 46, 116: 1, 117: 111, 118: 94, 119: 65, 120: 39, 121: 104, 122: 0, 123: 1, 124: