In [1]:
import numpy as np
import pandas as pd

import os 
import sys 
module_path = os.path.abspath(os.path.join('..')) 
if module_path not in sys.path: 
    sys.path.append(module_path)

import utils as myutils

# load data

In [2]:
normal_train, normal_test = myutils.get_sentence("../data/train", "../data/test")
# data_test = myutils.get_sentence("../data/test")

In [307]:
# display(normal_test)

In [3]:
transfer_train, transfer_test = myutils.get_sentence("../data/transfer_train", "../data/transfer_test")

In [4]:
char2id, id2char, tag2id, id2tag, transfer_tag2id, transfer_id2tag = myutils.get_transform(normal_train + transfer_train, 
                                                                                       "../data/maps.pkl",
                                                                                       "../data/tag2label.json",
                                                                                       "../data/transfer_tag2label.json")

In [5]:
#字符index 从1开始，这样pad_sequence是默认填充值为0不会冲突，设置embed，mask_zero==True，请参考https://blog.csdn.net/songbinxu/article/details/80150019
char2id = dict((v, i+1) for v, i in char2id.items()) 
# id2char = dict((i+1, v)for i, v in id2char.items())

In [311]:
# id2char = 

In [312]:
# display(char2id)
# print(char2id.get("<UNK>"))
# len(char2id)

In [313]:
# train_data = myutils.preprocess_data(normal_train, char2id, tag2id)
# test_data = myutils.preprocess_data(normal_test, char2id, tag2id)
# transfer_train_data = myutils.preprocess_data(transfer_train, char2id, transfer_tag2id)
# transfer_test_data = myutils.preprocess_data(transfer_test, char2id, transfer_tag2id)

In [6]:
from keras.preprocessing.sequence import pad_sequences
doc_maxlen = 30
UNKNOWN_CHAR = myutils.UNKNOWN_CHAR
def preprocess_data(sentences, char2id, tag2id):
  char_ids = [[char2id[char[0] if char[0] in char2id else UNKNOWN_CHAR] for char in sentence] for sentence in sentences]
  tag_ids = [[tag2id[char[1]] for char in sentence ] for sentence in sentences]
   
  char_ids = pad_sequences(char_ids, doc_maxlen)
  tag_ids = pad_sequences(tag_ids, doc_maxlen, value=-1)
  tag_ids = np.expand_dims(tag_ids, 2) #扩展维度 对应sparse_target = True
  return char_ids, tag_ids



In [7]:
train_X, train_y = preprocess_data(normal_train, char2id, tag2id)
test_X, test_y = preprocess_data(normal_test, char2id, tag2id)
transfer_train_X, transfer_train_y = preprocess_data(transfer_train, char2id, transfer_tag2id)
transfer_test_X, transfer_test_y = preprocess_data(transfer_test, char2id, transfer_tag2id)

In [316]:
# display(test_y)

# preprocess data

# define model

In [8]:
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras_contrib.layers import CRF
from keras_contrib.metrics import crf_accuracy
from keras_contrib.losses import crf_loss
# from keras_contrib.layers import crf
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras.metrics import categorical_accuracy
from keras.models import Model

In [9]:
doc_words_maxlen = doc_maxlen
embed_output_dim = 50
tags_num = len(tag2id)
transfer_tags_num = len(transfer_tag2id)
embed_mask_zero = True
vocab_size = len(char2id) if not embed_mask_zero else len(char2id) + 1

def create_normal_model(tags_num):
    layer_input_char = Input(shape=(doc_words_maxlen, ), name = "input_char")
    print(layer_input_char.name, layer_input_char.shape)
    layer_embed_char = Embedding(input_dim=vocab_size, output_dim=embed_output_dim, name="embed_char", mask_zero=embed_mask_zero)(layer_input_char)
    print(layer_embed_char.name, layer_embed_char.shape)
    layer_bilstm = Bidirectional(LSTM(units=10, return_sequences = True), name="bilstm")(layer_embed_char)
    print(layer_bilstm.name, layer_bilstm.shape)
    # sparse_target=true值label为数字，请参考https://github.com/keras-team/keras-contrib/issues/179
    layer_crf = CRF(tags_num, sparse_target=True, name = "out_crf")(layer_bilstm)
    print(layer_crf.name, layer_crf.shape)
    
    model = Model(inputs = layer_input_char, outputs=[layer_crf])
    model.compile(optimizer="adam", 
                  loss = crf_loss,
                  metrics= [crf_accuracy]
                 )
    return model
    
def create_normal_and_transfer_models(normal_tags_num, transfer_tags_num):
    layer_input_char = Input(shape=(doc_words_maxlen, ), name = "input_char")
    print(layer_input_char.name, layer_input_char.shape)
    layer_embed_char = Embedding(input_dim=vocab_size, output_dim=embed_output_dim, name="embed_char", mask_zero=embed_mask_zero)(layer_input_char)
    print(layer_embed_char.name, layer_embed_char.shape)
    layer_bilstm = Bidirectional(LSTM(units=10, return_sequences = True), name="bilstm")(layer_embed_char)
    print(layer_bilstm.name, layer_bilstm.shape)
    layer_crf_normal = CRF(tags_num, sparse_target=True, name = "out_crf_normal")(layer_bilstm)
    layer_crf_transfer = CRF(transfer_tags_num, sparse_target=True, name = "out_crf_transfer")(layer_bilstm)
    
    normal_model = Model(inputs = layer_input_char, outputs=[layer_crf_normal])
    transfer_model = Model(inputs = layer_input_char, outputs =[layer_crf_transfer])
    normal_model.compile(optimizer="adam", 
                  loss = crf_loss,
                  metrics= [crf_accuracy]
                 )
    transfer_model.compile(optimizer="adam", 
                  loss = crf_loss,
                  metrics= [crf_accuracy]
                 )
    return normal_model, transfer_model


def create_mulittask_model():
#     layer_input_word = Input(shape=(doc_words_maxlen,), name="input_word")
#     layer_embed_word = Embedding(input_dim=vocab_size, output_dim=embed_output_dim, name="embed_word")(layer_input_word)
    layer_input_char = Input(shape=(doc_words_maxlen, ), name = "input_char")
    print(layer_input_char.name, layer_input_char.shape)
    layer_embed_char = Embedding(input_dim=vocab_size, output_dim=embed_output_dim, name="embed_char")(layer_input_char)
    print(layer_embed_char.name, layer_embed_char.shape)
    layer_bilstm = Bidirectional(LSTM(units=20, return_sequences = True))(layer_embed_char)
    print(layer_bilstm.name, layer_bilstm.shape)
    layer_crf1 = CRF(units=crf1_tags_num, sparse_target=True, name = "out_crf1")(layer_bilstm)
    print(layer_crf1.name, layer_crf1.shape)
    layer_crf2 = CRF(units=crf2_tags_num, name = "out_crf2")(layer_bilstm)
    
    model = Model(inputs = layer_input_char, outputs=[layer_crf1, layer_crf2])
    
    
    model.compile(optimizer="adam", 
                  loss={"out_crf1" : "categorical_crossentropy", "out_crf2" : "categorical_crossentropy"},
                  loss_weights={"out_crf1": 1, "out_crf2" : 1},
                  metrics={"out_crf1" : "categorical_accuracy", "out_crf2" : "categorical_accuracy"}

                 )
    return model
  
    
    

In [10]:
# def test_normal_model(normal_model):
#     normal_model.fit(train_X, train_y,
#                 epochs=1,
#                 validation_data=(
#                     test_X,
#                     test_y
#                 ),
#                 verbose=1)
#     normal_model.trainable = False
#     normal_model.compile(optimizer="adam", 
#                   loss = crf_loss,
#                   metrics= [crf_accuracy]
#                  )  
#     normal_model.save("normal_model_trained.h5")

# def test_transfer_model(transfer_model):
#     transfer_model.save("transer_model.h5")
#     pass
    
                
                  
# test_normal_and_transfer_modes()   
    

In [10]:
normal_model = create_normal_model(tags_num)    
normal_model.summary()  
normal_model.save("normal_model.h5") 


input_char:0 (?, 30)
Instructions for updating:
Colocations handled automatically by placer.
embed_char/embedding_lookup/Identity:0 (?, 30, 50)
bilstm/concat:0 (?, ?, 20)
out_crf/cond/Merge:0 (?, ?, 7)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_char (InputLayer)      (None, 30)                0         
_________________________________________________________________
embed_char (Embedding)       (None, 30, 50)            253700    
_________________________________________________________________
bilstm (Bidirectional)       (None, 30, 20)            4880      
_________________________________________________________________
out_crf (CRF)                (None, 30, 7)             210       
Total params: 258,790
Trainable params: 258,790
Non-trainable params: 0
_________________________________________________________________


# train model

In [11]:
normal_model.fit(train_X, train_y,
                epochs=1,
                validation_data=(
                    test_X,
                    test_y
                ),
                verbose=1)

normal_model.save("normal_model_trained.h5") 

Instructions for updating:
Use tf.cast instead.
Train on 50658 samples, validate on 4631 samples
Epoch 1/1


In [324]:
# normal_model.fit(x=train, 
#           y={"out_crf1" : [],  "out_crf2" :[]},
#           validation_data = (
#               test, 
#               {"out_crf1" : [],  "out_crf2" :[]}
#           ),
#           epochs= 5,
#           verbose=1,
#           batch_size=64 
#          )

# predict 

In [12]:
pred_y = normal_model.predict(test_X)

In [326]:
# display(pred_y)

In [13]:
# pred_y.shape
# pred_y_results=[]
pred_y_results = [[np.argmax(pred_y[i][j]) for j in range(pred_y.shape[1])] for i in range(pred_y.shape[0])]

In [15]:
# true_y_results = [[np.argmax(test_y[i][j]) for j in range(test_y.shape[1])] for i in range(test_y.shape[0])]
true_y_results = test_y

In [16]:
pred_y_results = np.array(pred_y_results)
true_y_results = np.array(true_y_results)



In [17]:
pred_y_results = pred_y_results.reshape(-1, )
true_y_results = true_y_results.reshape(-1, )

In [17]:
# display(np.max(pred_y_results))
# print(true_y_results.shape)

6

In [332]:
# display(tag2id)

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-LOC': 3,
 'I-LOC': 4,
 'B-ORG': 5,
 'I-ORG': 6}

In [18]:
from collections import defaultdict
from collections import namedtuple

In [334]:
# TagScore = namedtuple("TagScore", ["tag", "id", "count_tp", "count_true", "count_pred"])

In [335]:
# s1 = TagScore("tag", 11, tp=0, fp=0, precision=0.0, recall=0.0, f1=0.0) 

In [19]:
def caculate_precision_recall_f1(tag2id, id2tag, pred_ids, true_ids):
    scores = {}
    for tag, ID in tag2id.items():
      scores[ID] = {"tag" : tag,
                    "id" : ID,
                    "count_tp" : 0,
                    "count_true" : 0,
                    "count_pred" : 0
                   }
    for i in range(len(pred_ids)):
        ID = pred_ids[i]
        if ID < 0:
            continue
        scores[ID]["count_pred"] += 1
        
    for i in range(len(true_ids)):
        ID = true_ids[i]
        if ID < 0:
            continue
        scores[ID]["count_true"] += 1
        if pred_ids[i] == true_ids[i]:
            scores[ID]["count_tp"] += 1       
    
#     for tag, ID in tag2id.items():
#         for i in range(len(pred_ids)):
#             pred_id = pred_ids[i]
#             true_id = true_ids[i]
#             score = scores[tag]
#             print("ID=%d"%ID, "pred_id=%d"%pred_id, "true_id=%d"%true_id)
#             if ID == pred_id:
#                 score["count_pred"] = score["count_pred"] + 1
#                 if pred_id == true_id:
#                     score["count_tp"] = score["count_tp"] + 1
#             elif ID == true_id:
#                 score["count_true"] = score["count_true"] + 1
#                 if pred_id == true_id:
#                     score["count_tp"] = score["count_tp"] + 1
    return scores

scores = caculate_precision_recall_f1(tag2id, id2tag, pred_y_results, true_y_results)         

In [20]:
# display(scores)
def get_results(scores):
    count_tp_total = 0
    for k in scores.keys():
        score = scores[k]
        count_tp_total += score["count_tp"]
        score["precision"] = score["count_tp"] / score["count_pred"]
        score["recall"] = score["count_tp"] / score["count_true"]
        score["f1"] =  2 * score["precision"] * score["recall"] / (score["precision"] + score["recall"])
#     score["total"] = {
#         "accuracy" : count_tp_total / total_num
#     }
    return scores
results = get_results(scores)
display(results)

{0: {'tag': 'O',
  'id': 0,
  'count_tp': 100204,
  'count_true': 101469,
  'count_pred': 130042,
  'precision': 0.7705510527368081,
  'recall': 0.9875331381998443,
  'f1': 0.8656521720350221},
 1: {'tag': 'B-PER',
  'id': 1,
  'count_tp': 322,
  'count_true': 906,
  'count_pred': 405,
  'precision': 0.7950617283950617,
  'recall': 0.3554083885209713,
  'f1': 0.49122807017543857},
 2: {'tag': 'I-PER',
  'id': 2,
  'count_tp': 988,
  'count_true': 1879,
  'count_pred': 1291,
  'precision': 0.7652982184353214,
  'recall': 0.5258116019159127,
  'f1': 0.6233438485804417},
 3: {'tag': 'B-LOC',
  'id': 3,
  'count_tp': 1079,
  'count_true': 1726,
  'count_pred': 1388,
  'precision': 0.7773775216138329,
  'recall': 0.6251448435689455,
  'f1': 0.692999357739242},
 4: {'tag': 'I-LOC',
  'id': 4,
  'count_tp': 1529,
  'count_true': 2652,
  'count_pred': 2237,
  'precision': 0.683504693786321,
  'recall': 0.5765460030165912,
  'f1': 0.6254857844139906},
 5: {'tag': 'B-ORG',
  'id': 5,
  'count_tp

# transfer learning

In [21]:
from keras.models import Sequential
from keras.utils import plot_model
from keras.layers import Wrapper
from keras.layers import Bidirectional
import keras
print(keras.__version__)

2.2.4


In [28]:
def print_model(model, name):
    print("print-------", model)
    for layer in model.layers:
        print("-", layer, layer.trainable)
        if  isinstance(layer, Wrapper):
            print("-",layer.forward_layer, layer.forward_layer.trainable)
            print("-",layer.backward_layer, layer.backward_layer.trainable)
#     plot_model(model)

def freeze_layer(layer):
    print("freeze------", layer)
    layer.trainable = False
    #修复keras 2.2.4版本的bug，Wrapper嵌套的layer也需要设置trainable 
    if  isinstance(layer, Wrapper):
        print ("-wrapper", layer)
        layer.forward_layer.trainable = False
        layer.backward_layer.trainable = False
    if hasattr(layer, "layers"):
        for l in layer.layers:
            freez_layer(l)
            
def freeze_model(model):      
    for layer in model.layers:
        freeze_layer(layer)

def create_transfer_model(base_model : Model, tags_num):
    print_model(base_model, "base_model")
    layer_input = base_model.inputs
    layer_lstm = base_model.layers[-2]
    base_model.trainable = True
    layer_crf = CRF(units = tags_num, sparse_target = True, name="out_crf_transfer")
    layer_crf_output = layer_crf(layer_lstm.output)
    model = Model(inputs = layer_input, outputs = [layer_crf_output])
    
    freeze_model(model)
    layer_crf.trainable = True
    print("-layer_crf", layer_crf, layer_crf.trainable)
    print("-layer_crf_output", layer_crf_output)
#     model.layers[-1].trainable = True
#     print("-model.layer[-1]", model.layers[-1])
    
    
    model.compile(optimizer="adam", 
                  loss = crf_loss,
                  metrics= [crf_accuracy]
                 )                                                                                    
    model.summary() 
    
    print_model(model, "transfer_model")
    return model

transfer_model = create_transfer_model(normal_model, transfer_tags_num)

transfer_model.save("transfer_model.h5")
# normal_model.save("normal_model_transfered.h5")

print------- <keras.engine.training.Model object at 0x153134a58>
- <keras.engine.input_layer.InputLayer object at 0x1516626d8> False
- <keras.layers.embeddings.Embedding object at 0x1516628d0> False
- <keras.layers.wrappers.Bidirectional object at 0x152eec400> False
- <keras.layers.recurrent.LSTM object at 0x152eec080> False
- <keras.layers.recurrent.LSTM object at 0x152eec550> False
- <keras_contrib.layers.crf.CRF object at 0x152eec7b8> True
freeze------ <keras.engine.input_layer.InputLayer object at 0x1516626d8>
freeze------ <keras.layers.embeddings.Embedding object at 0x1516628d0>
freeze------ <keras.layers.wrappers.Bidirectional object at 0x152eec400>
-wrapper <keras.layers.wrappers.Bidirectional object at 0x152eec400>
freeze------ <keras_contrib.layers.crf.CRF object at 0x163920f98>
-layer_crf <keras_contrib.layers.crf.CRF object at 0x163920f98> True
-layer_crf_output Tensor("out_crf_transfer_4/cond/Merge:0", shape=(?, ?, 13), dtype=float32)
_______________________________________

In [24]:
transfer_model.fit(transfer_train_X, transfer_train_y, verbose=1, epochs=5, validation_data=(
    transfer_test_X, transfer_test_y
))

transfer_model.save("transfer_model_fited.h5")

Train on 9445 samples, validate on 1036 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
