In [1]:
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Lambda, Bidirectional, LSTM, Dense
from keras_bert import load_trained_model_from_checkpoint
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy
from keras_bert import Tokenizer
from keras_bert import AdamWarmup, calc_train_steps
from datetime import datetime
from collections import Counter
import Named_Entity_Recognition as ner
import keras.callbacks
import re
import codecs
import time
import os
import codecs
import pandas as pd
import numpy as np

Using TensorFlow backend.


# 0. CKIP

In [2]:
from ckiptagger import WS, POS, NER

# Directory handed to every CKIP tagger constructor below
# (presumably the CKIP model data folder - confirm on the target machine).
ckip_path = r'./Python - deep learning/Deep learning/keras 大神/data'

# Build the three CKIP taggers, in pipeline order, from the same directory:
#   ws   - word segmentation
#   pos  - part-of-speech tagging
#   ners - named-entity recognition
ws, pos, ners = (
    tagger_cls(ckip_path, disable_cuda=False)
    for tagger_cls in (WS, POS, NER)
)

In [3]:
# Sample financial-news text that the next cell feeds through the CKIP pipeline.
content = '台積電（2330）緊急宣布漲價，引發市場高度關注 ，，美系外資出具報告指出，台積電漲價恐進一步壓縮IC設計客戶獲利表現，點名聯發科（2454）、群聯（8299）、神盾（6462）3家台廠明年毛利率可能受到擠壓。'

In [4]:
# Characters passed to the segmenter as sentence delimiters.
sentence_delimiters = {'?', '？', '!', '！', '。', ',', '，', ';', ':', '、'}

# Stage 1: word segmentation of the single sample document.
word_sentence_list = ws(
    [content],
    sentence_segmentation=True,
    segment_delimiter_set=sentence_delimiters,
)

# Stage 2: part-of-speech tags for the segmented words.
pos_sentence_list = pos(word_sentence_list)

# Stage 3: named entities, computed from the words together with their POS tags.
entity_sentence_list = ners(word_sentence_list, pos_sentence_list)

In [5]:
# Display the entities CKIP recognized in the sample text.
entity_sentence_list

[{(0, 3, 'ORG', '台積電'),
  (4, 8, 'CARDINAL', '2330'),
  (27, 29, 'GPE', '美系'),
  (38, 41, 'ORG', '台積電'),
  (62, 65, 'ORG', '聯發科'),
  (66, 70, 'CARDINAL', '2454'),
  (72, 74, 'ORG', '群聯'),
  (84, 88, 'CARDINAL', '6462'),
  (89, 90, 'CARDINAL', '3'),
  (93, 95, 'DATE', '明年')}]

# 1. 建立 NER 模型

In [6]:
# Location of the pretrained BERT files. The BERT_DIR environment variable
# overrides the machine-specific default below, so the notebook can run on
# another machine without editing this cell; when the variable is unset the
# original path is used unchanged.
bert_dir = os.environ.get(
    'BERT_DIR',
    r'C:\Users\rocker\Python - deep learning\Deep learning\keras 大神\bert',
)
# Files inside bert_dir that the later cells consume.
config_path = os.path.join(bert_dir, 'bert_config.json')
checkpoint_path = os.path.join(bert_dir, 'bert_model.ckpt')
dict_path = os.path.join(bert_dir, 'vocab.txt')

In [7]:
# Build the token dictionary from the BERT vocabulary file through the project's
# `ner` helper module, then wrap it in a keras-bert Tokenizer.
# NOTE(review): presumably token_dict maps token -> id; confirm in
# Named_Entity_Recognition.create_tokenizer.
token_dict = ner.create_tokenizer(dict_path)
tokenizer = Tokenizer(token_dict)

In [8]:
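# Disabled one-off preprocessing, kept for reference: it appears to be the step that
# wrote 'financial_news_ner.csv'. Re-enabling it requires `train_ner` and
# `transfer_NER` to be defined first.
#org_list, label = transfer_NER(train_ner, tokenizer, maxlen=maxlen_ner)
#train_ner['org_list'] = org_list
#train_ner.to_csv('financial_news_ner.csv', encoding="utf_8_sig", index=False)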

## 1-1. load data

In [9]:
# Load the annotated financial-news dataset, decoding the file as Big5-HKSCS.
# NOTE(review): the commented-out export in the previous cell writes this file
# as utf_8_sig, not big5hkscs - confirm which encoding the file on disk uses.
news = pd.read_csv(r'./financial_news_ner.csv',encoding='big5hkscs')

In [10]:
# Preview the first rows to sanity-check the columns and the decoded text.
news.head()

Unnamed: 0,content,news_id,org_list
0,德微近期有價證券近期多次達公佈注意交易資訊標準，被櫃買中心要求公布自結損益，7月稅後淨利03...,6,"[(210, 214, 'ORG', '亞昕晶片')]"
1,中鋼集團中鴻每股分配03元現金股利，今日除息，早盤買盤強拉，股價一度漲停4815元，不但大幅...,8,"[(1, 4, 'ORG', 'CLS'), (5, 9, 'ORG', '中鋼集團'), ..."
2,證券分析師黃勇文表示，台股26日早盤受到台積電漲價效應激勵，惟後續卻隨著權值股漲幅收斂，低點...,9,"[(25, 28, 'ORG', '台積電'), (92, 96, 'ORG', '南韓央行..."
3,「情歌天后」梁靜茹疫情期間沒停下腳步，不但推出新作《時光隨想•三日思》，也努力精進廚藝，舉凡...,12,"[(155, 161, 'ORG', '金曲獎評審團'), (222, 225, 'ORG'..."
4,長興自結7月稅前盈餘373億元，年成長1101％，每股稅前盈餘03元；前7月稅前盈餘2785...,24,"[(5, 7, 'ORG', '長興')]"


In [11]:
import ast

# The 'org_list' column holds each row's entity annotations as the text of a
# Python literal, e.g. "[(5, 7, 'ORG', '長興')]". ast.literal_eval parses such
# literals only, whereas eval() would execute any expression found in the CSV.
org_list = list(news['org_list'].apply(ast.literal_eval))

In [12]:
# Fixed sequence length used for encoding the samples and for building the BERT model.
maxlen_ner = 512
input_shape = (maxlen_ner, )
# Training targets built from the news rows and their entity annotations.
# NOTE(review): presumably one tag id per token position - confirm the exact
# shape and tag scheme in ner.transfer_NER_array.
label = ner.transfer_NER_array(news, org_list, tokenizer, maxlen=maxlen_ner)
# The three arrays later passed to ner_model.fit() as model inputs.
# NOTE(review): how mask_input is derived is defined in ner.encoded - confirm there.
input_id, segment_id, mask_input = ner.encoded(tokenizer, news, maxlen=maxlen_ner)

In [13]:
def bert_BiLSTM_CRF_model():
    """Assemble a BERT -> BiLSTM -> CRF sequence tagger.

    Loads the pretrained BERT checkpoint named by the module-level
    ``config_path`` / ``checkpoint_path`` with ``seq_len=maxlen_ner``, taps the
    output of the layer at index -9, and stacks a bidirectional LSTM (128 units
    per direction) followed by a 3-tag CRF layer on top of it.

    All layers are frozen except the last two of the assembled model (the
    BiLSTM and the CRF).

    Returns:
        An uncompiled Keras ``Model`` that takes the loaded BERT model's inputs
        and outputs the CRF layer's result.
    """
    bert = load_trained_model_from_checkpoint(
        config_path,
        checkpoint_path,
        training=True,
        seq_len=maxlen_ner,
    )

    # NOTE(review): index -9 is presumably the last encoder output, ahead of the
    # pretraining heads added by training=True - confirm against bert.summary().
    encoder_output = bert.layers[-9].output

    # Keep the first input_shape[0] positions along the sequence axis.
    features = Lambda(lambda x: x[:, 0: input_shape[0]])(encoder_output)
    features = Bidirectional(LSTM(128, return_sequences=True))(features)
    # Disabled optional per-step projection:
    #features = TimeDistributed(Dense(len(y_token_dict), activation='relu'))(features)
    crf_tags = CRF(3, sparse_target = True)(features)

    tagger = Model(bert.input, crf_tags)

    # Freeze the BERT body; leave only the BiLSTM and CRF head trainable.
    for frozen_layer in tagger.layers[:-2]:
        frozen_layer.trainable = False
    for head_layer in tagger.layers[-2:]:
        head_layer.trainable = True

    return tagger

In [14]:
# Build the BERT + BiLSTM + CRF tagger (this loads the BERT checkpoint).
ner_model = bert_BiLSTM_CRF_model()

In [15]:
# Print the layer-by-layer architecture of the assembled model.
ner_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 512)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

In [16]:
# Fine-tuning policy: freeze every layer listed before
# 'Encoder-12-MultiHeadSelfAttention', and train that layer plus every layer
# listed after it. This overwrites the trainable flags that
# bert_BiLSTM_CRF_model() set when the model was built.
set_trainable = False
for layer in ner_model.layers:
    # Latches to True at the first layer to fine-tune and stays True afterwards.
    set_trainable = set_trainable or layer.name == 'Encoder-12-MultiHeadSelfAttention'
    layer.trainable = set_trainable

In [17]:
# Training hyper-parameters shared by the learning-rate schedule and fit().
# Samples per batch:
batch_size = 8
# Maximum number of epochs (the early-stopping callback may end training sooner):
epochs = 5

In [18]:
# Size the warm-up/decay schedule from the samples that are actually trained on.
# fit() is called with validation_split=0.1, so only the remaining 90% of the
# rows produce optimizer steps (793 of 882 in the recorded run). Counting every
# row in `news` would plan for more steps than are ever taken, so the decay
# toward min_lr would never complete.
validation_split = 0.1  # keep equal to the value passed to ner_model.fit()
# Same rounding Keras applies when it splits off the validation tail.
num_train_examples = int(news.shape[0] * (1. - validation_split))

total_steps, warmup_steps = calc_train_steps(
    num_example=num_train_examples,
    batch_size=batch_size,
    epochs=epochs,
    warmup_proportion=0.1,
)

# Adam with linear warm-up to `lr`, then decay toward `min_lr` over total_steps.
optimizer = AdamWarmup(total_steps, warmup_steps, lr=1e-3, min_lr=1e-5)

In [19]:
# Callbacks passed to fit(): stop early when the validation CRF accuracy
# stops improving, with a patience of one epoch.
callback_list = [
                 keras.callbacks.EarlyStopping(monitor='val_crf_accuracy', patience=1)
                 # Disabled: checkpoint the best model by validation loss.
                 #,ModelCheckpoint(filepath='AML_bert.h5', monitor='val_loss', save_best_only=True)
                ]

In [20]:
# Compile with the CRF-specific loss and accuracy metric from keras_contrib.
ner_model.compile(optimizer=optimizer, loss=crf_loss, metrics=[crf_accuracy])

# Train on the encoded news; 10% of the samples are held out for validation,
# and the validation accuracy drives the early-stopping callback.
ner_model.fit(
    x=[input_id, segment_id, mask_input],
    y=label,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callback_list,
)

Train on 793 samples, validate on 89 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.callbacks.callbacks.History at 0x13a24e58f08>

In [21]:
# Save the trained weights to 'financial_news_model.h5'.
ner_model.save_weights('financial_news_model.h5')

In [22]:
# 3. Predict company names with the NER model
# NOTE(review): this cell rebinds input_id, segment_id, mask_input and org_list,
# which earlier cells use for the training data; re-running the training cells
# after this one would pick up these single-sample values instead.
text = '台積電（2330）緊急宣布漲價，引發市場高度關注，美系外資出具報告指出，台積電漲價恐進一步壓縮IC設計客戶獲利表現，點名聯發科（2454）、群聯（8299）、神盾（6462）3家台廠明年毛利率可能受到擠壓。'

# Encode the text to the model's fixed sequence length.
input_id, segment_id = tokenizer.encode(text, max_len=maxlen_ner)
mask_input = list(map(ner.transfer, input_id))

# Wrap each input in a list to form a batch containing one sample.
single_sample_batch = [[input_id], [segment_id], [mask_input]]
prediction = ner_model.predict(single_sample_batch)

# Pick the highest-scoring tag at every position, then let the project helper
# turn the tagged positions back into names.
y_pred = np.argmax(prediction, axis=-1)
org_list = ner.get_name(token_dict, [input_id], y_pred)

In [23]:
# Display the names the model extracted from the sample text.
org_list

[['聯發科', '台積電', '群聯', '台廠', '神盾']]