In [1]:
import keras
from keras.layers import Input, BatchNormalization, LSTM, GRU, Lambda, TimeDistributed, Conv1D, Embedding, Dense, Softmax, Add, Multiply, Concatenate, Permute, Dropout, Activation, CuDNNGRU, CuDNNLSTM, Bidirectional, Reshape
from keras.models import Model, Sequential
import keras.backend as K
import numpy as np
from numpy.random import randn
import tensorflow as tf

from glob import glob

import os
import json
import time

Using TensorFlow backend.


In [2]:
# TensorFlow session configuration
config = tf.ConfigProto()

# NOTE(review): allow_growth is set to False, so on-demand GPU memory growth
# is NOT requested here -- confirm that this is the intended setting.
config.gpu_options.allow_growth = False

# The per-process GPU memory fraction is set to 1.0 (the whole device), not a
# partial share.
config.gpu_options.per_process_gpu_memory_fraction = 1.0

# Create a session with the above options and install it as the Keras backend session.
K.tensorflow_backend.set_session(tf.Session(config=config))

In [3]:
class Dictionary:
    """Character-level vocabulary mapping each character to an integer id.

    Id 0 is reserved for padding and id 1 for unknown characters; every other
    character gets the next free id (starting at 2) the first time it is added.

    Attributes:
        num_chars: total number of ids in use, including the two reserved ones.
        dict: mapping from character to id.
    """

    def __init__(self):
        # 0 reserved for padding char
        # 1 reserved for unknown char
        self.num_chars = 2
        self.dict = {}

    def add_string(self, string):
        """Register every character of `string` in the vocabulary."""
        for c in string:
            self.add_char(c)

    def add_char(self, c):
        """Assign the next free id to `c` unless it is already known."""
        if c not in self.dict:
            self.dict[c] = self.num_chars
            self.num_chars += 1

    def find_char(self, c):
        """Return the id of `c`, or 1 (the unknown id) when `c` was never added."""
        return self.dict.get(c, 1)

    def write_metadata(self, filename='metadata.tsv'):
        """Write one label per id, in id order, to `filename`.

        The first two lines are 'PAD' and 'UNK'; every following line is the
        repr() of a character with the surrounding quotes stripped, so that
        whitespace/control characters stay visible and occupy a single line.
        The file is written as UTF-8 regardless of the platform locale.
        """
        rest = [name for idx, name in sorted((idx, name) for name, idx in self.dict.items())]
        with open(filename, 'w', encoding='utf-8') as f:
            f.write('PAD\n')
            f.write('UNK\n')
            for s in rest:
                f.write(repr(s)[1:-1])
                f.write('\n')

    # output shape: (len(s)) or (seq_len)
    def prepare_input_sequence(self, s, seq_len=None):
        """Encode string `s` as a 1-D int64 array of character ids.

        Args:
            s: the string to encode.
            seq_len: when None the result has len(s) entries; otherwise the
                result has exactly seq_len entries, truncated or right-padded
                with 0 (the padding id).

        Returns:
            1-D numpy int64 array (also for an empty string).
        """
        # Explicit dtype: without it an empty string would yield a float64 array.
        ids = np.array([self.find_char(c) for c in s], dtype=np.int64)
        if seq_len is None:
            return ids
        t = np.zeros([seq_len], dtype=np.int64)
        n = min(len(ids), seq_len)
        t[:n] = ids[:n]
        return t

    # output shape: (seq_len)
    def prepare_tag_sequence(self, seq_len, entity_beg, entity_len):
        """Return a float array of length seq_len that is 1.0 on
        [entity_beg, entity_beg + entity_len) and 0.0 elsewhere.

        Standard slice semantics apply, so a span reaching past seq_len is clipped.
        """
        t = np.zeros(seq_len)
        t[entity_beg:entity_beg+entity_len] = 1.0
        return t

    # output type: (in, targ), in.size() = targ.size() = (len(s))
    def prepare_example(self, s, entity_beg, entity_len):
        """Encode a single unpadded example; returns (ids, tags), both of length len(s)."""
        inputs = self.prepare_input_sequence(s)
        targets = self.prepare_tag_sequence(len(s), entity_beg, entity_len)
        return inputs, targets

    # input type:  array of [title, entity_beg, entity_len]
    # output type: (in, targ), in.shape = targ.shape = (len(records), seq_len)
    def prepare_examples(self, records, fixlen=None):
        """Encode a batch of records into padded input and target arrays.

        Args:
            records: sequence of [title, entity_beg, entity_len].
            fixlen: fixed sequence length; when None, the longest title in the
                batch determines the length (0 for an empty batch).

        Returns:
            (inputs, targets): int64 ids and float32 tags, both of shape
            (len(records), seq_len).
        """
        if fixlen is None:
            # default=0 keeps an empty batch from raising inside max().
            seq_len = max((len(r[0]) for r in records), default=0)
        else:
            seq_len = fixlen

        batch_size = len(records)
        input_tensor = np.zeros([batch_size, seq_len], dtype=np.int64)
        target_tensor = np.zeros([batch_size, seq_len], dtype='float32')
        for i, r in enumerate(records):
            input_tensor[i, :] = self.prepare_input_sequence(r[0], seq_len)
            target_tensor[i, :] = self.prepare_tag_sequence(seq_len, r[1], r[2])
        return input_tensor, target_tensor

    def save(self, file):
        """Serialize the vocabulary to `file` as JSON ({'n': num_chars, 'd': dict})."""
        with open(file, 'w', encoding='utf-8') as f:
            json.dump({ 'n': self.num_chars, 'd': self.dict }, f)

    def load(self, file):
        """Replace the current vocabulary with the one stored by save() in `file`."""
        with open(file, encoding='utf-8') as f:
            obj = json.load(f)
        self.num_chars = obj['n']
        self.dict = obj['d']

In [101]:
import pandas as pd

# Load the corpus CSV into a DataFrame; later cells read its 'title', 'begin',
# 'length' and 'entity' columns.
corpus = pd.read_csv("corpus/company-news-ext-freq.csv", low_memory=True)
# NOTE(review): UNROLL_SIZE is only referenced from commented-out code in the
# visible cells.
UNROLL_SIZE = 100
# Summary statistics of the title lengths (in characters).
corpus.title.str.len().describe()

count    4.245927e+06
mean     2.593703e+01
std      7.388736e+00
min      4.000000e+00
25%      2.100000e+01
50%      2.500000e+01
75%      3.000000e+01
max      1.090000e+02
Name: title, dtype: float64

In [102]:
# Build the character vocabulary, or reuse the copy cached in 'vocab.db'.
# NOTE(review): the cache is keyed only on the file's existence, so a
# 'vocab.db' built from a different corpus would be reused as-is.
vocab = Dictionary()
if os.path.exists('vocab.db'):
    vocab.load('vocab.db')
else:
    # First run: scan every title once and persist the result.
    for news_title in corpus['title']:
        vocab.add_string(news_title)
    vocab.save('vocab.db')
# Write the label file (default name 'metadata.tsv') referenced by the
# TensorBoard callback's embeddings_metadata argument.
vocab.write_metadata()
# Display the vocabulary size, including the two reserved ids.
vocab.num_chars

5328

In [103]:
from IPython.display import Markdown, display
# Headlines used for quick qualitative checks of the tagger.
# Every entry ends with a comma: without one, Python's implicit string-literal
# concatenation silently merges two adjacent headlines into a single item.
sample_news = [
    '小米MIX 3首发上手：滑盖全面屏，均衡无短板的体验',
    '广达回应Apple Watch工厂聘用学生：未与任何学校合作',
    '游族网络：前三季度净利润6.83亿元，同比增长49%',
    '彭博：SpaceX将通过高盛寻求5亿美元杠杆贷款',
    '无桩共享式自行车“摩拜单车”进入北京，半小时一元',
    '中信重工：拟设立两家子公司 注册资本各5000万元',
]

def format_tagged_sentence(model, sentence):
    """Render `sentence` as HTML with each character coloured by its predicted score.

    The sentence is encoded with the notebook-level `vocab`, passed to
    `model.predict` as a batch of one, and every character is wrapped in a
    <span> whose colour depends on the score bucket
    (< 0.1, [0.1, 0.5), [0.5, 0.9), >= 0.9) and whose title attribute carries
    the raw score.

    Args:
        model: object exposing predict(); it receives an array of shape
            (1, len(sentence)) and its result is flattened to one score per
            character.
        sentence: the text to tag.

    Returns:
        An HTML fragment; the empty string for an empty sentence.
    """
    # Local import keeps the function self-contained.
    from html import escape

    if not sentence:
        # Nothing to tag; avoid sending a zero-length sequence to the model.
        return ''

    s = vocab.prepare_input_sequence(sentence)
    pred = model.predict(s.reshape(1, -1)).reshape(-1)

    def format_c(c, p):
        if p < 0.1:
            color = '#f20'
        elif p < 0.5:
            color = '#a80'
        elif p < 0.9:
            color = '#480'
        else:
            color = '#2f0'
        # Escape the character so that '<', '>' or '&' in a headline cannot
        # break the generated markup.
        return f'<span style="color: {color}" title="{p}">{escape(c)}</span>'

    return ''.join(format_c(c, p) for p, c in zip(pred, sentence))

# def extract_tagged_sentence(model, sentence):
#     s = vocab.prepare_input_sequence(sentence).view(-1, 1)
#     pred = torch.sigmoid(model(s.to(device=device))).view(-1).tolist()
#     return ''.join(c for p,c in zip(pred, sentence) if p > 0.5)

def pprint_text(txt):
    """Show the HTML string `txt` as rich output of the current notebook cell."""
    from IPython.display import display, HTML
    rendered = HTML(txt)
    display(rendered)

# pprint_text('|'.join(format_tagged_sentence(model, s) for s in sample_news))

In [109]:
# NOTE(review): GLOBAL_OFFSET is not referenced by any visible code.
GLOBAL_OFFSET = 0
# Number of corpus rows in the training slice (starting at row 0).
TRAINING_SIZE = 4150000
# Number of corpus rows in the validation slice, taken right after the training rows.
VALIDATION_SIZE = 50000
# Rows per batch, used by both data sets and by print_failed_tests.
BATCH_SIZE = 6000


class DataSet(keras.utils.Sequence):
    """Keras Sequence serving padded batches from a contiguous slice of `corpus`.

    The slice covers rows [offset, offset + size) of the notebook-level
    `corpus` DataFrame; each batch is encoded with the notebook-level `vocab`
    and padded to the longest title within that batch.
    """

    def __init__(self, offset, size, batch_size):
        """
        Args:
            offset: index of the first corpus row of this data set.
            size: number of rows in this data set.
            batch_size: maximum number of rows per batch.
        """
        self.offset = offset
        self.size = size
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches; the last one may be shorter than batch_size."""
        return int(np.ceil(self.size / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return (inputs, targets) for batch number `idx`.

        Raises:
            IndexError: if `idx` is outside [0, len(self)). Without this check
                an out-of-range index produced an empty batch that failed
                later with an unrelated error.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f'batch index {idx} out of range for {len(self)} batches')
        start = self.offset + self.batch_size * idx
        # Clip the final batch to the end of this data set's slice.
        stop = min(start + self.batch_size, self.offset + self.size)
        df = corpus[start:stop][['title', 'begin', 'length']]
        inputs, targets = vocab.prepare_examples(df.values.tolist())
        return inputs, targets

# Training batches come from the first TRAINING_SIZE corpus rows; validation
# batches from the VALIDATION_SIZE rows immediately after them.
training_set = DataSet(0, TRAINING_SIZE, BATCH_SIZE)
validation_set = DataSet(TRAINING_SIZE, VALIDATION_SIZE, BATCH_SIZE)

In [110]:
# Run identifier; used in the checkpoint file names and the TensorBoard log directory.
model_no = 'ef4'

# Dropout rate read by generate_model() for all of its Dropout layers.
dropout = 0.2

def generate_model():
    """Build the character tagger: embedding -> 3 bidirectional GRU stages -> dense head.

    The input is a variable-length sequence of character ids. Each of the
    three recurrent stages runs a bidirectional CuDNNGRU(32), concatenates its
    output with the normalised embedding, then applies batch normalisation
    and dropout. The head is Dense(64, relu) + dropout followed by a single
    sigmoid unit per position, reshaped so the output has one score per
    input position.

    Reads the notebook-level `vocab` (vocabulary size) and `dropout` (rate).
    """
    tokens = Input(shape=(None,))
    # The embedding is created without mask_zero, so padding is not masked.
    embedded = Embedding(vocab.num_chars, 48, name='embedding')(tokens)
    norm0 = BatchNormalization()(embedded)

    features = norm0
    for _ in range(3):
        recurrent = Bidirectional(CuDNNGRU(32, return_sequences=True))(features)
        merged = Concatenate()([norm0, recurrent])
        features = Dropout(dropout)(BatchNormalization()(merged))

    con = Concatenate()([norm0, features])

    hidden = Dense(64, activation='relu')(con)
    hidden = Dropout(dropout)(hidden)
    scores = Dense(1, activation='sigmoid', name='fc')(hidden)

    # Drop the trailing singleton axis: one score per position.
    flat_scores = Reshape((-1,))(scores)

    return Model(tokens, flat_scores)

# Instantiate the network and show its input/output shapes and layer table.
model = generate_model()
    
print(model.input_shape)
print(model.output_shape)

# NOTE(review): model.summary() presumably prints the table itself and returns
# None, in which case the surrounding print() adds a stray "None" line -- confirm.
print(model.summary())

# NOTE(review): variance_ratio is not referenced by any active code in the
# visible cells.
variance_ratio = 0.00001
def loss(y_true, y_pred):
    """Binary cross-entropy between `y_true` and `y_pred`.

    Thin wrapper around keras.losses.binary_crossentropy; no additional
    penalty term is applied.
    """
    return keras.losses.binary_crossentropy(y_true, y_pred)
# Compile with Adam and binary cross-entropy, tracking accuracy.
# Note that keras.losses.binary_crossentropy is passed directly rather than
# the loss() wrapper defined above.
model.compile(optimizer='adam', loss=keras.losses.binary_crossentropy, metrics=['acc'])


# model.predict(np.arange(UNROLL_SIZE).reshape(1, -1))

(None, None)
(None, None)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_26 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 48)     255744      input_26[0][0]                   
__________________________________________________________________________________________________
batch_normalization_101 (BatchN (None, None, 48)     192         embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional_76 (Bidirectional (None, None, 64)     15744       batch_normalization_101[0][0]    
___________________________________________________________________________________

In [111]:
import tensorflow as tf
# Single headline kept for the (commented-out) manual loss/metric checks below.
news = '中信重工：拟设立两家子公司 注册资本各5000万元'
# x = training_set[0][0][0]
# y_true = K.variable(training_set[0][1][0])
# y_pred = K.variable(model.predict(x).reshape(-1))
# print(K.eval(y_true))
# print(K.eval(y_pred))
# print(K.eval(keras.losses.binary_crossentropy(y_true, y_pred)))
# print(K.eval(keras.metrics.binary_accuracy(y_true, y_pred)))
# pprint_text(format_tagged_sentence(model, news))

In [112]:
# Rebinds sample_news (already defined in an earlier cell) to the headlines
# that PrintSampleText renders during training.
sample_news = [
    '焦点分析丨马云刘强东都行动了，奢牌为什么还是更愿意在微信上开店？',
    '“阿里”股东的惊天计划 “中国天使投资人学院”人去楼空',
    '科技早报：iPhone XR在中国很受欢迎，360赴美IPO',
    '联想创投贺志强：两种创企将引领智能互联网未来'
]

# sample_news = corpus[0:10]['title']

class PrintSampleText(keras.callbacks.Callback):
    """Keras callback that renders the tagged `sample_news` headlines at the
    start of every epoch, as a quick qualitative progress check."""

    def on_epoch_begin(self, batch, logs=None):
        """Display every headline of the notebook-level `sample_news`, tagged
        by the model being trained and joined with '|'.

        Args:
            batch: positional index passed by Keras (the epoch number for this
                hook); unused. The parameter name is kept for compatibility.
            logs: metric dictionary passed by Keras; unused. Defaults to None
                rather than a shared mutable dict.
        """
        # Use the model Keras attached to this callback instead of whatever
        # the notebook-global name `model` happens to be bound to.
        pprint_text('|'.join(format_tagged_sentence(self.model, s) for s in sample_news))
        

# Callbacks handed to fit_generator below.
callbacks = [
    # NOTE(review): Keras presumably installs BaseLogger and History on its
    # own during fit; listing them again may be redundant -- confirm.
    keras.callbacks.BaseLogger(),
    keras.callbacks.History(),
    # Checkpoint file names embed the run id, the epoch and the validation
    # loss; the 'weights/' directory is not created by any visible code.
    keras.callbacks.ModelCheckpoint('weights/%s-{epoch:02d}-{val_loss:.4f}.hdf5' % model_no, save_best_only=True, ),
    # Stops training based on val_loss with patience=5.
    keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1),
    # NOTE(review): embeddings_freq=0 presumably disables embedding logging,
    # leaving the other embeddings_* arguments without effect -- confirm.
    keras.callbacks.TensorBoard(
        log_dir=f'./logs/{model_no}',
        update_freq='epoch',
        write_graph=False,
        embeddings_freq=0,
        embeddings_layer_names=['embedding'],
        embeddings_metadata={ 'embedding': 'metadata.tsv' },
        embeddings_data=np.arange(vocab.num_chars)
    ),
    # Monitors the training loss, unlike EarlyStopping above, which monitors val_loss.
    keras.callbacks.ReduceLROnPlateau(monitor='loss'),
    PrintSampleText()
]

In [114]:
# Train on the Sequence-based data sets for at most 300 epochs, using the
# callbacks defined above.
model.fit_generator(
    generator=training_set,
    epochs=300,
    validation_data=validation_set,
    callbacks=callbacks,
    initial_epoch=0,
    shuffle=True
)

Epoch 1/300


Epoch 2/300


Epoch 3/300


Epoch 4/300


Epoch 5/300


Epoch 6/300


Epoch 7/300


Epoch 8/300


Epoch 9/300


Epoch 10/300


Epoch 11/300


Epoch 12/300


Epoch 13/300


Epoch 14/300


Epoch 15/300


Epoch 16/300


Epoch 17/300


Epoch 18/300


Epoch 19/300


Epoch 20/300


Epoch 21/300


Epoch 22/300


Epoch 23/300


Epoch 24/300


Epoch 25/300


Epoch 26/300


Epoch 27/300


Epoch 28/300


Epoch 29/300


Epoch 30/300


Epoch 31/300


Epoch 32/300


Epoch 33/300


Epoch 34/300


Epoch 35/300


Epoch 36/300


Epoch 37/300


Epoch 38/300


Epoch 39/300


Epoch 40/300


Epoch 41/300


Epoch 42/300


Epoch 43/300


Epoch 44/300


Epoch 45/300


Epoch 46/300


Epoch 47/300


Epoch 48/300


Epoch 49/300




IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
### model.predict([[1,0,0]])

In [115]:
def print_failed_tests(model, dataframe):
    """Display every row of `dataframe` whose rounded prediction differs from its target.

    Rows are processed in chunks of the notebook-level BATCH_SIZE. A row
    counts as failed when the rounded model output differs from the target at
    any position of the padded batch. Each failed row is shown (via
    pprint_text) as "(position) entity / highlighted title", and a summary
    line is printed at the end.

    Args:
        model: object exposing predict(), passed on to format_tagged_sentence.
        dataframe: DataFrame with 'title', 'begin', 'length' and 'entity' columns.
    """
    # Local import keeps the function self-contained.
    from html import escape

    count = 0
    for offset in range(0, len(dataframe), BATCH_SIZE):
        batch = dataframe.iloc[offset:offset + BATCH_SIZE]
        examples = batch[['title', 'begin', 'length']].values.tolist()
        inputs, targets = vocab.prepare_examples(examples)
        outputs = np.round(model.predict(inputs)).astype(np.int64)
        rows, = np.where(np.any(outputs != targets.astype(np.int64), axis=1))

        for row in rows:
            record = dataframe.iloc[offset + row]
            highlight = format_tagged_sentence(model, record['title'])
            # The entity text goes into HTML, so escape it; str() also copes
            # with non-string cells such as NaN.
            entity = escape(str(record['entity']))
            pprint_text(f'({offset+row}) {entity} / {highlight}')
        count += len(rows)
    print(f'{count} out of {len(dataframe)} failed examples found and printed')

# Check the first 200 corpus rows.
print_failed_tests(model, corpus[:200])

26 out of 200 failed examples found and printed


In [None]:
# Inspect row 128 of the input array of the first validation batch.
validation_set[0][0][128]

In [None]:
# Run the model on one hand-written sequence of character ids (a batch of one).
inp = np.array([[2200,   27,  127,  128,  557,  210,   30,    6,  409,  900,  234,
         283,  284,  209,  357,  522, 1950, 1951  
           ]])
model.predict(inp)

In [119]:
# Restore the weights saved for the chosen epoch.
epoch = 42
# saved_state = list(glob(f'weights/{model_no}-{epoch:02d}-*.hdf5'))[0]
# The run id 'ef4' is hard-coded in the pattern; the first match is used, so
# the [0] lookup raises IndexError when no checkpoint file matches.
saved_state = list(glob(f'weights/ef4-{epoch:02d}-*.hdf5'))[0]
model.load_weights(saved_state)

# news = corpus[15:100]['title']
news = """360金融申请上市 斗士周鸿祎的孤独旅行
​卖好车创始人&CEO李研珠：B2B方法论的玩法变了
早讯丨5G系统频率使用许可将于年内发放；无人驾驶进入商用新阶段
潮科技 | ​搭载长征二号丙火箭，天仪研究院四星成功发射
消逝的创始人
腾讯反思录：这家“巨无霸”到底在怎么做投资？
虽然卖了1个亿，网易还是发现卖情趣用品这件事没那么容易
科大讯飞还能“飞”多远？
沸点资本创始合伙人姚亚平确认参加“2018中国家居家装产业创新论坛”
锤子整合迎生死时刻 罗永浩盈利模式再临考验
资本看好的新型诊所行业融资盘点
B站和腾讯达成战略合作，ACG产业又迎来春天了吗？
云锋基金领投云学堂C轮融资 企业学习领域会讲出什么新故事？
泛音乐必需的两块拼图：视频和社交
打造AI应用开发平台，Paperspace获1300万美元A轮融资
点融网再曝人事震荡：创始人郭宇航淡出、高管为抢公章打人
首发丨地上铁宣布获3亿元B1轮融资，布局精细化经营能力建设
大麻监管系统Metrc获5000万美元融资，赋能产业合规性
「地上铁」获近3亿元B1轮融资，博将资本领投
【 上周投资人都在看 】前十名被投资人抢夺的项目10-26
焦点分析丨马云刘强东都行动了，奢牌为什么还是更愿意在微信上开店？
亲宝宝完成数亿元C轮融资，联姻好未来发力家庭智能育儿服务
建材供应链平台中装速配完成2000万元Pre-A轮融资，银河系创投领投
钟鼎创投尹军平：美国物流25年走过的路，中国10年就能走完
美媒：云计算出现增长放缓迹象 IT业繁荣见顶了？
360金融计划登陆纽交所，周鸿祎为实控制人
微软市值超过亚马逊，成美国市值第二高企业
黄金钱包：互联网黄金第一平台的颠覆与重塑
建材供应链平台“中装速配”完成2000万元Pre-A轮融资 赋能装企实现科技化升级
谁能救救贾跃亭和吃中国韭菜长大的FF？
威马被指9.28交付大会发车造假 物流车绕一圈后又开回工厂
“阿里”股东的惊天计划  “中国天使投资人学院”人去楼空
Tether一蹶不振，稳定币又成加密货币“香饽饽”？
上市、转型、营销：头部P2P绝地求生，行业淘汰赛加速
36氪专访 | 联想副总裁张华：联想消费业务向To C转型，刘军回归是转折点
【猎云早报】传滴滴正在探索酒店业务；饿了么被指挪用商户执照；老虎山黑糖专卖完成1.5亿元A轮融资
IBM拟斥资340亿美元收购开源解决方案供应商红帽
将AI视觉用于外观缺陷检测，「数优」已累计融资超8000万元
深度丨2018年软装供应链行业的三个大坑和四个趋势（上）
科技早报：iPhone XR在中国很受欢迎，360赴美IPO
达晨新基金成立：规模46.3亿元  LP阵容曝光  详解背后募资历程
IBM迄今最大规模收购案，拟斥资340亿美元收购Linux发行商“红帽”
热点 | IBM将以334亿美元的价格收购红帽公司 成IBM107年历史中最豪的一笔
信息泛滥，时间稀缺，媒体的后真相时代正在到来
8点1氪 | 库克：很开心iPhone XR在中国大受欢迎；小米手机今年出货量破亿；马蜂窝点评一条2元
点融网再曝人事震荡：创始人郭宇航淡出、高管为抢公章打人
THE ONE集团执行总监Jack受邀出席联合国2018世界投资论坛并作主题发言
美国采用3D打印技术，研发出智能机器猫，中国设计参与其中
点融网人事大震荡：创始人被架空  新任执行总裁为抢公章打人
云帐房完成3亿元C轮融资，高瓴资本领投
再生资源智能回收交易平台“小黄狗”获上市公司易事特1.5亿元战略投资
美媒：IBM红帽在一起是好事，能挑战亚马逊微软
财税SaaS服务商云帐房完成3亿元C轮融资，B轮领投方高瓴资本继续领投
柳传志视频致辞：要吃着碗里的看着锅里的
定制家居问题多，牢记五点避开商家陷阱
马云：阿里20年，我的十次生死危机
联想创投贺志强：两种创企将引领智能互联网未来
消费金融行业反思：我们能从危机中吸取教训吗？
获嘉德资本9000万元A轮融资 日光旅文着力度假综合体运营
新能源运营服务平台地上铁完成近3亿元B1轮融资，博将资本领投
半年两轮融资超2亿 他为300品牌外卖代运营 覆盖85城万家店 月GMV3.5亿
泛音乐必需的两块拼图：视频和社交
巴西“支付宝”Stoneco赴美上市首日涨30.62% 蚂蚁金服、巴菲特持股
马云在南非被一只猫抢了风头 南非总统宣布：全力投入天猫双11
自动驾驶时代已来！Waymo无人车投入商用，正在测试定价模型
花38亿买981个公号交易黄了！上市公司买家现在才知道不值？
达晨成立新基金，规模46.3亿元人民币
360金融递交IPO招股书 拟筹最多2亿美金 2018上半年净亏损5.7亿元人民币
传滴滴正在探索酒店业务，项目处于初期考察阶段
24大奖项 330榜单 3000+参选企业！欢迎点击参评
一周智慧城市丨安博会企业亮点节选；腾讯发力智慧交通
一周智造丨阿里云推IoT全球原产地溯源计划，百度打造AI+钢铁示范样本
福特中国区大调整欲摆脱下滑困境三大难题待解如何得“生机”？
首发丨建材供应链平台中装速配获2000万元Pre-A轮融资
播客的未来
急需“傍大款”的Snapchat该委身Netflix吗？
「创新的年轮」2003年，新一轮互联网布局从这里开始
从平民窟到东海岸创新基地，萨默维尔市二十年颠覆之路
医院检验科、输血科、病理科合并，将给医改带来哪些影响？
不到半年融资规模超20亿  社区团购只是一场“新瓶装旧酒”的尝试？
起底比特币顶级“庄家联盟”：迅速集结数百亿，操纵比特币
一文打尽所有爆料，关于最新款Mac、iPad的信息都在这里了
Update | 「中装速配」获2000万元Pre-A轮融资，整装趋势下诞生的建材供应链平台
Teikametrics获1000万美元A轮融资，帮助第三方商家应对亚马逊广告战
圆通速递董事长喻渭蛟：超越与被超越
FF称恒大健康给资本市场释放错误信息 恒大：考虑起诉贾跃亭
36氪首发 | 用大数据和语义分析撬动专家网络市场，「六度智囊」获百万级天使轮投资
南非总统给马云送礼物，马云的回答亮了
高灯电子发票生态沙龙在京举行，助力中小微企业开启智慧财税大门
代购落幕，全球化妆品品牌如何再造中国市场?
国潮崛起，主打"无性别化"的「Bosie」要做快时尚设计师品牌
微软75亿美元收购开源代码库GitHub，后者将保持独立运营
36氪首发 |「亲宝宝」获好未来领投数亿元C轮融资，为-1至6岁家庭提供一站式育儿服务
锤子整合迎生死时刻，罗永浩盈利模式再临考验
新资本 | 翊翎资本王斌：用产业逻辑做投资，不按图索骥，要亲探水温
原凤凰网旅游品牌总监打造村落游 已布局10个村落 小程序11月上线
尼尔森过招人人车背后：二手车平台谁是王者？
2018餐创新未来（青岛站）完美落幕，看行业大佬都讲了哪些干货？
刷新世界纪录，发布国家平台，云从科技AI帝国正在悄然崛起
石油产量受限，汽油价格多次连涨，有人却想趁机投资赚大钱？"""

# Render every headline of `news` (one per line) with per-character highlighting.
for title in news.split('\n'):
    pprint_text(format_tagged_sentence(model, title))