# <center>Python课程实践--深度学习</center>

### 命名实体识别

![](images/ner.png)

图中输入是word embedding,使用双向lstm进行encode，对于lstm的hidden层，接入一个大小为[hidden_dim,num_label]的一个全连接层就可以得到每一个step对应的每个label的概率，也就是上图黄色框的部分。

### 1. 数据处理

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

In [2]:
data.head(10)

Unnamed: 0,Sentence,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [3]:
words = list(set(data["Word"].values))

words.append("UNKNOWN")
words.append("ENDPAD")

n_words = len(words)
print(n_words)

35167


In [4]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags
n_tags

17

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)

In [7]:
sent = getter.get_next()

In [8]:
sentences = getter.sentences

In [9]:
max_len = 35
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [10]:
word2idx["sup"]

34676

In [11]:
from keras.preprocessing.sequence import pad_sequences
x_data = [[word2idx[w[0]] for w in s] for s in sentences]
x_data = pad_sequences(maxlen=max_len, sequences=x_data, padding="post", value=n_words - 1)

Using TensorFlow backend.


In [12]:
y_data = [[tag2idx[w[2]] for w in s] for s in sentences]
y_data = pad_sequences(maxlen=max_len, sequences=y_data, padding="post", value=tag2idx["O"])

In [13]:
print(x_data[9])
print(y_data[9])

[26540 17019  6300 11557 10900 16768  5201 16834  6751 28740 24899 30342
 29858 21603  9685 19599  8659 19547 32177 10696 28574  6751 13953  3750
 27600 17859 27985 35166 35166 35166 35166 35166 35166 35166 35166]
[ 3 13 13 13 12 13 13 13 13 13 13 13 13  9 13 13 13 13 13 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 13]


In [14]:
from keras.utils import to_categorical
y_data = [to_categorical(i, num_classes=n_tags) for i in y_data]

In [15]:
print(x_data[1])
print(y_data[1])

[33611 10048 23348 29281  4913 19547 19634 30144 19547 26640 25548  8351
 29858 13953  7615 25425 10006 20873 19599 18923 25647 13916  3557 13633
 27985 35166 35166 35166 35166 35166 35166 35166 35166 35166 35166]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

### 2. 构建模型

In [17]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,CuDNNLSTM

model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=100, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 100)           3516700   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 35, 200)           160800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 35, 17)            3417      
Total params: 3,680,917
Trainable params: 3,680,917
Non-trainable params: 0
_________________________________________________________________


### 3. 模型训练 

In [None]:
model.fit(x_train, np.array(y_train), batch_size=30, epochs=10, validation_split=0.1, verbose=2)

Instructions for updating:
Use tf.cast instead.
Train on 34530 samples, validate on 3837 samples
Epoch 1/10
 - 128s - loss: 0.1770 - acc: 0.9551 - val_loss: 0.0835 - val_acc: 0.9763
Epoch 2/10
 - 125s - loss: 0.0708 - acc: 0.9793 - val_loss: 0.0708 - val_acc: 0.9794
Epoch 3/10
 - 129s - loss: 0.0592 - acc: 0.9825 - val_loss: 0.0673 - val_acc: 0.9802
Epoch 4/10


### 4. 模型评价

In [None]:
score,acc = model.evaluate(x_test, np.array(y_test), verbose = 2, batch_size = 15)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))