# <center>自然语言处理--高级应用</center>

### 命名实体识别

命名实体识别就是把不属于实体的字用O标注，把实体用BIO规则标注，最后按照BIO规则把实体提取出来。

![](images/ner.png)

图中的输入是 word embedding，使用双向 LSTM 进行编码；在 LSTM 的隐藏层输出之后接入一个大小为 [hidden_dim, num_label] 的全连接层（配合 softmax），就可以得到每个时间步（step）上各个 label 的概率，也就是上图黄色框的部分。

### 1. 数据处理

In [None]:
import pandas as pd
import numpy as np

# Load the token-level NER table; the file is decoded as latin1.
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")
# Forward-fill missing cells so each empty cell takes the last value seen
# above it in the same column. `DataFrame.ffill()` is the supported spelling
# of the deprecated `fillna(method="ffill")`.
# NOTE(review): presumably the sentence id is only present on the first row
# of each sentence -- confirm against the CSV.
data = data.ffill()

In [None]:
# Preview the first 50 rows of the loaded table.
data.head(50)

In [None]:
# Vocabulary: the distinct values of the "Word" column.
# Sorted so that the word -> index mapping is the same on every run; the
# iteration order of a set of strings changes between interpreter sessions.
words = sorted(set(data["Word"].values))

# Two extra entries appended after the corpus words; "ENDPAD" is therefore
# the last entry (index n_words - 1).
words.extend(["UNKNOWN", "ENDPAD"])

n_words = len(words)
print(n_words)

In [None]:
# Distinct labels of the "Tag" column, sorted so that the tag -> index
# mapping is reproducible across runs (set iteration order is not).
tags = sorted(set(data["Tag"].values))
n_tags = len(tags)
n_tags  # shown as the cell output

In [None]:
class SentenceGetter(object):
    """Group a token-level table into sentences of ``(word, pos, tag)`` tuples.

    ``data`` must provide the columns "Sentence", "Word", "POS" and "Tag";
    all rows sharing the same "Sentence" value form one sentence.
    """

    def __init__(self, data):
        # Number of the sentence that the next ``get_next`` call looks up.
        self.n_sent = 1
        self.data = data
        # Not read anywhere in this class; kept so existing users of the
        # attribute keep working.
        self.empty = False
        # One entry per "Sentence" value, each a list of (word, pos, tag).
        self.grouped = self.data.groupby("Sentence").apply(self._to_triples)
        # The same sentences as a plain list, in the order of ``grouped``.
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as (word, pos, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or ``None`` if there is no such sentence.

        The sentence is looked up under the key ``"Sentence: <n_sent>"``.
        The internal counter only advances when the lookup succeeds.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            sentence = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return sentence

In [None]:
# Group the whole table into sentences once.
getter = SentenceGetter(data)

In [None]:
# Fetch one sentence; the getter's counter starts at 1, so on a fresh getter
# this looks up "Sentence: 1".
sent = getter.get_next()

In [None]:
# Display the fetched sentence (a list of (word, pos, tag) tuples, or None).
sent

In [None]:
# All sentences as a list, as built by SentenceGetter.__init__.
sentences = getter.sentences

In [None]:
# Display every sentence.
# NOTE(review): this prints the entire list, which may be very long for a
# large corpus -- consider slicing when inspecting.
sentences

In [None]:
# Every sentence is padded/truncated to this many tokens.
max_len = 35
# Lookup tables: an entry's position in `words` / `tags` is its integer id.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Spot-check: id of the token "sup" (raises KeyError if it is not in the
# vocabulary).
word2idx["sup"]

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Encode each sentence as the ids of its words (field 0 of every triple).
x_data = [
    [word2idx[token[0]] for token in sentence]
    for sentence in sentences
]
# Pad at the end up to max_len, using the id of the last vocabulary entry
# (n_words - 1) as the filler value.
x_data = pad_sequences(
    sequences=x_data,
    maxlen=max_len,
    padding="post",
    value=n_words - 1,
)

In [None]:
# Encode each sentence as the ids of its tags (field 2 of every triple).
y_data = [
    [tag2idx[token[2]] for token in sentence]
    for sentence in sentences
]
# Pad at the end up to max_len; padded positions get the id of the "O" tag.
y_data = pad_sequences(
    sequences=y_data,
    maxlen=max_len,
    padding="post",
    value=tag2idx["O"],
)

In [None]:
# Show the encoded words and tags of the sample at index 9, one per line.
print(x_data[9], y_data[9], sep="\n")

In [None]:
from keras.utils import to_categorical
# One-hot encode every padded tag sequence over the n_tags classes.
y_data = [
    to_categorical(tag_ids, num_classes=n_tags)
    for tag_ids in y_data
]

In [None]:
# Show the encoded words and the one-hot tags of the sample at index 1.
print(x_data[1], y_data[1], sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set.
# NOTE(review): no random_state is passed, so the split is different on
# every run -- pass one if the evaluation must be reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

### 2. 构建模型

In [None]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,CuDNNLSTM

# Embedding -> bidirectional LSTM -> per-timestep softmax over the tags.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=100, input_length=max_len),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(n_tags, activation="softmax")),
])
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot tag targets; accuracy is the
# only extra metric tracked.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

### 3. 模型训练 

In [None]:
# Train for 2 epochs in batches of 64, holding out 10% of the training data
# for validation. np.array(...) turns y_train into a single array before it
# is passed to Keras.
model.fit(x_train, np.array(y_train), batch_size=64, epochs=2, validation_split=0.1, verbose=2)

### 4. 模型评价

In [None]:
# Evaluate on the held-out test set. scores[1] is printed as the accuracy
# (the single metric passed to compile), as a percentage.
scores = model.evaluate(x_test, np.array(y_test), verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

### 5. 模型拓展

**BiLSTM与CRF**
![](images/crf.png)

**keras-contrib安装**
```bash
pip install git+https://www.github.com/keras-team/keras-contrib.git
```

**CRF模型**
```py
from keras.models import Model, Sequential
from keras.layers import Embedding
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

# Embedding -> CRF tagger.
model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=100, input_length=max_len))
# The labels built earlier in this notebook are one-hot (to_categorical), so
# the CRF must not be told to expect sparse integer targets; with
# sparse_target=True it would read one-hot labels as class indices.
crf = CRF(n_tags, sparse_target=False)
model.add(crf)
model.summary()

# The CRF layer is trained with its own loss and Viterbi-decoded accuracy.
model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
```

**BiLSTM-CRF模型**
```py
from keras.models import Model, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,CuDNNLSTM
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

# Embedding -> bidirectional LSTM -> CRF tagger.
model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=100, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
# The labels built earlier in this notebook are one-hot (to_categorical), so
# the CRF must not be told to expect sparse integer targets; with
# sparse_target=True it would read one-hot labels as class indices.
crf = CRF(n_tags, sparse_target=False)
model.add(crf)
model.summary()

# The CRF layer is trained with its own loss and Viterbi-decoded accuracy.
model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
```

# Any Questions?