Dive into Deep Learning 

Chapter 10

Section 8

In [1]:
import d2lzh as d2l
from mxnet import gluon
from mxnet.gluon import data as gdata, loss as gloss, nn 
from mxnet import gluon, init, nd
from mxnet.contrib import text
import collections

In [3]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec

In [4]:
# load model
model = Word2Vec.load("./word2vec.bin")

In [5]:
def get_vocab(data):
    wordsets = (' '.join(data['text'])).split(' ')
    counter = collections.Counter(wordsets)
    return text.vocab.Vocabulary(counter, min_freq=5, reserved_tokens=['<pad>'])


def preprocess_data(data,vocab):
    max_l = 1000
    
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [vocab.token_to_idx['<pad>']] * (max_l - len(x))
    
    features = nd.array([pad(vocab.to_indices(text.split())) for text in data['text']])
    labels = nd.array(data['label'])
    return features, labels



In [6]:
train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=20000)
train_df.head()

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...


In [8]:
vocab = get_vocab(train_df)

In [9]:
batch_size = 64
#d2l.download_imdb()

In [10]:
train_data = train_df.iloc[:18000,:]
test_data = train_df.iloc[18000:20000,:]

In [11]:
vocab

<mxnet.contrib.text.vocab.Vocabulary at 0x7facee231b90>

In [12]:
train_iter = gdata.DataLoader(gdata.ArrayDataset(
    *preprocess_data(train_data, vocab)), batch_size, shuffle = True)
test_iter = gdata.DataLoader(gdata.ArrayDataset(
    *preprocess_data(test_data, vocab)), batch_size)

In [13]:
class TextCNN(nn.Block):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels,
                     **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # 不参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab),embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Dense(2)
        # 时序最大池化层没有权重， 所以可以共用一个实例
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential() # 创建多个一维卷积层
        for c,k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c,k,activation='relu'))
            
    def forward(self, inputs):
        # 将两个形状是（批量大小，词数，词向量维度）的嵌入层的输出按词向量连接
        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs),dim=2)
        # 根据Conv1D要求的输入格式，将词向量维，即一维卷积层的通道维，变换到前一维
        embeddings = embeddings.transpose((0,2,1))
        # 对于每一个一维卷积层， 在时许最大池化后会得到一个形状为（批量大小，通道大小，1）的
        # NDArray。使用flatten函数去掉最后一维，然后再通道维上连接
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        # 应用dropout后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [14]:
#创建一个TextCNN实例。它有3个卷积层，它们的核宽分别为3、4、5，输出通道数均为100。
embed_size, kernel_sizes, num_channels = 100, [3,4,5], [100,100,100]
ctx = d2l.try_all_gpus()
net = TextCNN(vocab, embed_size, kernel_sizes, num_channels)
net.initialize(init.Xavier(), ctx=ctx)

In [32]:
# 1. 加载预训练的词向量


In [31]:
dim=100
def transform(word):
            try:
                vec = model.wv.get_vector(word)
            except KeyError:
                vec = np.array([0]*dim)
            return vec
       # [model.wv.get_vector(word) for word in txt.split(' ')]

def data_to_vec(data):
    return np.array([transform(word) for word in data])

In [33]:
net.embedding.weight.set_data(data_to_vec(vocab.idx_to_token))
net.constant_embedding.weight.set_data(data_to_vec(vocab.idx_to_token))
net.constant_embedding.collect_params().setattr('grad_req','null')

In [34]:
# 2. 训练模型
lr, num_epochs = 0.001, 5
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)

training on [cpu(0)]
epoch 1, loss 0.2418, train acc 0.310, test acc 0.318, time 553.5 sec
epoch 2, loss 0.1202, train acc 0.351, test acc 0.334, time 500.4 sec
epoch 3, loss 0.0919, train acc 0.360, test acc 0.332, time 543.5 sec
epoch 4, loss 0.0705, train acc 0.366, test acc 0.335, time 512.5 sec
epoch 5, loss 0.0547, train acc 0.370, test acc 0.337, time 477.4 sec


In [38]:
text = test_data['text'][0]
d2l.predict_sentiment(net, vocab, text.split())

KeyError: 0

In [21]:
import pickle

In [83]:
glove_embedding_handle = open('../pkl/glove_embedding.pkl','wb')
pickle.dump(glove_embedding,glove_embedding_handle)
glove_embedding_handle.close()

In [24]:
glove_embedding= pickle.load(open('../../textCNN/pkl/glove_embedding.pkl','rb'))


In [27]:
glove_embedding.idx_to_vec.shape

(46151, 100)

In [84]:
vocab_handle = open('../pkl/vocab.pkl','wb')
pickle.dump(vocab,vocab_handle)
vocab_handle.close()

In [85]:
net_handle = open('../pkl/net.pkl','wb')
pickle.dump(net,net_handle)
net_handle.close()