In [1]:
import numpy as np 

深度学习模型不会接受原始文本所为输入，它只能吃力数值张量。文本向量化(Vectorize)是指将文本转换为张量的过程。

文本向量化有多种实现方式：

- 将文本分割为单词，并将每个单词转换为一个向量
- 将文本分割为字符，并将每个字符转换为一个向量
- 提取单词或者字符的n-gram,并将每个n-gram转换为一个向量。n-gram是多个连续单词或者字符的结合(n-gram间可以重叠)

## 单词和字符的one-hot编码

In [2]:
# 单词级别的one-hot编码
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1 
max_length = 10 
results = np.zeros(shape=(len(samples), 
                             max_length,
                             max(token_index.values())+1 ))

# 将结果保存在results里面
for i ,sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i,j,index]=1 

In [3]:
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [4]:
# 使用keras实现one-hot编码
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer = Tokenizer(num_words=1000)
# 构建单词索引
tokenizer.fit_on_texts(samples)
# 将字符串转换为整数索引组成的列表
sequences = tokenizer.texts_to_sequences(samples)

Using TensorFlow backend.


In [5]:
tokenizer.document_count

2

In [6]:
tokenizer.word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

In [7]:
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [8]:
# 直接得到one-hot二进制表示
one_hot_results = tokenizer.texts_to_matrix(samples, mode='freq')

In [9]:
one_hot_results

array([[0.        , 0.33333333, 0.16666667, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.2       , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [10]:
one_hot_results.shape

(2, 1000)

In [11]:
# 加载IMDB数据
from keras.datasets import imdb
from keras import preprocessing

In [12]:
max_features = 10000 
maxlen = 20 
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [13]:
x_train.shape, y_train.shape

((25000,), (25000,))

In [14]:
len(x_train[0]), len(x_train[1])

(218, 189)

In [15]:
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [16]:
x_train.shape

(25000, 20)

In [17]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

In [18]:
model = Sequential()
model.add(Embedding(1000, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy',
             metrics=['acc'])

Instructions for updating:
Colocations handled automatically by placer.


In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             8000      
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 8,161
Trainable params: 8,161
Non-trainable params: 0
_________________________________________________________________


In [20]:
history = model.fit(x_train,y_train, epochs=10, batch_size=32, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# 使用预训练的词嵌入
import os 

In [22]:
imdb_dir = '../../data/imdb/aclImdb/'
train_dir = os.path.join(imdb_dir, 'train')

In [23]:
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            with open(os.path.join(dir_name, fname)) as f:
                texts.append(f.read())
                if label_type == 'neg':
                    labels.append(0)
                else:
                    labels.append(1)

In [24]:
from keras import preprocessing

In [25]:
maxlen = 100 
training_samples = 200 
validation_samples = 10000 
max_words = 10000 

# 只考虑数据集中前10000个最常见的单词
tokenizer =  preprocessing.text.Tokenizer(num_words=max_words)


In [26]:
len(texts)

25000

In [27]:
tokenizer.fit_on_texts(texts=texts)

In [28]:
sequences = tokenizer.texts_to_sequences(texts)

In [29]:
len(sequences)

25000

In [30]:
data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

In [31]:
labels = np.asarray(labels)

In [32]:
labels.shape

(25000,)

In [33]:
data.shape

(25000, 100)

In [34]:
indices = np.arange(data.shape[0])

In [35]:
# 将数据划分为训练集和验证集，需要打乱数据
np.random.shuffle(indices)

In [36]:
data = data[indices]
labels = labels[indices]

In [37]:
training_samples, validation_samples

(200, 10000)

In [38]:
x_train = data[:training_samples]
y_train = labels[:training_samples]

x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

### 对词嵌入进行预处理

对解压后的文件进行解析，构建一个将单词映射成其向量表示的索引

In [39]:
glove_file = '../../data/glove.6B.100d.txt'
embeddings_index = {}
i = 0 
with open(glove_file, 'r') as f:
    for line in f:
        i += 1
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print(i)

400000


In [42]:
word_index = tokenizer.word_index

In [None]:
# 构建一个可以加载到Embedding层中的嵌入矩阵
# 其形状为(max_words, embedding_dim)

embedding_dim = 100 
embedding_matrix = np.zeros((max_words, embedding_dim))

for embedding_vector, i in word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
from keras.models import Sequential
fromrom keras.layers import Embedding, Flatten, Dense