# Word2vec Text Classification

The dataset we'll be classifying is the 20-newsgroups dataset, available from:

http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html

It's described as:

`This data set is a collection of 20,000 messages, collected from 20 different netnews newsgroups. One thousand messages from each of the twenty newsgroups were chosen at random and partitioned by newsgroup name. The list of newsgroups from which the messages were chose[n] is as follows:`

- `alt.atheism`
- `talk.politics.guns`
- `talk.politics.mideast`
- `talk.politics.misc`
- `talk.religion.misc`
- `soc.religion.christian`

- `comp.sys.ibm.pc.hardware`
- `comp.graphics`
- `comp.os.ms-windows.misc`
- `comp.sys.mac.hardware`
- `comp.windows.x`

- `rec.autos`
- `rec.motorcycles`
- `rec.sport.baseball`
- `rec.sport.hockey`

- `sci.crypt`
- `sci.electronics`
- `sci.space`
- `sci.med`

- `misc.forsale`


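The model is borrowed from the [keras examples database](https://github.com/fchollet/keras/tree/master/examples), with some code tweaks in order to be more efficient and improve model performance. Note that, although the title mentions Word2vec, the pre-trained word vectors loaded below are the 300-dimensional GloVe vectors (`glove.6B.300d.txt`).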

In [1]:
import os
import sys
import numpy as np

# Process data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Build model
from keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, Embedding
from keras.models import Model

# Callbacks
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

Using TensorFlow backend.


In [2]:
# --- Configuration ---------------------------------------------------------
BASE_DIR = '/input'                        # root folder holding all input files
TEXT_DATA_DIR = BASE_DIR + '/newsgroups/'  # one sub-folder per newsgroup
MAX_SEQUENCE_LENGTH = 1000                 # width every post is padded/truncated to
MAX_NB_WORDS = 20000                       # vocabulary cap passed to the tokenizer
EMBEDDING_DIM = 300                        # must match the GloVe file loaded below


print('Indexing word vectors.')

# Each line of the GloVe file is "<token> <v1> <v2> ... <vN>"; build a
# token -> float32 vector lookup from it.
glove_path = os.path.join(BASE_DIR, 'glove.6B.300d.txt')
with open(glove_path, encoding='utf-8') as glove_file:
    rows = (raw_line.split() for raw_line in glove_file)
    embeddings_index = {
        fields[0]: np.asarray(fields[1:], dtype='float32')
        for fields in rows
    }

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [3]:
print('Processing text dataset.')

texts = []        # text samples
labels_index = {} # maps label name to numeric id
labels = []       # label ids

# Every sub-directory of TEXT_DATA_DIR is treated as one class. Iterating in
# sorted order makes the name -> id assignment independent of the arbitrary
# order returned by os.listdir().
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    label_id = len(labels_index)   # next unused id
    labels_index[name] = label_id  # assign label to folder name
    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue  # only files whose name is all digits are read as posts
        fpath = os.path.join(path, fname)
        # The context manager closes the handle even if read() raises.
        with open(fpath, encoding='latin-1') as f:
            post = f.read()
        # Drop everything before the first blank line (the message header).
        # If no blank line exists (find() == -1) or it sits at offset 0,
        # the post is kept unchanged.
        i = post.find('\n\n')
        if 0 < i:
            post = post[i:]
        texts.append(post)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

Processing text dataset.
Found 19997 texts.


In [4]:
print('Vectorizing text.')

# Learn the vocabulary from the whole corpus; MAX_NB_WORDS caps how many
# distinct word ids the tokenizer will emit when encoding.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# word_index holds every token seen during fitting (not just the capped set).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each post as a list of word ids, then force a fixed width so the
# posts stack into a single 2-D integer array.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer class ids.
# NOTE: this rebinds `labels` from a list of ids to a 2-D array, so this
# cell is not safe to re-run without first re-running the loading cell.
labels = to_categorical(np.asarray(labels))

for caption, tensor in (('Shape of data tensor:', data),
                        ('Shape of label tensor:', labels)):
    print(caption, tensor.shape)

Vectorizing text.
Found 174074 unique tokens.
Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


In [5]:
print('Splitting the training and validation data.')

# Hold out 20% of the samples for validation. The shuffle is seeded so that
# a fresh "Restart & Run All" yields the same split (and comparable metrics).
SPLIT_SEED = 42
x_train, x_val, y_train, y_val = train_test_split(
    data, labels, test_size=0.2, random_state=SPLIT_SEED)

Splitting the training and validation data.


In [6]:
print('Preparing embedding matrix.')

# Prepare embedding matrix.
# Row i holds the pre-trained vector for the word whose tokenizer id is i.
# Tokenizer ids start at 1 (0 is reserved and is what padding uses), so a
# vocabulary of N words needs N + 1 rows; without the "+ 1" a vocabulary
# smaller than MAX_NB_WORDS would index one past the end of the matrix.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue  # id falls outside the capped vocabulary
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words not found in the embedding index keep their all-zero row.

# Load pre-trained word embeddings into an Embedding layer.
# trainable=False freezes the vectors so training does not update them.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix.


In [7]:
# Callbacks
# Multiply the learning rate by 0.75 when the *training* loss has not
# improved for 3 epochs, never going below 1e-4.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.75,
              patience=3, min_lr=0.0001)

# Stop training when validation accuracy has not improved by at least 0.01
# for 4 consecutive epochs.
early_stop = EarlyStopping(monitor='val_acc', min_delta=.01,
                           patience=4, verbose=1, mode='auto')

# 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input')
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D(512 filters, width 5) stages, each followed by max-pooling.
x = Conv1D(512, 5, activation='relu', name='convolution_1')(embedded_sequences)
x = MaxPooling1D(5, name='maxpool_1')(x)

x = Conv1D(512, 5, activation='relu', name='convolution_2')(x)
x = MaxPooling1D(5, name='maxpool_2')(x)

x = Conv1D(512, 5, activation='relu', name='convolution_3')(x)
# With MAX_SEQUENCE_LENGTH = 1000 the third convolution outputs 35 time
# steps, so a pool size of 35 collapses the whole sequence axis (i.e. it acts
# as the global max-pool). This value must be revisited if the sequence
# length or the earlier layers change.
x = MaxPooling1D(35, name='maxpool_3')(x)

x = Flatten(name='flatten')(x)
x = Dense(512, activation='relu', name='fully_connected')(x)

# One softmax output unit per newsgroup.
preds = Dense(len(labels_index), activation='softmax', name='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

# Keep the returned History object so the per-epoch metrics can be inspected
# or plotted afterwards (and so its repr is not echoed as the cell output).
history = model.fit(x_train, y_train,
                    batch_size=128,
                    epochs=20,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stop, reduce_lr])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         6000000   
_________________________________________________________________
convolution_1 (Conv1D)       (None, 996, 512)          768512    
_________________________________________________________________
maxpool_1 (MaxPooling1D)     (None, 199, 512)          0         
_________________________________________________________________
convolution_2 (Conv1D)       (None, 195, 512)          1311232   
_________________________________________________________________
maxpool_2 (MaxPooling1D)     (None, 39, 512)           0         
_________________________________________________________________
convolution_3 (Conv1D)       (None, 35, 512)           1311232   
__________

<keras.callbacks.History at 0x7fd0614836a0>