In [15]:
import numpy as np
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils import np_utils
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
import re

# Regex used to pull tokens out of raw text: runs of two or more word
# characters delimited by word boundaries, matched in Unicode mode.
token_pattern = r"(?u)\b\w\w+\b"


def build_tokenizer():
    """Build a tokenizer callable from the module-level ``token_pattern``.

    The pattern is compiled once, at the time this factory is called.

    Returns:
        A one-argument function mapping a document string to the list of all
        non-overlapping ``token_pattern`` matches, in order of appearance.
    """
    find_tokens = re.compile(token_pattern).findall

    def tokenize(doc):
        return find_tokens(doc)

    return tokenize

In [5]:
# Download (or load from the local cache) the two predefined splits of the
# 20 newsgroups corpus.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Bag-of-words vocabulary: drop English stop words, terms appearing in fewer
# than 5 documents, and terms appearing in more than 95% of documents.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.95,
)

# The vocabulary is learned on the training split only; the test split is
# projected onto that same vocabulary.
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# (n_train_documents, vocabulary_size)
vectors.shape

(11314, 25634)

In [9]:
# List of vocabulary terms from the fitted vectorizer. `vocab` is reused by the
# encoding and model-building cells further down.
vocab = vectorizer.get_feature_names()
# Bare expression in the middle of a cell: it is evaluated but NOT displayed,
# because a notebook only shows the value of a cell's last expression.
vocab[19999]
# Last expression of the cell, so this is what gets displayed: the value the
# vectorizer's term -> index mapping stores for 'roid'.
# NOTE(review): the recorded output (u'roid') looks like the value of
# vocab[19999] rather than of this lookup, so the saved output is probably
# stale -- re-run to confirm.
vectorizer.vocabulary_['roid']

u'roid'

In [108]:
tokenize = build_tokenizer()

# Term -> column index mapping of the fitted vectorizer. Membership tests on
# this dict are O(1); testing against the `vocab` list instead rescans the
# whole vocabulary for every token of every document.
word_to_index = vectorizer.vocabulary_

# Encode each training document as the sequence of ids of its in-vocabulary
# tokens. Ids are shifted by +1 so that id 0 never refers to a real word.
# NOTE(review): tokens are looked up exactly as they appear in the text. If the
# vectorizer lower-cased documents when building its vocabulary (the
# CountVectorizer default -- confirm), capitalised tokens are silently skipped
# here. Any change must be applied to the train and test encodings together.
X_train = [
    [word_to_index[word] + 1 for word in tokenize(doc) if word in word_to_index]
    for doc in newsgroups_train.data
]
    

In [51]:
# Term -> column index mapping of the fitted vectorizer. Membership tests on
# this dict are O(1); testing against the `vocab` list instead rescans the
# whole vocabulary for every token of every document.
word_to_index = vectorizer.vocabulary_

# Encode each test document as the sequence of ids of its in-vocabulary tokens,
# using the `tokenize` callable built in the training-encoding cell. Ids are
# shifted by +1 so that id 0 never refers to a real word.
X_test = [
    [word_to_index[word] + 1 for word in tokenize(doc) if word in word_to_index]
    for doc in newsgroups_test.data
]

In [96]:
# Length of the longest encoded document across BOTH splits. Kept under its own
# name for reference only: the padding length actually used is the fixed
# MAX_LEN below. (Each split's maximum is computed first and the two ints are
# then compared; comparing a list of lengths with an int is an error on
# Python 3 and ignores the test split on Python 2.)
longest_sequence = max(max(len(x) for x in X_train),
                       max(len(x) for x in X_test))

# Every document is padded / truncated to exactly this many token ids.
# NOTE(review): an earlier inline comment said "30 seems good"; 100 is the
# value that produced the shapes recorded in this cell's output.
MAX_LEN = 100

# `sequence` is keras.preprocessing.sequence (imported in the first cell).
X_train = sequence.pad_sequences(X_train, maxlen=MAX_LEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_LEN)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (11314, 100)
X_test shape: (7532, 100)


In [99]:
print('Build model...')

# Word ids were shifted by +1 when the documents were encoded, so the embedding
# table needs one more row than there are vocabulary terms.
n_embedding_rows = len(vocab) + 1

# Layers in the order they are stacked: embedding -> LSTM -> dropout ->
# 20-way dense layer -> softmax.
layer_stack = (
    Embedding(n_embedding_rows, 128, input_length=MAX_LEN, dropout=0.5),
    LSTM(128, dropout_W=0.5, dropout_U=0.1),  # try using a GRU instead, for fun
    Dropout(0.5),
    Dense(20, init='uniform'),  # 20 outputs -- presumably one per newsgroup class
    Activation('softmax'),
)

model = Sequential()
for layer in layer_stack:
    model.add(layer)

model.compile(loss='categorical_crossentropy', optimizer='adadelta')

Build model...


In [107]:
print('Train...')
batch_size = 32

# One-hot encode the integer class labels of each split.
y_train = np_utils.to_categorical(newsgroups_train.target)
y_test = np_utils.to_categorical(newsgroups_test.target)

# Fit for 20 epochs on the padded training sequences.
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=20,
          show_accuracy=True)

# With show_accuracy=True, evaluate() yields a (loss, accuracy) pair.
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 1.25328503232
Test accuracy: 0.740175252289


In [91]:
# Shape of the fourth row of the padded training matrix.
# NOTE(review): the saved output for this cell, (200,), does not match the
# MAX_LEN = 100 used when padding (X_train is reported as (11314, 100)); it
# looks left over from an earlier, out-of-order run -- re-run to confirm.
X_train[3].shape

(200,)