In [1]:
import os
import sys
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers

# from keras.layers import Dense, Input, GlobalMaxPooling1D
# from keras.layers import Conv1D, MaxPooling1D, Embedding
# from keras.models import Model
# from keras.initializers import Constant

Using TensorFlow backend.


In [2]:
# create input dataframe
def create_input_dataframe(dirpath):
    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    for name in sorted(os.listdir(dirpath)):
        path = os.path.join(dirpath, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    if sys.version_info < (3,):
                        f = open(fpath)
                    else:
                        f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

    data = pd.DataFrame({'document': texts, 'label': labels})
    data = data.sample(frac=1)
    return data

In [3]:
# create embedding matrix from pretrained embedding model
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [4]:
data = create_input_dataframe('data/2_newsgroup/')
print('input dataframe shape: ' + str(data.shape))
data = data.sample(frac=1)
data.head()

input dataframe shape: (2000, 2)


Unnamed: 0,document,label
121,\n\n<MVS104@psuvm.psu.edu> writes:\n\n>>Many p...,0
130,\n\nTony Lezard <tony@mantis.co.uk> writes:\n\...,0
41,"\n\nAs I was created in the image of Gaea, the...",0
93,\n\n>DATE: 5 Apr 1993 23:32:28 GMT\n>FROM: ...,0
1342,\n\nNO E-MAIL ADDRESS@eicn.etna.ch writes:\n\n...,1


In [5]:
# see the distribution of the document length
doc_len = [len(s.split()) for s in data['document']]
print('min tweet len: ' + str(np.min(doc_len)))
print('max tweet len: ' + str(np.max(doc_len)))
print('avg tweet len: ' + str(np.mean(doc_len)))
# plt.hist(doc_len, bins='auto')
# plt.show()

min tweet len: 0
max tweet len: 9207
avg tweet len: 276.7885


In [6]:
# get the documents and labels in numpy array
documents = data['document'].values
y = to_categorical(np.asarray(data['label'].values))

In [7]:
documents_train, documents_test, y_train, y_test = train_test_split(documents, y, test_size=0.25, random_state=1000)

In [8]:
MAX_NUM_WORDS = 20000
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
tokenizer.fit_on_texts(documents_train)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
print('vocab_size: ' + str(vocab_size))

vocab_size: 25235


In [9]:
X_train = tokenizer.texts_to_sequences(documents_train)
X_test = tokenizer.texts_to_sequences(documents_test)

maxlen = 1000

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [10]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix('resource/glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

In [14]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
overlap = nonzero_elements / vocab_size
print('vocab overlap: ' + str(overlap))

vocab overlap: 0.7459534698747265


In [15]:
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=True))
# model.add(layers.Flatten())
# model.add(layers.Dense(10, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(20, activation='softmax'))

In [16]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 50)          1257250   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          32128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 20)                2580      
Total params: 1,308,470
Trainable params: 1,308,470
Non-trainable params: 0
_________________________________________________________________


In [59]:
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Train on 14997 samples, validate on 5000 samples
Epoch 1/20
  720/14997 [>.............................] - ETA: 6:26 - loss: 3.0129 - acc: 0.0833

KeyboardInterrupt: 

In [36]:
# print('Shape of data tensor:', X_train.shape)
# print('Shape of label tensor:', to_categorical(np.asarray(y_train)).shape)

In [33]:
# MAX_NUM_WORDS = 20000
# MAX_SEQUENCE_LENGTH = 1000

# # tokenizing the text and creating vecotr tensor out of that
# tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# tokenizer.fit_on_texts(texts)

# word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))

# # vetorizing the text samples into 2D integer tensor 
# sequences = tokenizer.texts_to_sequences(texts)
# # pad sequences
# # MAX_SEQUENCE_LENGTH = max([len(s.split()) for s in texts])
# data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# labels = to_categorical(np.asarray(labels))
# print('Shape of data tensor:', data.shape)
# print('Shape of label tensor:', labels.shape)

In [37]:
# # split the data into a training set and a validation set
# indices = np.arange(data.shape[0])
# np.random.shuffle(indices)
# data = data[indices]
# labels = labels[indices]
# num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# x_train = data[:-num_validation_samples]
# y_train = labels[:-num_validation_samples]
# x_val = data[-num_validation_samples:]
# y_val = labels[-num_validation_samples:]

In [38]:
# # first, build index mapping words in the embeddings set
# # to their embedding vector
# print('Indexing word vectors.')

# embeddings_index = {}
# with open(os.path.join(EMBEDDING_DIR, EMBEDDING_FILE_NAME), encoding="utf8") as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype='float32')
#         embeddings_index[word] = coefs

In [39]:
# print('Found %s word vectors.' % len(embeddings_index))

In [40]:
# print('Preparing embedding matrix.')
# # prepare embedding matrix
# num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
# embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
# for word, i in word_index.items():
#     if i > MAX_NUM_WORDS:
#         continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

In [37]:
# # load pre-trained word embeddings into an Embedding layer
# # note that we set trainable = False so as to keep the embeddings fixed
# embedding_layer = Embedding(num_words,
#                             EMBEDDING_DIM,
#                             embeddings_initializer=Constant(embedding_matrix),
#                             input_length=MAX_SEQUENCE_LENGTH,
#                             trainable=False)

In [42]:
# print('Training model.')

# # train a 1D convnet with global maxpooling
# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer(sequence_input)
# x = Conv1D(128, 5, activation='relu')(embedded_sequences)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = GlobalMaxPooling1D()(x)
# x = Dense(128, activation='relu')(x)
# # preds = Dense(len(labels_index), activation='softmax')(x)
# preds = Dense(20, activation='softmax')(x)

In [41]:
# model = Model(sequence_input, preds)
# model.compile(loss='categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['acc'])

In [41]:
# model.fit(x_train, y_train,
#           batch_size=128,
#           epochs=10,
#           validation_data=(x_val, y_val))