In [1]:
import numpy as np
import pandas as pd

In [2]:
from gensim.models import Word2Vec



In [3]:
from keras.models import Sequential, load_model
from keras.layers import Conv1D, Dense, Dropout, Activation, Embedding, GlobalMaxPool1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
max_len = 500
embedding_size = 100
dropout=0.5
epochs = 10
validation_split = 0.1
batch_size = 128
filters= 200 
kernel_size= 3
num_words = 100000

In [5]:
tokenizer = Tokenizer(num_words=num_words)

In [6]:
train = pd.read_pickle('train.pkl')

In [7]:
tokenizer.fit_on_texts(train.Content)

In [8]:
vocab_size = len(tokenizer.word_index)

In [9]:
word_vec = Word2Vec.load('./word2vec')

In [10]:
vocab_size

878856

In [11]:
train_seq = tokenizer.texts_to_sequences(train.Content)

In [12]:
X_train = pad_sequences(train_seq, maxlen=max_len, truncating='post', padding='post')

In [13]:
y_train = pd.get_dummies(train.Lable).values

In [14]:
n_class = y_train.shape[1]

In [15]:
embedding_matrix = np.zeros((vocab_size, embedding_size), dtype=np.float32)

In [16]:
unknown_count = 0
unknown_freq = {}
for word, index in tokenizer.word_index.items():
    if index >= vocab_size: 
            continue
    try:
        embedding_matrix[index, :] = word_vec.wv[word]
    except KeyError:
        unknown_freq[word] = tokenizer.word_counts[word]
        unknown_count += 1

In [17]:
def computation_graph():
    model  = Sequential()
    model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], name='Embedding_Layer'))
    model.add(Conv1D(filters= filters, 
                     kernel_size= kernel_size,
                     activation = 'relu',
                     name= '_'.join(['Convolution_1D', str(filters), str(kernel_size)])))
    model.add(GlobalMaxPool1D(name='Global_Max_Pooling'))
    model.add(Dense(units=filters, name='Dense_'+str(filters)))
    model.add(Dropout(rate=dropout, name = 'Dropout_' + str(dropout)))
    model.add(Activation('relu', name='Activation_'+str('relu')))
    model.add(Dense(units=n_class, activation='softmax', name='Dense_'+str(n_class)+'_Sigmoid'))
    return model

In [18]:
model = computation_graph()

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, None, 100)         87885600  
_________________________________________________________________
Convolution_1D_200_3 (Conv1D (None, None, 200)         60200     
_________________________________________________________________
Global_Max_Pooling (GlobalMa (None, 200)               0         
_________________________________________________________________
Dense_200 (Dense)            (None, 200)               40200     
_________________________________________________________________
Dropout_0.5 (Dropout)        (None, 200)               0         
_________________________________________________________________
Activation_relu (Activation) (None, 200)               0         
_________________________________________________________________
Dense_14_Sigmoid (Dense)     (None, 14)                2814      
Total para

In [20]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
history = model.fit(x=X_train, y=y_train,
                          validation_split = validation_split,
                          epochs=epochs,
                          batch_size=batch_size, 
                          shuffle=True)

Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
