In [35]:
from __future__ import absolute_import
from __future__ import print_function
import scipy.io
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.optimizers import Adam
from keras.utils import np_utils, generic_utils
from six.moves import range
import keras.utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re


import os
import sys

from keras.utils import to_categorical
from keras.models import Model
from keras.initializers import Constant

# --- Configuration -----------------------------------------------------------
BASE_DIR = '../'
GLOVE_DIR = os.path.join(BASE_DIR, '../glove.6B')  # directory holding the GloVe vector files
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'input')    # directory holding the labelled CSV
MAX_SEQUENCE_LENGTH = 100  # length every token-id sequence is padded to (see pad_sequences below)
MAX_NUM_WORDS = 100000     # vocabulary cap handed to the Tokenizer
EMBEDDING_DIM = 100        # width of one embedding row; the notebook loads the 100d GloVe file
VALIDATION_SPLIT = 0.2     # fraction of samples held out for validation
SPLIT_SEED = 42            # fixed seed so the train/validation split is reproducible

coded_df = pd.read_csv(os.path.join(TEXT_DATA_DIR, 'balanced_trainers_V2.csv'),
                       engine='python')

# --- Text samples and labels -------------------------------------------------
print('Processing text dataset')

# NOTE(review): `label_id` and `labels_index` are not read by any active code in
# the visible notebook (only by a commented-out cell). They are kept, with their
# original values, so that cells outside this view keep working.
label_id = range(coded_df.shape[0])
labels_index = 1

texts = coded_df['Tweet'].tolist()  # one text sample per CSV row
labels = coded_df['Y'].tolist()     # target column; presumably binary 0/1 -- TODO confirm

print('Found %s texts.' % len(texts))

# --- Vectorise the text into a 2D integer tensor -----------------------------
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# --- Train / validation split ------------------------------------------------
# Shuffle samples and labels with the same permutation. A dedicated RandomState
# makes the split repeatable without touching NumPy's global random state.
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice from an explicit boundary instead of a negative offset: with zero
# validation samples, data[:-0] would be empty and data[-0:] would be everything.
num_train_samples = data.shape[0] - num_validation_samples

x_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
x_val = data[num_train_samples:]
y_val = labels[num_train_samples:]

Processing text dataset
Found 10000 texts.
Found 26251 unique tokens.
Shape of data tensor: (10000, 100)
Shape of label tensor: (10000,)


In [36]:
import io
import json

In [37]:
import os
# Load the pre-trained GloVe vectors into a dict mapping token -> float32 vector.
# Each line of the file is parsed as: <token> <coef_1> <coef_2> ... (whitespace separated).
embeddings_index = {}
# io.open with an explicit encoding avoids depending on the platform's default
# text encoding (the GloVe text files are distributed as UTF-8), and the `with`
# block closes the file even if a line fails to parse.
with io.open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [38]:
# Persist the fitted tokenizer to disk as JSON.
tokenizer_path = '../model_objects/tokenizer_V2.json'
tokenizer_dir = os.path.dirname(tokenizer_path)
# Create the output directory when it is missing so the cell also works on a
# fresh checkout instead of failing inside open().
if not os.path.isdir(tokenizer_dir):
    os.makedirs(tokenizer_dir)

tokenizer_json = tokenizer.to_json()
with io.open(tokenizer_path, 'w', encoding='utf-8') as f:
    # NOTE(review): Keras documents to_json() as already returning a JSON string,
    # so json.dumps() stores it as a JSON-encoded *string* (double encoding).
    # This is kept on purpose so the on-disk format does not change: a reader is
    # expected to json.load() the file first and pass the resulting string to
    # tokenizer_from_json() -- confirm against the loading code.
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [39]:
print('Found %s word vectors.' % len(embeddings_index))
print('Preparing embedding matrix.')

# Row i of the matrix holds the GloVe vector of the token whose tokenizer id is i.
# The row count is capped at MAX_NUM_WORDS; the "+ 1" leaves room for id 0.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for token, token_id in word_index.items():
    # Ids at or beyond the cap have no row; tokens missing from GloVe keep
    # their all-zero row.
    if token_id < MAX_NUM_WORDS and token in embeddings_index:
        embedding_matrix[token_id] = embeddings_index[token]

# The matrix is loaded into a frozen (trainable=False) Embedding layer in the
# model-definition cell, so the pre-trained vectors stay fixed during training.

print('Training model.')

Found 400000 word vectors.
Preparing embedding matrix.
Training model.


In [40]:
# Embedding lookup initialised from the pre-computed matrix and frozen
# (trainable=False) so training does not update the pre-trained vectors.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Binary classifier: embeddings -> 32-unit GRU with dropout -> single sigmoid output.
model = Sequential([
    embedding_layer,
    GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [41]:
# Training hyper-parameters for this run.
BATCH_SIZE = 128
EPOCHS = 20

# Train on the training split and evaluate on the held-out split after every
# epoch; verbose=2 logs one summary line per epoch. The call is left as the
# cell's last expression so the returned History object is still displayed.
model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
    verbose=2,
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
 - 5s - loss: 0.6081 - accuracy: 0.6647 - val_loss: 0.5812 - val_accuracy: 0.6765
Epoch 2/20
 - 5s - loss: 0.4292 - accuracy: 0.8303 - val_loss: 0.5137 - val_accuracy: 0.7510
Epoch 3/20
 - 4s - loss: 0.3951 - accuracy: 0.8429 - val_loss: 0.4928 - val_accuracy: 0.7700
Epoch 4/20
 - 6s - loss: 0.3788 - accuracy: 0.8545 - val_loss: 0.4922 - val_accuracy: 0.7725
Epoch 5/20
 - 5s - loss: 0.3729 - accuracy: 0.8537 - val_loss: 0.4910 - val_accuracy: 0.7690
Epoch 6/20
 - 4s - loss: 0.3645 - accuracy: 0.8577 - val_loss: 0.4798 - val_accuracy: 0.7770
Epoch 7/20
 - 4s - loss: 0.3561 - accuracy: 0.8605 - val_loss: 0.4721 - val_accuracy: 0.7860
Epoch 8/20
 - 4s - loss: 0.3558 - accuracy: 0.8634 - val_loss: 0.4684 - val_accuracy: 0.7870
Epoch 9/20
 - 4s - loss: 0.3438 - accuracy: 0.8684 - val_loss: 0.4649 - val_accuracy: 0.7980
Epoch 10/20
 - 4s - loss: 0.3437 - accuracy: 0.8673 - val_loss: 0.4593 - val_accuracy: 0.7970
Epoch 11/20
 - 4s - l

<keras.callbacks.callbacks.History at 0x13b938410>

In [42]:

# Disabled alternative: a functional-API Conv1D classifier, kept for reference only.
# NOTE(review): it cannot be re-enabled as written -- `embedded_sequences` and
# `sequence_input` are not defined anywhere in the visible notebook, and
# `labels_index` is assigned the integer 1 in the data-preparation cell, so the
# final softmax layer would have a single unit.
#x = Conv1D(128, 5, activation='relu')(embedded_sequences)
#x = MaxPooling1D(5)(x)
#x = Conv1D(128, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)
#x = Conv1D(128, 5, activation='relu')(x)
#x = GlobalMaxPooling1D()(x)
#x = Dense(128,activation='relu')(x)
#preds = Dense(labels_index, activation='softmax')(x)

#model = Model(sequence_input, preds)
#model.compile(loss='binary_crossentropy',
              #optimizer='rmsprop',
#              optimizer = Adam(0.01),
#             metrics=['acc'])



In [43]:
# Persist the trained model: architecture as JSON, weights as HDF5.
model_dir = "../model_objects"
# Create the output directory when it is missing, replacing the manual
# `%mkdir ../model_objects` step so the cell works on a fresh checkout.
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# serialize model to JSON
model_json = model.to_json()
with open("../model_objects/flood_classifier_model_V2.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
# NOTE(review): this filename spells "classifer" (missing "i"), unlike the JSON
# file above. It is left unchanged because loading code may depend on the exact
# path -- confirm before renaming.
model.save_weights("../model_objects/flood_classifer_model_V2.h5")
print("Saved model to disk")
 

Saved model to disk


In [44]:
# Sanity check: show the number of rows allocated for the embedding matrix
# (num_words was computed as min(MAX_NUM_WORDS, len(word_index) + 1)).
print(num_words)

26252
