In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation,Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from keras import metrics
from keras.layers.merge import concatenate
import sys
sys.path.append('../')
from Utilities.model_visualization import model_to_png

from PIL import Image # used for loading images
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt

import os # used for navigating to image path

from keras.layers import Input

import cv2
from keras.layers import concatenate
from keras.models import Model
from keras.applications import VGG16

import pandas as pd
from glob import glob
import tensorflow as tf

from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from keras.callbacks import EarlyStopping

import pickle
from scipy import misc
from keras import optimizers

from keras.utils import plot_model

from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import time

%matplotlib inline

Using TensorFlow backend.


<h1>Loading Data</h1>

In [2]:
# Start a wall-clock timer for the data-loading / preprocessing phase.
start = time.time()
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code -- only load 'dataset.pkl' when it comes from a trusted source.
df = pd.read_pickle('dataset.pkl')
# Print the first record as a quick look at the available columns.
print(df.iloc[0])

text          HARVEY AFTER DONNA KISSED HIM: https://t.co/mz...
text_info                                       not_informative
image_path    data_image/hurricane_harvey/14_9_2017/90825435...
damage                                               irrelevant
Name: 0, dtype: object


In [3]:
# Positional (unshuffled) 80/20 split: the first 80% of the rows are used for
# training, the remaining rows are held out for testing.
# len(df) counts every row, whereas df['text'].count() silently skips rows
# whose text is NaN and would shift the split point.
split_pos = int(len(df) * 0.8)
# Take explicit copies: `train` is modified in place further down (its
# 'image_path' column is overwritten with pixel arrays), and writing into a
# slice of `df` triggers SettingWithCopyWarning / ambiguous write-through.
train = df[:split_pos].copy()
test = df[split_pos:].copy()

<h1>Text Model</h1>

In [4]:
# Names of the column holding the raw text and of the column holding its label.
dataColumn, labelColumn = 'text', 'text_info'

# Training split: raw texts and their labels.
texts, tags = train[dataColumn], train[labelColumn]

# Test split: raw texts and their labels (suffix _Y marks the test data).
texts_Y, tags_Y = test[dataColumn], test[labelColumn]

In [5]:
num_max = 1000
# preprocess
# The label encoder and the tokenizer are fitted on the TRAINING split only.
le = LabelEncoder()
tags = le.fit_transform(tags.astype(str))
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)
mat_texts = tok.texts_to_matrix(texts,mode='count')

# For testing data
# Reuse the encoder fitted on the training labels so that a given label maps to
# the same integer in both splits. A second encoder fitted on the test labels
# could assign different integers whenever the two splits do not contain the
# same set of labels. `transform` raises ValueError for a label that was never
# seen during fitting, which makes such a mismatch explicit.
tags_Y = le.transform(tags_Y.astype(str))
mat_texts_Y = tok.texts_to_matrix(texts_Y,mode='count')

# Kept as aliases so any code that still refers to these names keeps working.
# No encoder or tokenizer is fitted on the test split any more.
le_Y = le
tok_Y = tok

In [6]:
# CNN preprocessing: convert every text to a sequence of word indices and
# pad / truncate it to a fixed length of `max_len` entries.
max_len = 100

# Word-index sequences for the training texts and for the test texts; both use
# the same tokenizer `tok`.
cnn_texts_seq = tok.texts_to_sequences(texts)
cnn_texts_seq_Y = tok.texts_to_sequences(texts_Y)

# Fixed-length matrices fed to the text branch of the network.
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq, maxlen=max_len)
cnn_texts_mat_Y = sequence.pad_sequences(cnn_texts_seq_Y, maxlen=max_len)

In [7]:
# Callbacks for a text model: checkpoint the best epoch and stop early when
# the validation accuracy stops improving.
filepath = "text_weights.h5"

# Write the model to `filepath` only when 'val_acc' improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    period=1,
    verbose=1,
)

# Stop training after 4 consecutive epochs without an improvement in 'val_acc'.
early_stopping = EarlyStopping(
    monitor='val_acc',
    mode='max',
    min_delta=0,
    patience=4,
    verbose=1,
)

callbacks_list = [checkpoint, early_stopping]

In [8]:
def get_hybrid_model():    # Pre Trained Embeddings
    """Build the two-output image/text model, initialised from hybrid weights.

    Steps performed:
      1. Read the GloVe vectors from 'Embeddings/glove.6B.100d.txt' and build a
         frozen embedding matrix for the vocabulary of the global tokenizer
         ``tok``.
      2. Build a 1-D CNN text branch ending in a single sigmoid unit
         ('Text_Classifier') and a VGG16 image branch ending in a 4-way
         softmax ('vgg_output').
      3. Join both branches in a 4-way softmax 'Hybrid_Classifier' model and
         load 'initial_hybrid.h5' into that joint model.
      4. Build and return a second model over the same layers whose outputs
         are the two branch heads instead of the joint head.

    Reads the module-level names ``tok`` and ``max_len``. Prints the number of
    word vectors and the model summary, and writes 'divided models.png'.

    Returns:
        keras Model with inputs ``[image, text_sequence]`` and outputs
        ``[vgg_output, Text_Classifier]``.
    """
    # load the whole embedding into memory
    embeddings_index = dict()
    # Context manager: the file is closed even if a line fails to parse.
    with open('Embeddings/glove.6B.100d.txt', encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    # Row i holds the vector of the word with tokenizer index i; words without
    # a GloVe vector keep an all-zero row.
    embedding_matrix = np.zeros((len(tok.word_index) + 1, 100))
    for word, i in tok.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    #text classifier
    # Sequence length comes from max_len so it always agrees with the
    # input_length given to the Embedding layer below.
    inputs = Input(shape=(max_len,))
    e = Embedding(len(tok.word_index) + 1,
                  100,
                  weights=[embedding_matrix],
                  input_length=max_len,
                  trainable=False)(inputs)
    x = Dropout(0.2)(e)
    x = Conv1D(128,
               3,
               padding='valid',
               activation='relu',
               strides=1)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    # 32-unit layer whose activations are shared with the hybrid head.
    hybrid_link = Dense(32, activation='relu', name='hybrid_link')(x)
    x = Dense(1, activation='sigmoid', name='Text_Classifier')(hybrid_link)
    text_classifier = x

    #image classifier
    # VGG16 on 224x224 RGB input, no pretrained weights (weights=None).
    vgg = VGG16(input_shape = (224, 224, 3), weights = None, include_top = True)
    # Output of the second-to-last VGG16 layer, i.e. the network without its
    # own final prediction layer.
    x = (vgg.layers[-2].output)
    # Replacement output layer: softmax over 4 classes.
    image_model = Dense(4, name='vgg_output', activation = 'softmax')(x)

    #hybrid model
    concatenate_layer = concatenate([image_model, hybrid_link])
    hybrid = Dense(4, activation='softmax', name='Hybrid_Classifier')(concatenate_layer)
    model = Model(inputs=[vgg.input, inputs], outputs=[hybrid,text_classifier])

    # Initialise every layer from the previously trained hybrid model.
    model.load_weights('initial_hybrid.h5')
    # NOTE(review): these pops only touch the layer list of the temporary
    # joint model; the model returned below is rebuilt from the tensors
    # directly. Presumably they only affect the summary printed next --
    # confirm before removing.
    model.layers.pop()
    model.layers.pop()
    model.layers.pop()
    model.layers.pop()
    model.summary()

    # Same layers (and therefore the loaded weights), but exposing the two
    # branch heads as outputs instead of the joint hybrid head.
    model = Model(inputs=[vgg.input, inputs], outputs=[image_model,text_classifier])
    plot_model(model, to_file='divided models.png')
    return model

In [9]:
# Build the two-output model (this also prints the GloVe vector count and a
# summary from inside get_hybrid_model).
hybrid_model = get_hybrid_model()
# Print the layer summary of the model that was actually returned.
hybrid_model.summary()


Found 400000 word vectors.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_2[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]               
__________________________________________________________________________________

In [10]:
def get_vgg_model():
    """Build an image-only classifier on top of VGG16.

    The network is VGG16 on 224x224 RGB input without pretrained weights
    (weights=None). The output of its second-to-last layer feeds a new 4-way
    softmax layer named 'Hybrid_Classifier'.

    Returns:
        keras Model mapping the VGG16 image input to the 4-way softmax output.
    """
    #image classifier
    vgg = VGG16(input_shape = (224, 224, 3), weights = None, include_top = True)
    # Skip VGG16's own final prediction layer and attach a 4-class softmax.
    x = (vgg.layers[-2].output)
    image_model = Dense(4, activation = 'softmax',name='Hybrid_Classifier')(x)
    model = Model(inputs=[vgg.input], outputs=[image_model])
    return model

In [11]:


# One loss per model output, in output order: categorical cross-entropy for the
# 4-way softmax image head, binary cross-entropy for the sigmoid text head.
# `optimizers.Adam` is the canonical class name; the lowercase `adam` alias is
# only provided by older standalone Keras releases.
# NOTE(review): the metrics list is applied to both outputs, so
# categorical_accuracy is also reported for the single-unit text head, where
# it is not informative.
hybrid_model.compile(loss=['categorical_crossentropy','binary_crossentropy'],
                       optimizer= optimizers.Adam(lr=0.0001),
                       metrics=['accuracy',metrics.mae, metrics.categorical_accuracy])
# hybrid_model.summary()

<h1>CNN Image</h1>

In [12]:
# Side length (pixels) of the square images fed to the network.
IMG_SIZE =224
# Root directory that the relative image paths of the dataset are joined onto.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
dataset_dir = 'H:/FYP DATASETS/FYP DATASETS/Crisis/'

def load_img(img):
    """Read one image from the dataset directory and resize it.

    NOTE(review): this definition shadows the `load_img` imported from
    keras.preprocessing.image at the top of the notebook.

    Args:
        img: path of the image file, relative to ``dataset_dir``.

    Returns:
        The image as read by OpenCV in colour mode, resized to
        IMG_SIZE x IMG_SIZE with bicubic interpolation.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (missing, unreadable
            or not a supported image format).
    """
    path = os.path.join(dataset_dir, img)
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    # cv2.imread signals failure by returning None instead of raising; without
    # this check the error only surfaces as an opaque assertion in cv2.resize.
    if image is None:
        raise FileNotFoundError('Could not read image: %s' % path)
    return cv2.resize(image, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_CUBIC)

In [13]:
# Replace every relative image path in `train` with the loaded, resized image.
for row_label, record in train.iterrows():
    image_array = load_img(record['image_path'])
    train.at[row_label, 'image_path'] = image_array

In [14]:
def encode_label(damage):
    """One-hot encode a 1-D collection of class labels.

    The distinct labels are sorted; column ``j`` of the result corresponds to
    the ``j``-th label in that sorted order. This is the same layout that
    LabelEncoder followed by a dense OneHotEncoder produces, but it relies on
    numpy only, so it does not depend on the scikit-learn version (the
    ``sparse=False`` argument of OneHotEncoder was deprecated and later
    removed).

    Args:
        damage: array-like of labels (e.g. a pandas Series of strings).

    Returns:
        numpy.ndarray of dtype float64 and shape (n_samples, n_classes) with
        exactly one 1.0 per row.
    """
    labels = np.asarray(damage).ravel()
    # integer encode: `classes` is sorted, `integer_encoded[i]` is the position
    # of labels[i] inside `classes`.
    classes, integer_encoded = np.unique(labels, return_inverse=True)
    integer_encoded = integer_encoded.reshape(-1)
    # binary encode: pick, for every sample, the identity-matrix row of its
    # class index.
    return np.eye(len(classes))[integer_encoded]

In [15]:
# One-hot targets for the image output, built from the 'damage' column.
y = encode_label(train['damage'])
# Show the first training record.
print(train.iloc[0])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


text          HARVEY AFTER DONNA KISSED HIM: https://t.co/mz...
text_info                                       not_informative
image_path    [[[141, 150, 137], [141, 150, 137], [141, 150,...
damage                                               irrelevant
Name: 0, dtype: object


In [16]:
# List the distinct damage labels present in the training split.
print(train['damage'].unique())

['irrelevant' 'severe_damage' 'mild_damage' 'little_or_no_damage']


In [17]:
# Stack the per-row image arrays into one numpy array for the image input.
# y needs no such conversion: encode_label already returned an array.
train_images = np.array(train['image_path'].tolist())
# Raw training texts as a numpy array.
train_text = np.array(train['text'].tolist())

In [18]:
# Callbacks for the two-output model: both watch the validation accuracy of
# the 'vgg_output' head.
filepath = "divided_checkpoints_1.h5"

# Write the model to `filepath` only when the monitored value improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_vgg_output_acc',
    mode='max',
    save_best_only=True,
    period=1,
    verbose=1,
)

# Stop after 3 consecutive epochs without improvement.
early_stopping = EarlyStopping(
    monitor='val_vgg_output_acc',
    mode='max',
    min_delta=0,
    patience=3,
    verbose=1,
)

callbacks_list = [checkpoint, early_stopping]

# Seconds elapsed since `start` was last set.
load_time = time.time() - start

In [19]:
# Train the two-output model and measure how long the fit takes.
start = time.time()
hybrid_history = hybrid_model.fit(
    # Inputs: images for the VGG branch, padded word indices for the text branch.
    x=[train_images, cnn_texts_mat],
    # Targets: one-hot damage classes and integer-encoded text labels.
    y=[y, tags],
    batch_size=20,
    epochs=40,
    # Hold out the last 25% of the samples for validation.
    validation_split=0.25,
    shuffle=True,
    callbacks=callbacks_list,
    verbose=1,
)
hybrid_time = time.time() - start


Train on 10807 samples, validate on 3603 samples
Epoch 1/40

Epoch 00001: val_vgg_output_acc improved from -inf to 0.79961, saving model to divided_checkpoints_1.h5
Epoch 2/40

Epoch 00002: val_vgg_output_acc did not improve from 0.79961
Epoch 3/40

Epoch 00003: val_vgg_output_acc improved from 0.79961 to 0.80377, saving model to divided_checkpoints_1.h5
Epoch 4/40

Epoch 00004: val_vgg_output_acc improved from 0.80377 to 0.81155, saving model to divided_checkpoints_1.h5
Epoch 5/40

Epoch 00005: val_vgg_output_acc did not improve from 0.81155
Epoch 6/40

Epoch 00006: val_vgg_output_acc did not improve from 0.81155
Epoch 7/40

Epoch 00007: val_vgg_output_acc did not improve from 0.81155
Epoch 00007: early stopping


In [20]:
# start = time.time()

# vgg_history = vgg_model.fit(x=[train_images], y=[y],
#                            epochs=40,
#                            batch_size=10,
#                            validation_split=0.2,
#                            shuffle=True,
#                            verbose=1)
# vgg_time = time.time() - start

In [21]:
# Report the measured durations (seconds).
for label, seconds in (('Hybrid train time ', hybrid_time), ('load time ', load_time)):
    print(label + str(seconds))

Hybrid train time 2057.2990221977234
load time 759.0337851047516


In [22]:
# Persist the weights of the trained two-output model to 'divided.h5'.
hybrid_model.save_weights('divided.h5')
#vgg_model.save_weights('vgg.h5')