In [20]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation,Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from keras import metrics
from keras.layers.merge import concatenate
import sys
sys.path.append('../')
from Utilities.model_visualization import model_to_png

from PIL import Image # used for loading images
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt

import os # used for navigating to image path

from keras.layers import Input

import cv2
from keras.layers import concatenate
from keras.models import Model
from keras.applications import VGG16

import pandas as pd
from glob import glob
import tensorflow as tf

from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from keras.callbacks import EarlyStopping

import pickle
from scipy import misc
from keras import optimizers

from keras.utils import plot_model

from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import time
import sys

<h1>Loading Data</h1>

In [21]:
df = pd.read_pickle('processed_df.pkl')
print(df.iloc[0])

text          Sunny days sweeping clouds away #Irma #PublicP...
text_info                                           informative
image_path    data_image/hurricane_irma/18_9_2017/9098755908...
damage                                              mild_damage
Name: 0, dtype: object


In [22]:
split_pos = int(df['text'].count()*0.8)
train = df[:split_pos]
test = df[split_pos:]

<h1>CNN Text Model</h1>

In [23]:
dataColumn = 'text'
labelColumn = 'text_info'

tags = train[labelColumn]
texts = train[dataColumn]

tags_Y = test[labelColumn]
texts_Y = test[dataColumn]

In [24]:
print(train.shape)

(320, 4)


In [25]:
num_max = 1000
# preprocess
le = LabelEncoder()
tags = le.fit_transform(tags.astype(str))
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)
mat_texts = tok.texts_to_matrix(texts,mode='count')
print(tags[:5])
print(mat_texts[:5])
print(tags.shape,mat_texts.shape)


# For testing data
le_Y = LabelEncoder()
tags_Y = le_Y.fit_transform(tags_Y.astype(str))
tok_Y = Tokenizer(num_words=num_max)
tok_Y.fit_on_texts(texts_Y)
mat_texts_Y = tok.texts_to_matrix(texts_Y,mode='count')

[0 0 1 0 0]
[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
(320,) (320, 1000)


In [26]:
# for cnn preproces
max_len = 100
cnn_texts_seq = tok.texts_to_sequences(texts)
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_len)

# For testing data
cnn_texts_seq_Y = tok.texts_to_sequences(texts_Y)
cnn_texts_mat_Y = sequence.pad_sequences(cnn_texts_seq_Y,maxlen=max_len)

In [27]:
filepath = "text_weights.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max', period=1)
early_stopping = EarlyStopping(monitor='val_acc', min_delta=0, patience=4, verbose=1, mode='max')
callbacks_list = [checkpoint, early_stopping]

In [28]:
def get_hybrid_model():    # Pre Trained Embeddings
    
    # load the whole embedding into memory
    embeddings_index = dict()
    f = open('Embeddings/glove.6B.100d.txt', encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))
    
    # create a weight matrix for words in training docs
    embedding_matrix = np.zeros((len(tok.word_index) + 1, 100))
    for word, i in tok.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    #text classifier
    inputs = Input(shape=(100,))
    e = Embedding(len(tok.word_index) + 1,
                  100, 
                  weights=[embedding_matrix],
                  input_length=max_len, 
                  trainable=False)(inputs)
    x = Dropout(0.2)(e)
    x = Conv1D(128,
               3,
               padding='valid',
               activation='relu',
               strides=1)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    hybrid_link = Dense(32, activation='relu', name='hybrid_link')(x)
    x = Dense(1, activation='sigmoid', name='Text_Classifier')(hybrid_link)
    text_classifier = x
    #image classifier
    IMAGE_SIZE = [224, 224]  # we will keep the image size as (64,64). You can increase the size for better results. 
    vgg = VGG16(input_shape = (224, 224, 3), weights = None, include_top = True)  # input_shape = (64,64,3) as required by VGG
    x = (vgg.layers[-2].output)
    image_model = Dense(3, activation = 'softmax',name='Hybrid_Classifier')(x)  # adding the output layer with softmax function as this is a multi label classification problem.
    #hybrid model
    concatenate_layer = concatenate([image_model, hybrid_link]) 
    hybrid = Dense(4, activation='softmax')(concatenate_layer)
    model = Model(inputs=[vgg.input, inputs], outputs=[hybrid,text_classifier])
    return model

In [29]:
model = get_hybrid_model()

model.compile(loss='binary_crossentropy',
                       optimizer= optimizers.adam(lr=0.00008),
                       metrics=['acc',metrics.binary_accuracy])
model.summary()
plot_model(model, to_file='multiple_inputs_outputs.png')


Found 400000 word vectors.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_4[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]               
__________________________________________________________________________________

<h1>CNN Image</h1>

In [30]:
IMG_SIZE =224
dataset_dir = 'H:/FYP DATASETS/FYP DATASETS/Crisis/'

def load_img(img):
    path = os.path.join(dataset_dir, img)
    rows=224
    columns=224
    img= cv2.resize(cv2.imread(path,cv2.IMREAD_COLOR),(rows,columns),interpolation=cv2.INTER_CUBIC)
    return img

In [31]:
for index, row in train.iterrows():
    train.at[index,'image_path'] = load_img(row['image_path'])

In [32]:
def encode_label(damage):
    # integer encode
    damage = np.array(damage)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(damage)
    # binary encode
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded

In [33]:
y = encode_label(train.iloc[:]['damage'])
print(train.iloc[0])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


text          Sunny days sweeping clouds away #Irma #PublicP...
text_info                                           informative
image_path    [[[222, 122, 4], [219, 125, 6], [221, 131, 12]...
damage                                              mild_damage
Name: 0, dtype: object


In [34]:
print(train.damage.unique())

['mild_damage' 'little_or_no_damage' 'severe_damage'
 'dont_know_or_cant_judge']


In [35]:
train_images = train['image_path'].tolist()
# no need to convert y to list as it is 1 dim encoding takes care of it
train_images = np.array(train_images)
train_text = np.array(train['text'].tolist())

In [36]:
print(cnn_texts_mat.shape)

(320, 100)


In [37]:

history = model.fit(x=[train_images,cnn_texts_mat], y=[y,tags],
                           epochs=40,
                           batch_size=25,
                           validation_split=0.2,
                           shuffle=True,
                           verbose=1)

Train on 256 samples, validate on 64 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40


Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40


Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
model.save_weights('hybrid_only.h5')