In [60]:
import random
import os
import re
from glob import glob
from tqdm import tqdm
import numpy as np

import pandas as pd
import ast
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from PIL import Image, ImageDraw, _imaging
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras import metrics
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.model_selection import train_test_split
import tensorflow as tf
from datetime import datetime
from keras.callbacks import History, ModelCheckpoint
history = History()
from keras.models import model_from_json

In [61]:
def generateTrainData(output_height, output_width, recordsPerTrainClass, skiprows=0):
    """Load the train/test CSVs and render every drawing into an image array.

    Reads ``test_simplified.csv`` and every ``train_simplified/*.csv`` file
    relative to the current working directory.

    Parameters
    ----------
    output_height, output_width : int
        Image size forwarded to ``draw_it`` for every drawing.
    recordsPerTrainClass : int
        Number of data rows read from each train CSV file.
    skiprows : int, optional
        Number of data rows to skip at the top of each train CSV file
        before reading. The header row is always kept.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. ``train_df`` holds only the rows whose
        ``recognized`` column is True. In both frames the ``drawing`` column
        is parsed into Python objects and an ``img`` column holds the array
        returned by ``draw_it``.

    Raises
    ------
    IOError
        If no file matches ``train_simplified/*.csv``.
    """
    test_df = pd.read_csv('test_simplified.csv')

    # Sorted so the row order of the combined frame does not depend on the
    # order in which the file system happens to list the files.
    fnames = sorted(glob('train_simplified/*.csv'))
    if not fnames:
        raise IOError("no training CSV files match 'train_simplified/*.csv'")

    # Line 0 of each file is the header, so data rows start at line 1.
    # Skipping `skiprows` data rows means skipping lines 1..skiprows inclusive;
    # range(1, skiprows) would stop one row short.
    rows_to_skip = range(1, skiprows + 1) if skiprows else None

    # Read every file first and concatenate once: growing a frame inside the
    # loop copies it on every iteration.
    frames = [
        pd.read_csv(name, nrows=recordsPerTrainClass, skiprows=rows_to_skip)
        for name in fnames
    ]
    train_df = pd.concat(frames, ignore_index=True)

    # Keep only drawings that were correctly recognized. The explicit copy
    # makes the column assignments below write to an independent frame
    # instead of a view of the unfiltered one.
    train_df = train_df[train_df['recognized'] == True].copy()

    # Parse the textual drawing column into nested Python lists.
    # SECURITY NOTE(review): eval() executes arbitrary code found in the CSV.
    # It is kept here because the notebook chose it over ast.literal_eval;
    # only run this on trusted files.
    train_df['drawing'] = train_df['drawing'].apply(eval)
    test_df['drawing'] = test_df['drawing'].apply(eval)

    # Rasterise every drawing.
    train_df['img'] = train_df['drawing'].apply(lambda x: draw_it(x, output_height, output_width))
    test_df['img'] = test_df['drawing'].apply(lambda x: draw_it(x, output_height, output_width))

    return train_df, test_df
    
def showSampleImgs(df=None, n_samp=3):
    """Plot the strokes of a few randomly sampled drawings, one per subplot.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``drawing`` and ``word`` columns. When omitted, the
        notebook-level ``train_df`` is used, which keeps the original
        zero-argument call working.
    n_samp : int, optional
        Number of drawings to sample and plot (default 3).
    """
    if df is None:
        df = train_df

    sample_rows = df.sample(n_samp)
    plt.figure(figsize=(4, 4))
    for i in range(n_samp):
        row = sample_rows.iloc[i]
        plt.subplot(n_samp, 1, i + 1)
        for stroke in row['drawing']:
            plt.plot(stroke[0], stroke[1], marker='.', color='black')
        # draw_it renders the same strokes with image coordinates (y grows
        # downwards); flip the plot's y-axis so both views have the same
        # orientation.
        plt.gca().invert_yaxis()
        # Title and axis state are per subplot, so set them once here rather
        # than once per stroke.
        plt.title(row['word'])
        plt.axis('off')
    plt.show()
    
# Convert drawings to images
def draw_it(raw_strokes, output_height, output_width):
    """Rasterise a stroke list into a 2-D array.

    Parameters
    ----------
    raw_strokes : sequence
        One entry per stroke; ``stroke[0]`` holds the x coordinates and
        ``stroke[1]`` the y coordinates of consecutive points.
    output_height, output_width : int
        Size of the returned image in pixels.

    Returns
    -------
    numpy.ndarray
        Array of the resized image, with ``output_height`` rows and
        ``output_width`` columns.
    """
    # Strokes are drawn on a fixed 255x255 canvas and scaled down afterwards.
    image = Image.new("P", (255, 255))
    image_draw = ImageDraw.Draw(image)

    for stroke in raw_strokes:
        xs, ys = stroke[0], stroke[1]
        # Connect each point to the next one with a thick line segment.
        for i in range(len(xs) - 1):
            image_draw.line([xs[i], ys[i], xs[i + 1], ys[i + 1]],
                            fill=255, width=6)

    # PIL sizes are (width, height); passing (height, width) only works for
    # square outputs. Image.LANCZOS is the same filter as the older
    # Image.ANTIALIAS alias, which newer Pillow releases no longer provide.
    # NOTE(review): Pillow appears to fall back to nearest-neighbour
    # resampling for mode "P" images, so the filter may have no effect here;
    # confirm before relying on smoothed edges (mode "L" would allow it, but
    # would change the pixel values the model is trained on).
    image = image.resize((output_width, output_height), Image.LANCZOS)

    return np.array(image)

# Show an image from the dataframe
def showImgFromDf(df, index):
    """Display the ``img`` entry at positional row ``index`` of ``df``.

    The image is shown in grayscale with the row's ``word`` as the title.
    """
    row = df.iloc[index]
    plt.imshow(row['img'], cmap='gray')
    plt.title(row['word'])
    plt.show()
    
def CNN_dataPrep(train_df, img_height, img_width, test_data=None):
    """Turn the image/label frames into arrays ready for the Keras CNN.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with an ``img`` column (2-D arrays) and a ``word`` label column.
    img_height, img_width : int
        Dimensions every image is reshaped to.
    test_data : pandas.DataFrame, optional
        Frame with an ``img`` column for the unlabeled test images. When
        omitted, the notebook-level ``test_df`` is used, which keeps the
        original three-argument call working.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, imgArr_test,
        map_class_to_numeric, map_numeric_to_class)``. The first four come
        from an 80/20 split of the shuffled training data (images as float32
        arrays with a trailing channel axis of size 1, labels one-hot
        encoded). ``imgArr_test`` is the image array built from
        ``test_data``; the two dicts map class name -> integer id and back.
    """
    if test_data is None:
        test_data = test_df

    # Kept so the global `random` state is seeded exactly as before.
    random.seed(111)
    # `random.seed` does not influence pandas, so the shuffle gets its own
    # fixed seed to make it repeatable.
    train_df = train_df.sample(frac=1, random_state=111)

    img_rows, img_cols = img_height, img_width

    def _to_batch(images):
        # Stack the per-row 2-D arrays into one array and append the single
        # grayscale channel: Keras' default (channels_last) Conv2D input is
        # (samples, rows, cols, channels).
        stacked = np.stack(list(images))
        return stacked.reshape(len(images), img_rows, img_cols, 1).astype('float32')

    imgArr = _to_batch(train_df['img'])
    imgArr_test = _to_batch(test_data['img'])

    # Map class names to integer ids in order of first appearance, and back.
    labels = train_df['word']
    map_class_to_numeric = {name: idx for idx, name in enumerate(labels.unique())}
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    map_numeric_to_class = {idx: name for name, idx in map_class_to_numeric.items()}
    y_train_numeric = labels.map(map_class_to_numeric)

    # One-hot encode the integer labels.
    num_classes = len(map_class_to_numeric)
    y_train_one_hot = keras.utils.to_categorical(y_train_numeric, num_classes)

    # Fixed seed so the train/validation split is repeatable as well.
    X_train, X_test, y_train, y_test = train_test_split(
        imgArr, y_train_one_hot, test_size=0.2, random_state=111)

    return X_train, X_test, y_train, y_test, imgArr_test, map_class_to_numeric, map_numeric_to_class
    
    
def top_3_accuracy(y_true, y_pred):
    """Keras metric: top-k categorical accuracy with k fixed to 3."""
    top_k = metrics.top_k_categorical_accuracy
    return top_k(y_true, y_pred, k=3)


def baseline_conv_model(num_filters, num_classes, img_rows, img_cols):
    """Build and compile the baseline convolutional classifier.

    Architecture: three ReLU Conv2D layers (5x5, 5x5, 3x3) with max pooling
    after the first two and dropout after each, followed by two dense ReLU
    layers (1000 and 500 units) with dropout and a softmax output of
    ``num_classes`` units.

    Parameters
    ----------
    num_filters : int
        Filter count of the first conv layer; the later two use twice that.
    num_classes : int
        Number of output units.
    img_rows, img_cols : int
        Spatial size of the single-channel input images.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy loss, the Adam
        optimizer, and accuracy / top-3 accuracy / cross-entropy metrics.
    """
    in_shape = (img_rows, img_cols, 1)
    drop_rate = 0.2
    pool = (2, 2)

    model = Sequential()

    # Convolutional feature extractor.
    # NOTE(review): input_shape is passed to every Conv2D; presumably only
    # the first layer of a Sequential model needs it - confirm before removing.
    layer_stack = [
        Conv2D(num_filters, (5, 5), input_shape=in_shape, activation='relu'),
        MaxPooling2D(pool_size=pool),
        Dropout(drop_rate),
        Conv2D(num_filters * 2, (5, 5), input_shape=in_shape, activation='relu'),
        MaxPooling2D(pool_size=pool),
        Dropout(drop_rate),
        Conv2D(num_filters * 2, (3, 3), input_shape=in_shape, activation='relu'),
        Dropout(drop_rate),
        # Dense classifier head.
        Flatten(),
        Dense(1000, activation='relu'),
        Dropout(drop_rate),
        Dense(500, activation='relu'),
        Dropout(drop_rate),
        Dense(units=num_classes),
        Activation(tf.nn.softmax),
    ]
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy', top_3_accuracy, 'categorical_crossentropy'],
    )
    return model


In [62]:
%%time 
recordsPerTrainClass = 5000
skiprows = 0
img_height = img_width = 64

# Get data and transform drawing into image
train_df, test_df = generateTrainData(img_height, img_width, recordsPerTrainClass, skiprows)


CPU times: user 13min 8s, sys: 1min 48s, total: 14min 56s
Wall time: 17min 55s


In [63]:
%%time
# Transform data for CNN
X_train, X_test, y_train, y_test, imgArr_test, map_class_to_numeric, map_numeric_to_class = CNN_dataPrep(train_df, img_height, img_width)

CPU times: user 40.2 s, sys: 1min 51s, total: 2min 31s
Wall time: 3min 40s


In [55]:
# p1 = pd.read_csv('test_simplified.csv', nrows=10)
# p2 = pd.read_csv('test_simplified.csv', nrows=10)
# p1 = pd.read_csv('test_simplified.csv', nrows=10, skiprows=range(1,20))
# p2 = pd.read_csv('test_simplified.csv', nrows=10, skiprows=20)
# p1.append(p2)
# p1
# X_test

In [73]:
num_filters = 8
num_classes = train_df['word'].nunique()
# Set to False to discard the current model and start training from scratch.
continueTrain = True

# Continue with the model already in the kernel when asked to, but build a
# fresh one if none exists yet so the cell also works on a clean kernel.
if not continueTrain or 'model' not in globals():
    model = baseline_conv_model(num_filters, num_classes, img_height, img_width)

# Checkpoint: keep only the weights of the best epoch so far.
filepath = "Saved_Models/weights.best.hdf5"
checkpoint_dir = os.path.dirname(filepath)
if not os.path.isdir(checkpoint_dir):
    # The checkpoint file can only be written if its directory exists.
    os.makedirs(checkpoint_dir)
# NOTE(review): 'val_acc' is the validation-accuracy key in the Keras version
# this notebook was written for; newer releases presumably log it as
# 'val_accuracy' - confirm against the installed version.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks = [checkpoint]

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

hist = model.fit(X_train,
                 y_train,
                 validation_data=(X_test, y_test),
                 epochs=2,
                 batch_size=5000,
                 verbose=2,
                 callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 60, 60, 8)         208       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 30, 8)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 30, 8)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 26, 26, 16)        3216      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 13, 13, 16)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 13, 13, 16)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 11, 11, 16)        2320      
__________

KeyboardInterrupt: 

In [72]:
# Score the model on the held-out split; scores[1] is the first compiled
# metric after the loss, and the error is reported as 100 minus it in percent.
scores = model.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print("CNN Error: %.2f%%" % (100 - 100 * scores[1],))
# model

CNN Error: 44.04%


In [18]:
# Make predictions
# Predict class probabilities for every test image.
preds = model.predict(imgArr_test)

# Only key_id is needed in the output; select it before copying so the
# image and drawing columns are not duplicated.
outputDf = test_df[['key_id']].copy()

# Class names with spaces replaced by underscores, keyed by numeric class id.
map_numeric_to_class_space_normal = {
    class_id: class_name.replace(" ", "_")
    for class_id, class_name in map_numeric_to_class.items()
}

# Indices of the three highest-probability classes per row, best first.
# The class count comes from the prediction array itself instead of a
# hard-coded 340. (With exactly tied probabilities the lower class id is
# listed first.)
top3_ids = np.argsort(-preds, axis=1, kind='mergesort')[:, :3]

# Build the whole column at once and assign it in a single step; writing
# through outputDf['word'].at[i] is a chained assignment that pandas does
# not guarantee to propagate to the frame.
outputDf['word'] = [
    ' '.join(map_numeric_to_class_space_normal[int(class_id)] for class_id in row)
    for row in tqdm(top3_ids)
]

# Create csv
outputDf.to_csv('initial_pred.csv', index=False)

100%|██████████| 112199/112199 [00:22<00:00, 4887.68it/s]


In [70]:
# Save the model
# serialize model to JSON
# Make sure the output directory exists before writing into it.
if not os.path.isdir("Saved_Models"):
    os.makedirs("Saved_Models")

# Compute the timestamp once so the architecture (.json) and weights (.h5)
# files always share the same stem, even if the minute rolls over between
# the two writes. Format: YYYY-MM-DD_HH-MM.
model_stem = "Saved_Models/model_" + datetime.today().strftime("%Y-%m-%d_%H-%M")

# serialize model to JSON
model_json = model.to_json()
with open(model_stem + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(model_stem + ".h5")
print("Saved model to disk")

Saved model to disk


In [25]:
# model = ""

In [71]:
# Load the model back
# load json and create model
# Stem shared by the saved architecture (.json) and weights (.h5) files.
saved_model_stem = 'Saved_Models/model_2018-11-11_17-15'

# Read the architecture; the with-block closes the file even if reading fails.
with open(saved_model_stem + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(saved_model_stem + ".h5")
print("Loaded model from disk")

# Compile with the same loss, optimizer and metrics used when the model was
# built, so it can be evaluated or trained further.
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', top_3_accuracy, 'categorical_crossentropy'])
# Make the restored model the notebook's active model.
model = loaded_model

Loaded model from disk
