In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): only NumPy is seeded here; no seed is set for the Keras backend in this cell.
np.random.seed(2)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

import os
# Print the entries of the "../input" directory (presumably a sanity check that the data files are present).
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

*Load data*

In [None]:
# Read the training and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [None]:
# Separate the target column (Y_train) from the remaining feature columns (X_train).
Y_train = train["label"]
X_train = train.drop(columns=["label"])

In [None]:
# Normalization: divide by 255 so values move from the [0..255] range to [0..1].
# Applied identically to the training features and to the test frame.
X_train = X_train / 255.0
test = test / 255.0

In [None]:
# Reshape every flat row into a single-channel image: (n_samples, 28, 28, 1).
# np.asarray accepts both the DataFrame and an already-converted ndarray, so
# re-running this cell does not fail on a missing `.values` attribute.
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)

In [None]:
# Per-sample shape (every axis after the first); reused as the model's input shape.
input_shape = tuple(X_train.shape[1:])
# Show it next to the full array shape.
print(input_shape, X_train.shape)

In [None]:
# Print the shape of a single sample, then render the sample at index 5 (channel 0).
print(X_train[0].shape)
plt.imshow(X_train[5, :, :, 0])
plt.show()

*Model building*:
> I tried two classical architectures, namely:
1. Conv-Pool-Conv-Pool
2. Conv-Conv-Pool-Conv-Conv-Pool
Of the two, it was the second that gave me a satisfactory result.
After that, it was enough to vary certain parameters (number of filters, size of the filters, the stride, ...) by following the usual rules to arrive at the final result.

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Conv-Conv-Pool-Conv-Conv-Pool network followed by a dense classifier head.
cnn = Sequential()

# Block 1: two 3x3 convolutions with 16 filters ('same' padding), 2x2 max-pooling, dropout.
cnn.add(Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same', input_shape=input_shape))
cnn.add(Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'))
cnn.add(MaxPooling2D(pool_size=(2, 2), strides=2))
cnn.add(Dropout(0.2))

# Block 2: two 3x3 convolutions with 32 filters (default padding), 2x2 max-pooling, dropout.
cnn.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
cnn.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
cnn.add(MaxPooling2D(pool_size=(2, 2), strides=2))
cnn.add(Dropout(0.3))

# Classifier head: flatten, 512-unit hidden layer with dropout, 10-way softmax output.
cnn.add(Flatten())
cnn.add(Dense(512, activation='relu'))
cnn.add(Dropout(0.5))
cnn.add(Dense(10, activation='softmax'))

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (labels are passed as integer class ids, not one-hot vectors), accuracy metric.
cnn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 15 epochs with batches of 256 samples; validation_split=0.25 asks
# Keras to hold out a quarter of the training data for validation.
history = cnn.fit(
    x=X_train,
    y=Y_train,
    batch_size=256,
    epochs=15,
    validation_split=0.25,
)

In [None]:
# Plot training vs. validation accuracy per epoch.
# Older Keras releases record the metric as 'acc'/'val_acc', newer ones as
# 'accuracy'/'val_accuracy'; use whichever key this run produced instead of
# hard-coding one and hitting a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# The second curve comes from the validation split, not from the test set.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Plot training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# 'val_loss' is computed on the validation split, so label it as such
# (the original legend called it "test").
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Reshape the test rows into single-channel images: (n_samples, 28, 28, 1).
# np.asarray works on both the DataFrame and an already-reshaped ndarray, so
# the cell can be re-run without failing on a missing `.values` attribute.
test = np.asarray(test).reshape(-1, 28, 28, 1)

In [None]:
# Run the trained model on the test images; the final layer is a 10-way softmax,
# so each output row holds one score per class.
test_pred = cnn.predict(test)
# Reduce each row to the index of its highest score, i.e. the predicted class.
test_pred_class = test_pred.argmax(axis=1)

In [None]:
# Display the predicted class indices as the cell output.
test_pred_class

In [None]:
# Build the submission table: 1-based ImageId alongside the predicted Label.
# The ids are derived from the number of predictions rather than a hard-coded
# 28000, so a different-sized test set cannot produce misaligned or NaN rows.
submission = pd.DataFrame({
    "ImageId": np.arange(1, len(test_pred_class) + 1),
    "Label": test_pred_class,
})
# Write without the DataFrame index so the file contains only the two columns.
submission.to_csv('submission.csv', index=False)

**Future improvements**
1. Add dropout and BatchNormalization
2. Change the filter size to (5,5)