# **Digit Recognizer Challenge solved with TensorFlow CNNs**

In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow.keras import layers as ly
import os

# Seed Python's `random`, NumPy and TensorFlow in a single call.
# `tf.random.set_seed` alone only seeds TensorFlow, while this notebook also
# relies on NumPy / `random` driven steps (train/validation split, generator
# shuffling and augmentation, random sample picks), which would otherwise
# change on every Restart & Run All.
tf.keras.utils.set_random_seed(10)

# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




### Train Dataframe Overview: 

In [None]:
# Load the labelled training set and print a short summary of it.
traindf = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

# Min/max are taken over the whole frame, i.e. the `label` column is included.
train_values = traindf.values
sorted_labels = traindf.sort_values("label").label.unique()
print(f'Dataframe shape: {traindf.shape}\nMin/Max Values: {np.min(train_values)} - {np.max(train_values)}\nLabels:{sorted_labels}\n')

# Rows per label, counted through the non-null entries of `pixel0`.
print('Number of samples for each label:')
sample_count = traindf.groupby('label').count()['pixel0'].to_dict()
for label, count in sample_count.items():
    print(f'  Label {label} - {count}')

print('\nDataframe head:')
traindf.head()



### Dataframe overview for test.csv, which will be used for the submission results:

In [None]:
# Load the competition images that the submission predictions are made for.
subdf = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

sub_values = subdf.values
print(f'Dataframe shape: {subdf.shape}\nMin/Max Values: {np.min(sub_values)} - {np.max(sub_values)}\n\nDataframe head:')

subdf.head()

---

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

n = 5
print(f'Showing the {n} first images of train dataframe.')

# Column 0 is shown as the label; the remaining columns are reshaped to a 28x28 image.
for row_pos in range(n):
    digit_label = traindf.iloc[row_pos, 0]
    pixels = traindf.iloc[row_pos, 1:].values.reshape(28, 28)

    plt.title(f'Label: {digit_label}')
    plt.imshow(pixels, cmap='Greys')
    plt.axis('off')
    plt.show()


If we want a model that recognizes these digits reliably, it must also be able to predict digits that are rotated.

**Data augmentation** is a process in which we alter several characteristics of the images in order to increase the amount of input data available for each label, which in turn gives the model we are training more examples to learn from.

For this data augmentation process we will use `ImageDataGenerator` together with `flow_from_directory`. Because of how these methods work, we first need to convert our data into image files organized in one subdirectory per class.

In the code below we split the data into training and validation sets, retrieve the unique labels that occur in each set, create one folder per label, and fill those folders with image files named after their row index, under the training and validation paths respectively.

In [None]:
from sklearn.model_selection import train_test_split

# Index/label lookup table (not used by the cells shown in this notebook).
dataset = pd.DataFrame({'path':traindf.index, 'label':traindf.label})

# Hold out 20% of the rows for validation.
# - random_state makes the split identical on every run.
# - stratify keeps the label proportions in both subsets, so every digit is
#   present on both sides and the two class lists below are identical.
xtrain, xtest, ytrain, ytest = train_test_split(
    traindf.iloc[:, 1:],
    traindf.label,
    test_size=0.2,
    random_state=10,
    stratify=traindf.label,
)

# Class names (as strings, sorted) that are later passed to flow_from_directory.
train_classes = sorted(str(lbl) for lbl in ytrain.unique())
test_classes = sorted(str(lbl) for lbl in ytest.unique())

for folder, features, labels in [('images_for_training', xtrain, ytrain),
                                 ('images_for_test', xtest, ytest)]:

    # One sub-directory per label present in this subset.
    for lbl in labels.unique():
        os.makedirs(f'./{folder}/{lbl}', exist_ok=True)

    # Write every row as ./<folder>/<label>/<row index>.png
    for idx, row in features.iterrows():
        img = row.values.reshape(28, 28)
        # vmin/vmax pin the grey scale to a fixed 0-255 range. Without them
        # imsave stretches each image to its own min/max, so the saved
        # intensities would not match the raw pixel values that are divided
        # by 255 at prediction time.
        # NOTE(review): assumes pixel values lie in 0-255 - confirm against the data.
        plt.imsave(
            fname=f'./{folder}/{labels.loc[idx]}/{idx}.png',
            arr=img,
            cmap='gray',
            vmin=0,
            vmax=255,
        )
    

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training generator: scales pixel values by 1/255 and applies random
# rotations (up to 30 degrees) and horizontal/vertical shifts (up to 20%).
datagen = ImageDataGenerator(
    rescale=1/255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2
)

# Validation generator: rescale only. Augmenting the validation images would
# make val_loss / val_accuracy (which drive checkpointing and learning-rate
# scheduling during training) noisy and unrepresentative of the unaugmented
# images the model is finally asked to predict.
val_datagen = ImageDataGenerator(rescale=1/255)

train_data = datagen.flow_from_directory(
    directory='/kaggle/working/images_for_training/',
    target_size=(28,28),
    color_mode="grayscale",
    class_mode="categorical",
    classes=train_classes,
    batch_size=32
)

test_data = val_datagen.flow_from_directory(
    directory='/kaggle/working/images_for_test/',
    target_size=(28,28),
    color_mode="grayscale",
    class_mode="categorical",
    classes=test_classes,
    batch_size=32
)

In [None]:
# This is just a simple and quick test to check the ImageDataGenerator and flow_from_directory output

import random

# Pull one batch of (images, one-hot labels). The built-in next() goes through
# the iterator protocol, which works whether or not the iterator class also
# exposes a `.next()` method (newer Keras releases do not).
img, txt = next(test_data)

# Choose an index that is valid for this batch instead of assuming 32 items;
# a batch can hold fewer images than batch_size.
i = random.randrange(len(img))
plt.axis('off')
print(f'{ txt[i] } -> label = { np.argmax(txt[i]) }')
plt.imshow(img[i], cmap='Greys')

In [None]:
# Defining some util objects for our context

def plot_loss_curves(history):
  """Plot training vs. validation curves, one figure for loss and one for accuracy.

  Args:
    history: object whose ``.history`` mapping holds the per-epoch lists
      'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
  """
  logs = history.history

  # Read all four series up front so a missing key fails before anything is drawn.
  series = {key: logs[key] for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy')}
  epochs = range(len(logs['loss']))

  # (figure title, training key, training legend label, validation key)
  panels = [
      ('Loss', 'loss', 'training_loss', 'val_loss'),
      ('Accuracy', 'accuracy', 'training_accuracy', 'val_accuracy'),
  ]

  for title, train_key, train_label, val_key in panels:
    plt.figure(figsize=(20,4))
    plt.plot(epochs, series[train_key], label=train_label)
    plt.plot(epochs, series[val_key], label=val_key)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.legend()

    
class MyCheckPoint(tf.keras.callbacks.ModelCheckpoint):
    """ModelCheckpoint preconfigured for this notebook.

    Saves weights only, to ``./model_checkpoint/<name>``, and only when the
    monitored ``val_accuracy`` reaches a new maximum.
    """

    def __init__(self, name):
        """Create the checkpoint directory and configure the callback.

        Args:
            name: identifier of the model; used as the last component of the
                checkpoint path ``./model_checkpoint/<name>``.
        """
        # Always make sure the parent directory exists. `exist_ok=True` makes
        # the call idempotent and avoids the exists-then-create race.
        # The previous logic additionally created a `<name>` sub-directory,
        # but only when `model_checkpoint` already existed, so the resulting
        # layout depended on how many checkpoints had been built before.
        # NOTE(review): `<name>` is passed below as the checkpoint file path,
        # so no directory of that name should be required - confirm.
        os.makedirs('model_checkpoint', exist_ok=True)

        super().__init__(
            f'./model_checkpoint/{name}',
            save_weights_only=True,
            monitor='val_accuracy',
            mode='max',
            save_best_only=True)
        

## Constructing our models

In [None]:
# Baseline CNN: three convolutions (kernel size 2) with two max-pooling
# stages, flattened into a 10-way softmax classifier.
model_3 = tf.keras.models.Sequential([
    ly.InputLayer(input_shape=(28, 28, 1)),

    ly.Conv2D(filters=32, kernel_size=2, strides=1, activation='relu'),
    ly.Conv2D(filters=28, kernel_size=2, activation='relu'),
    ly.MaxPool2D(),

    ly.Conv2D(filters=24, kernel_size=2, activation='relu'),
    ly.MaxPool2D(),

    ly.Flatten(),
    ly.Dense(units=10, activation='softmax'),
])

model_3.summary()

In [None]:
# Categorical cross-entropy matches the one-hot labels produced by the
# generators (class_mode="categorical").
model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

# Train for 12 epochs; one epoch walks through every batch of each generator.
training_model_3 = model_3.fit(
    train_data,
    epochs=12,
    steps_per_epoch=len(train_data),
    validation_data=test_data,
    validation_steps=len(test_data),
    callbacks=[MyCheckPoint('model_3')],
)

In [None]:
# Loss and accuracy per epoch (training vs. validation) for model_3.
plot_loss_curves(training_model_3)

### Testing our model with prediction samples

In [None]:
# Random row position in the submission dataframe. A position (not an index
# label) is drawn because the row is fetched with `.iloc` below.
rand_idx = random.randrange(len(subdf))

# Pixels of that row as a 28x28 image
sample = subdf.iloc[rand_idx].values.reshape(28,28)

# Sample prediction: scale like the training data (1/255) and give the model
# the explicit (batch, height, width, channel) shape it was declared with,
# instead of relying on Keras to add the missing channel axis.
model_input = (sample / 255).reshape(1, 28, 28, 1)
preds = model_3.predict(model_input, verbose=0)

# Results
print( f'\n Model Prediction is {np.argmax(preds)} \n' )
plt.imshow(sample, cmap='Greys')
plt.axis('off')
plt.show()

### Creating submission file

In [None]:
from tqdm import tqdm

def create_submission_file(model, name=''):
    """Predict every row of `subdf` and write the result to a CSV file.

    The output file ``submission_<name>.csv`` has an ``ImageId`` index
    (the row index of `subdf` plus one) and a ``Label`` column holding the
    predicted class of each image.

    All rows are predicted in one batched ``model.predict`` call and the
    frame is built once. The earlier per-row predict + ``pd.concat`` loop
    produced the same file but was quadratic in the number of rows and
    never closed its progress bar.

    Args:
        model: a model exposing ``predict(array, verbose=0)`` that returns one
            row of class scores per input image.
        name: suffix used in the output file name.
    """
    # Same scaling as before (pixel / 255), with the channel axis made explicit.
    images = (subdf.values / 255).astype('float32').reshape(-1, 28, 28, 1)

    class_scores = model.predict(images, verbose=0)

    submission = pd.DataFrame(
        {'Label': np.argmax(class_scores, axis=1)},
        index=pd.Index(subdf.index + 1, name='ImageId'),
    )
    submission.to_csv(f'submission_{name}.csv')

#create_submission_file(model_3, '001') - position 1228 - score 0.96392

### Improving model

In [None]:
# Second architecture: Dense layers are interleaved with the convolutions.
# NOTE(review): per the Keras docs a Dense layer on a rank-4 input acts along
# the last axis only (here the channel axis) - confirm this is the intent.
model_4 = tf.keras.models.Sequential([
    ly.InputLayer(input_shape=(28, 28, 1)),

    ly.Dense(units=64, activation='relu'),
    ly.Conv2D(filters=32, kernel_size=2, strides=1, activation='relu'),
    ly.Conv2D(filters=32, kernel_size=2, activation='relu'),
    ly.MaxPool2D(),

    ly.Dense(units=32, activation='relu'),
    ly.Conv2D(filters=32, kernel_size=2, activation='relu'),
    ly.AvgPool2D(),

    ly.Flatten(),
    ly.Dense(units=10, activation='softmax'),
])

model_4.summary()

In [None]:
# Adam with an explicit learning rate of 0.01; loss and metric are
# categorical cross-entropy and accuracy.
model_4.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping

# Train for up to 50 epochs with three callbacks:
# - MyCheckPoint: stores the weights of the best val_accuracy epoch.
# - ReduceLROnPlateau: multiplies the learning rate by 0.33 after 3 epochs
#   without val_loss improvement.
# - EarlyStopping: from epoch 30 on, stops after 2 epochs without val_loss
#   improvement and restores the best weights.
#   The monitor was previously misspelled as 'va_loss'; that metric is never
#   logged, so early stopping could not trigger.
training_model_4 = model_4.fit(train_data,
                              epochs=50, steps_per_epoch=len(train_data),
                              validation_data=test_data,
                              validation_steps=len(test_data),
                              callbacks=[ MyCheckPoint('model_4'),
                                        ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.33, verbose=1),
                                        EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, start_from_epoch=30)])

In [None]:
# Loss and accuracy per epoch (training vs. validation) for model_4.
plot_loss_curves(training_model_4)

In [None]:
# Predict the submission set with model_4 and write submission_002.csv.
create_submission_file(model_4, '002')

This last model, model_4, reached a score of 0.98417 and position 805 on the leaderboard on 30/10/2023.