## Import

In [4]:
import os
import pickle as pkl
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing import image_dataset_from_directory
from keras import layers
from matplotlib import pyplot as plt
import seaborn as sns
import requests
import zipfile
import shutil
import glob

## Path of Data

In [5]:
# Data folder
DATA_FOLDER = "./data"
hindi_handwritten_dataset_zip_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00389/DevanagariHandwrittenCharacterDataset.zip"
zip_file_name = hindi_handwritten_dataset_zip_url.rsplit('/', 1)[1]
DEVANAGARI_ZIP_PATH = os.path.join(DATA_FOLDER, zip_file_name)
DEVANAGARI_DATA_FOLDER = os.path.join(DATA_FOLDER, zip_file_name.rsplit(".")[0])

# Ensure the data folder exists
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

# Download the dataset if it's not already downloaded
if not os.path.exists(DEVANAGARI_ZIP_PATH):
    print("Downloading the dataset...")
    req = requests.get(hindi_handwritten_dataset_zip_url, allow_redirects=True)
    with open(DEVANAGARI_ZIP_PATH, 'wb') as output_file:
        output_file.write(req.content)
    print("Downloaded zip file.")
else:
    print("Zip file already present.")

# Extract the dataset if it's not already extracted
if not os.path.exists(DEVANAGARI_DATA_FOLDER):
    with zipfile.ZipFile(DEVANAGARI_ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(DATA_FOLDER)
    print("Extracted zip file.")
else:
    print("Files already present on disk.")

Zip file already present.
Files already present on disk.


## Prepare Data

In [6]:
# Removing unwanted classes
labels_to_keep = [
    "digit_0", "digit_1", "digit_2", "digit_3", "digit_4", "digit_5", "digit_6", "digit_7", "digit_8", "digit_9"
]

TRAIN_FOLDER_NAME = "Train"
TEST_FOLDER_NAME = "Test"

folders = glob.glob(os.path.join(DEVANAGARI_DATA_FOLDER, TRAIN_FOLDER_NAME, "*"))
for f in folders:
    if f.rsplit("/")[-1] not in labels_to_keep:
        shutil.rmtree(f)

folders = glob.glob(os.path.join(DEVANAGARI_DATA_FOLDER, TEST_FOLDER_NAME, "*"))
for f in folders:
    if f.rsplit("/")[-1] not in labels_to_keep:
        shutil.rmtree(f)

# Dataset and model parameters
RANDOM_SEED = 42
IMG_HEIGHT = 32
IMG_WIDTH = 32
VALIDATION_SPLIT = 0.1
BATCH_SIZE = 32
KERNEL_SIZE = (3, 3)
MAX_POOLING_SIZE = (2, 2)
DROPOUT = 0.5

num_classes = len(labels_to_keep)
classes = labels_to_keep
classes_to_output_class_names = {
    "digit_0": "0", "digit_1": "1", "digit_2": "2", "digit_3": "3", "digit_4": "4", "digit_5": "5", "digit_6": "6",
    "digit_7": "7", "digit_8": "8", "digit_9": "9"
}

# Preparing datasets
train_dataset = image_dataset_from_directory(
    os.path.join(DEVANAGARI_DATA_FOLDER, TRAIN_FOLDER_NAME),
    labels="inferred",
    label_mode="int",
    class_names=classes,
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    shuffle=True,
    seed=RANDOM_SEED,
    validation_split=VALIDATION_SPLIT,
    subset="training",
)

val_dataset = image_dataset_from_directory(
    os.path.join(DEVANAGARI_DATA_FOLDER, TRAIN_FOLDER_NAME),
    labels="inferred",
    label_mode="int",
    class_names=classes,
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    shuffle=True,
    seed=RANDOM_SEED,
    validation_split=VALIDATION_SPLIT,
    subset="validation",
)

test_dataset = image_dataset_from_directory(
    os.path.join(DEVANAGARI_DATA_FOLDER, TEST_FOLDER_NAME),
    labels="inferred",
    label_mode="int",
    class_names=classes,
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    shuffle=True,
    seed=RANDOM_SEED,
)

Found 17000 files belonging to 10 classes.
Using 15300 files for training.
Found 17000 files belonging to 10 classes.
Using 1700 files for validation.
Found 3000 files belonging to 10 classes.


## Data Augmentation

In [7]:
# Data augmentation and normalization
normalization_layer = layers.Rescaling(1. / 255)
data_augmentation_layers = keras.Sequential(
    [
        layers.RandomZoom(0.05),
        layers.RandomTranslation(0.05, 0.05),
    ]
)

# Caching and prefetching
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_dataset.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

## Build `.tf` Model

In [None]:
# Creating the model
model = keras.Sequential(
    [
        data_augmentation_layers,
        normalization_layer,
        layers.Conv2D(32, kernel_size=KERNEL_SIZE, activation="relu"),
        layers.MaxPooling2D(pool_size=MAX_POOLING_SIZE),
        layers.Conv2D(64, kernel_size=KERNEL_SIZE, activation="relu"),
        layers.MaxPooling2D(pool_size=MAX_POOLING_SIZE),
        layers.Dropout(DROPOUT),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

# Compiling and training the model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
epochs = 15
history = model.fit(train_dataset, validation_data=val_dataset, epochs=epochs)

Epoch 1/15


  output, from_logits = _get_logits(


[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 52ms/step - accuracy: 0.7735 - loss: 0.6808 - val_accuracy: 0.9847 - val_loss: 0.0482
Epoch 2/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 51ms/step - accuracy: 0.9625 - loss: 0.1079 - val_accuracy: 0.9876 - val_loss: 0.0336
Epoch 3/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 59ms/step - accuracy: 0.9764 - loss: 0.0760 - val_accuracy: 0.9900 - val_loss: 0.0230
Epoch 4/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 47ms/step - accuracy: 0.9839 - loss: 0.0556 - val_accuracy: 0.9918 - val_loss: 0.0151
Epoch 5/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 50ms/step - accuracy: 0.9863 - loss: 0.0448 - val_accuracy: 0.9924 - val_loss: 0.0189
Epoch 6/15
[1m479/479[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 50ms/step - accuracy: 0.9889 - loss: 0.0391 - val_accuracy: 0.9935 - val_loss: 0.0175
Epoch 7/15
[1m479/479[0m 

## Plot History

In [None]:
# Plotting accuracy and loss
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(1, epochs + 1)

fig = plt.figure(figsize=(15, 5))
fig.add_subplot(1, 2, 1)
sns.lineplot(x=epochs_range, y=acc, label='Training Accuracy')
sns.lineplot(x=epochs_range, y=val_acc, label='Validation Accuracy')

fig.add_subplot(1, 2, 2)
sns.lineplot(x=epochs_range, y=loss, label='Training Loss')
sns.lineplot(x=epochs_range, y=val_loss, label='Validation Loss')
plt.show()

# Evaluating the model
result = model.evaluate(test_dataset)
print(result)

## Save Model

In [None]:
# Saving the model
MODEL_FOLDER = "./models"
HINDI_MNIST_FOLDER = "hindi_mnist"
MODEL_SAVE_FOLDER = os.path.join(MODEL_FOLDER, HINDI_MNIST_FOLDER)
TF_MODEL_SAVE_FOLDER = os.path.join(MODEL_FOLDER, HINDI_MNIST_FOLDER, "tf_serving")
MODEL_SAVE_PATH = os.path.join(MODEL_FOLDER, HINDI_MNIST_FOLDER, "model.h5")

model.save(MODEL_SAVE_PATH, overwrite=True, include_optimizer=True)
model.save(TF_MODEL_SAVE_FOLDER, overwrite=True, save_format='tf')

# Saving classes to a pickle file
CLASSES_PKL_PATH = os.path.join(MODEL_SAVE_FOLDER, "classes.pickle")
with open(CLASSES_PKL_PATH, 'wb') as f:
    pkl.dump(classes, f)
    pkl.dump(classes_to_output_class_names, f)

# Loading the model and evaluating again
model = keras.models.load_model(MODEL_SAVE_PATH)
with open(CLASSES_PKL_PATH, 'rb') as f:
    classes = pkl.load(f)
    labels_to_class_names = pkl.load(f)

result = model.evaluate(test_dataset)
print(result)