Import packages

In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import layers,models
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

Import dataset

In [2]:
import kagglehub

# Kaggle handle of the dataset. kagglehub downloads its latest version and
# returns the local directory that holds the downloaded files.
dataset_handle = "sujaymann/handwritten-english-characters-and-digits"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/sujaymann/handwritten-english-characters-and-digits?dataset_version_number=6...


100%|██████████| 205M/205M [00:01<00:00, 152MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/sujaymann/handwritten-english-characters-and-digits/versions/6


In [3]:
# `os` is already imported in the first cell, so it is not imported again here.
# The entries are sorted because os.listdir returns them in arbitrary order,
# which would make this cell's output differ from one machine to another.
print("Files in dataset folder:")
print(sorted(os.listdir(path)))

Files in dataset folder:
['handwritten-english-characters-and-digits', 'augmented_images', 'image_labels.csv']


In [4]:
# Folder layout inside the downloaded dataset (on Kaggle notebooks the same
# tree lives under /kaggle/input/handwritten-english-characters-and-digits).
# Each path component is passed separately so os.path.join supplies the
# platform's separator instead of relying on hard-coded "/" characters.
train_dir = os.path.join(path, "handwritten-english-characters-and-digits", "combined_folder", "train")
test_dir = os.path.join(path, "handwritten-english-characters-and-digits", "combined_folder", "test")
augmented_data = os.path.join(path, "augmented_images", "augmented_images1")

Data preprocessing

In [None]:
# NOTE(review): despite its name, this dataset is read from `train_dir`.
# The directory holds one sub-folder per class.
validate_ds = tf.keras.utils.image_dataset_from_directory(
    directory=train_dir,
    label_mode='categorical',  # one-hot labels, shape [batch, num_classes]
    batch_size=32,             # samples per batch
    image_size=(64, 64),       # images are resized to 64x64
)

Found 2728 files belonging to 62 classes.


In [None]:
# Load the augmented images (one sub-folder per class); they are converted
# into the training matrix further down.
# image_dataset_from_directory shuffles by default. Pinning `seed` makes that
# shuffle repeatable across runs instead of depending on a fresh random seed,
# so the seeded StratifiedKFold below operates on the same sample order.
augmented_ds = tf.keras.utils.image_dataset_from_directory(
    augmented_data,
    image_size=(64, 64),       # images are resized to 64x64
    batch_size=32,             # samples per batch
    label_mode='categorical',  # one-hot labels, shape [batch, num_classes]
    seed=42,
)

Found 13640 files belonging to 62 classes.


In [None]:
# Load the held-out test images (one sub-folder per class).
# Shuffling is switched off for the evaluation set: with the default
# shuffle=True the order changes on every pass over the dataset, so labels
# and predictions collected in separate passes would not line up.
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(64, 64),       # images are resized to 64x64
    batch_size=32,             # samples per batch
    label_mode='categorical',  # one-hot labels, shape [batch, num_classes]
    shuffle=False,
)

Found 682 files belonging to 62 classes.


In [8]:

def dataset_to_numpy(ds):
    """Convert a batched (images, one-hot labels) dataset into NumPy arrays.

    Args:
        ds: Iterable yielding ``(images, labels)`` batches. ``images`` has the
            batch on its first axis; ``labels`` holds one one-hot row per
            sample (e.g. a dataset built with ``label_mode='categorical'``).

    Returns:
        Tuple ``(X, y)``. ``X`` has one flattened image per row and ``y``
        holds the integer class index of each row. Both are empty 1-D arrays
        when ``ds`` yields no batches.
    """
    features = []
    targets = []
    # Convert whole batches at once: unbatching and converting sample by
    # sample pays the per-element overhead once for every single image.
    for images, labels in tqdm(ds):
        images = np.asarray(images)
        features.append(images.reshape(images.shape[0], -1))     # flatten each image
        targets.append(np.argmax(np.asarray(labels), axis=-1))   # one-hot -> class index
    if not features:
        # Same result the per-sample implementation gave for an empty dataset.
        return np.array([]), np.array([])
    return np.concatenate(features), np.concatenate(targets)

# Materialise the augmented images as the training arrays and the test
# folder as the test arrays.
X_train, y_train = dataset_to_numpy(augmented_ds)
X_test, y_test = dataset_to_numpy(test_ds)

# Show the shape of the feature matrix and label vector of each split.
for split_label, features, targets in (
    ("Train set:", X_train, y_train),
    ("Test set:", X_test, y_test),
):
    print(split_label, features.shape, targets.shape)


13640it [01:34, 144.97it/s]
682it [00:03, 175.34it/s]

Train set: (13640, 12288) (13640,)
Test set: (682, 12288) (682,)





Training logistic regression model

In [9]:
# 5-fold stratified cross-validation; shuffling with a fixed random_state
# keeps the fold assignment repeatable for a given sample order.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Track the fold whose model scores highest on its own validation split,
# together with that split's true and predicted labels.
best_accuracy = 0
best_model = None
best_y_true = None
best_y_pred = None

# enumerate(..., start=1) numbers the folds 1..5 without a manual counter.
for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):
    # Split data into training and validation sets for this fold
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    # `multi_class` is deliberately not passed: the argument is deprecated
    # since scikit-learn 1.5, and with its default setting the lbfgs solver
    # already fits a multinomial model when there are more than two classes.
    model = LogisticRegression(max_iter=1000, solver='lbfgs')
    model.fit(X_tr, y_tr)

    # Score this fold's model on the samples it was not trained on
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    print(f" Fold {fold} accuracy: {acc:.4f}")

    # Keep the model (and its validation labels/predictions) of the best fold
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_y_true = y_val
        best_y_pred = y_pred

 Fold 1 accuracy: 0.0411
 Fold 2 accuracy: 0.0433
 Fold 3 accuracy: 0.0440
 Fold 4 accuracy: 0.0455
 Fold 5 accuracy: 0.0502


Classification report for logistic regression model

In [None]:
# The labels being reported come from folds of the augmented dataset, so the
# class names are taken from that same dataset (derived from its folder names).
class_names = augmented_ds.class_names
# Print a clean header so the report stands out in logs/notebooks
print("\n Classification Report for Best Fold (Highest Accuracy):")
# Passing `labels` explicitly keeps the report rows lined up with
# `class_names` even if some class has no sample in the reported fold;
# without it, a missing class makes `target_names` mismatch the labels found.
print(
    classification_report(
        best_y_true,
        best_y_pred,
        labels=np.arange(len(class_names)),
        target_names=class_names,
    )
)
# Show the best validation accuracy found during cross-validation.
print(f"\n Best validation accuracy across folds: {best_accuracy:.4f}")


 Classification Report for Best Fold (Highest Accuracy):
              precision    recall  f1-score   support

           0       0.02      0.02      0.02        44
           1       0.14      0.18      0.16        44
           2       0.03      0.02      0.03        44
           3       0.03      0.02      0.02        44
           4       0.07      0.05      0.06        44
           5       0.05      0.05      0.05        44
           6       0.02      0.02      0.02        44
           7       0.06      0.05      0.05        44
           8       0.05      0.07      0.06        44
           9       0.00      0.00      0.00        44
      A_caps       0.04      0.05      0.04        44
      B_caps       0.06      0.07      0.07        44
      C_caps       0.03      0.02      0.02        44
      D_caps       0.00      0.00      0.00        44
      E_caps       0.10      0.09      0.10        44
      F_caps       0.04      0.05      0.04        44
      G_caps       0.10