<h1>Manual Feature Extraction + Processing</h1> 

<h2>Libraries</h2>

In [2]:
import cv2
import numpy as np
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from bayes_opt import BayesianOptimization
import os
import pandas as pd
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import tensorflow as tf

<h2> Processing </h2>

In [3]:
def preprocess_image(img):
    # Load image
    #img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Noise removal with median filter
    img = cv2.medianBlur(img, 3)
    
    # Binarize image using Otsu's threshold
    _, binary_img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    #binary_image_3_channels = cv2.cvtColor(binary_img, cv2.COLOR_GRAY2BGR)  
    
    return binary_img

def dilate_image(image):
    #
    kernel = np.ones((5,5), np.uint8)

    #
    dilated_img = cv2.dilate(image, kernel, iterations=1)

    return dilated_img

<h2>Pen Pressure<h2>

In [4]:
#Feature 1: Pen Pressure: Calculate the Mean and STD (2 Features)

def pen_pressure_features(image):
    #image = cv2.imread(image, cv2.IMREAD_GRAYSCALE)

    #Mean
    mean_intensity = np.mean(image) 

    #STD
    std_intensity = np.std(image) 

    return mean_intensity, std_intensity


<h2>Left and Right Margins<h2>

In [5]:
#Feature 2: Left and Right Margins for the first 200 rows (400 features)

def extract_margins(image):
    #Left Margin Code
    def left_margin(image):
        left_margins = []
        for i in range(200):  # Considering the first 200 rows
            row = image[i]
            first_black_pixel_index = np.argmax(row == 0) #extract the first black feature from the left
            left_margins.append((first_black_pixel_index,i))
        return left_margins

    #Right Margin Code
    def right_margin(image):
        right_margins = []
        width = image.shape[1]
        for i in range(200):  # Considering the first 200 rows
            row = image[i]
            last_black_pixel_index = width - np.argmax(np.flipud(row) == 0) #extract the last black pixel
            right_margin = width - last_black_pixel_index 
            right_margins.append((right_margin, i))
        return right_margins


    # Dilate the image
    dilated_img = dilate_image(image)

    # Extract margin features
    left_margin_features = left_margin(dilated_img)
    right_margin_features = right_margin(dilated_img)

    return left_margin_features, right_margin_features



<h2>Handwriting Irregularity</h2>

In [6]:
def test_handwriting_irregularity(image):
    def horizontal_projection_profile(image):
        # Considering only the first 700 rows
        limited_image = image[:700]

        # Horizontal projection: Summing up values along each row
        projection_profile = np.sum(limited_image, axis=1)
        
        return projection_profile


    def find_peaks_and_valleys(projection_profile):

        peaks, _ = find_peaks(projection_profile)

        valleys, _ = find_peaks(-projection_profile)
        
        return peaks, valleys
    
    def standardize_projection_profile(profile, desired_length=700):
        if len(profile) > desired_length:
            return profile[:desired_length]
        elif len(profile) < desired_length:
            return np.pad(profile, (0, desired_length - len(profile)), 'constant')
        return profile
        
    # Preprocess and dilate the image
    dilated_img = dilate_image(image)

    # Calculate horizontal projection profile
    projection_profile = horizontal_projection_profile(dilated_img)

    # Find peaks and valleys
    #peaks, valleys = find_peaks_and_valleys(projection_profile)

    standardized_profile = standardize_projection_profile(projection_profile)
    return standardized_profile

   

<h2>Feature Extraction<h2>

In [7]:
def extract_features(image):
    pp_features = np.array(pen_pressure_features(image)).flatten()  # Flatten to ensure 1D
    hi_features = test_handwriting_irregularity(image).flatten()    # Flatten
    left_margin_features = np.array(extract_margins(image)[0]).flatten()  # Flatten
    right_margin_features = np.array(extract_margins(image)[1]).flatten() # Flatten


    # Concatenate all flattened 1D arrays
    combined_features = np.concatenate([pp_features, hi_features, left_margin_features, right_margin_features])

    return combined_features

<h1>Image Loading<h1>

In [8]:
updated_excel_file = './updated_excel_file2.xlsx'
data = pd.read_excel(updated_excel_file, usecols=range(5))  # Load only the first 5 columns
width = height = 500
batch_size = 32


def load_images_and_labels(file_names_column, age_column, start_idx, end_idx):
    features_batch = []
    labels_batch = []
    
    for index in range(start_idx, end_idx):
        updated_excel_file = './updated_excel_file2.xlsx'
        data = pd.read_excel(updated_excel_file)  # Load the first 8 columns
        row = data.iloc[index]
        file_name = row[file_names_column]
        age = row[age_column]

        if pd.notnull(age) and pd.notnull(file_name):
            image = cv2.imread(os.path.join('./images', file_name))
            image = preprocess_image(image)
            # image = cv2.resize(image, (width, height))  # Uncomment if resizing is needed
            all_features = extract_features(image)  
            features_batch.append(all_features)
            labels_batch.append(age)

    labels_batch = to_categorical(labels_batch, num_classes=2)  # Confirm this is the desired encoding
    features_batch = np.array(features_batch, dtype='float32')  # Confirm appropriate scaling
    
    return features_batch, labels_batch

# Load all images and labels
all_features = []
all_labels = []

for start_idx in range(0, len(data), batch_size):
    end_idx = min(start_idx + batch_size, len(data))
    features_batch, labels_batch = load_images_and_labels('File Name', 'Gender', start_idx, end_idx)
    
    all_features.append(features_batch)
    all_labels.append(labels_batch)

# Concatenate the batches to create the final arrays
all_features = np.concatenate(all_features)
all_labels = np.concatenate(all_labels)

# Example usage
print(all_features.shape)  # Print the shape of the array containing all features
print(all_labels.shape)  # Print the shape of the array containing all labels

(3840, 1502)
(3840, 2)


In [12]:
print(all_features[0])

[2.4090695e+02 5.8267605e+01 4.7073000e+05 ... 1.9800000e+02 4.2200000e+02
 1.9900000e+02]


<h1>Classification<h1>

<h2>NN Classifer<h2>

In [9]:
def create_ann(input_size, hidden_layer_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_layer_size, activation='relu', input_shape=(input_size,)),
        tf.keras.layers.Dense(2, activation='sigmoid')
    ])
    return model


def compile_model(model):

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9, nesterov=True)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])


def train_ann(model, X_train, y_train, X_val, y_val, epochs, patience):
    # Early stopping callback based on validation loss
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, min_delta=0.0863223)

    # Train the model
    history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val), callbacks=[early_stopping])
    return history


# Define your model
input_size = 1502 
hidden_layer_size = 128  

model = create_ann(input_size, hidden_layer_size)

compile_model(model)

X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

# Train your model
epochs = 100  # Adjust as per your requirements
patience = 100   # Number of epochs with no improvement on validation loss to stop training
history = train_ann(model, X_train, y_train, X_test, y_test, epochs, patience)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<h2>SVM Classifer<h2>

In [10]:
def train_svm(X_train, y_train, X_val, y_val):
    def svm_eval(C, gamma):
        # Define SVM with hyperparameters
        svm = SVC(kernel='rbf', C=C, gamma=gamma)

        # Train on training data
        svm.fit(X_train, y_train)

        # Predict on validation data
        y_pred = svm.predict(X_val)

        # Compute accuracy
        accuracy = accuracy_score(y_val, y_pred)
        return accuracy

    def optimize_svm(X_train, y_train, X_val, y_val):
        # Define the bounds of the hyperparameters
        pbounds = {'C': (0.1, 100), 'gamma': (0.0001, 0.1)}

        # Instantiate Bayesian Optimization
        optimizer = BayesianOptimization(f=svm_eval, pbounds=pbounds, random_state=1)

        # Maximize the accuracy
        optimizer.maximize(init_points=10, n_iter=20)

        return optimizer.max['params']
    
    # Hyperparameter tuning
    best_params = optimize_svm(X_train, y_train, X_val, y_val)
    best_C = best_params['C']
    best_gamma = best_params['gamma']

    # Train SVM with the best parameters
    svm = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
    svm.fit(X_train, y_train)

    # Validate
    y_pred = svm.predict(X_val)
    return y_pred

<h2>SVM Testing</h2>

In [11]:
X_train, X_test, y_train, y_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=42)

X_train_flattened = np.array([image.flatten() for image in X_train])
X_test_flattened = np.array([image.flatten() for image in X_test])


y_pred = train_svm(X_train_flattened, y_train, X_test_flattened, y_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"Accuracy: {accuracy}, Recall: {recall}, Precision: {precision}")

NameError: name 'all_images' is not defined