In [1]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.cm as cm
from scipy import ndimage
from skimage.measure import regionprops
from skimage import io
from skimage.filters import threshold_otsu   # For finding the threshold for grayscale to binary conversion
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
import pandas as pd
import numpy as np
from time import time
import keras
from tqdm import tqdm

ModuleNotFoundError: No module named 'tensorflow'

## Path defined

In [18]:
genuine_image_paths_train = "../Dataset/train/real"
genuine_image_paths_test = "../Dataset/test/real"

forged_image_paths_train = "../Dataset/train/forged"
forged_image_paths_test = "../Dataset/test/forged"

## Data Preparation - necessary steps to prepare the raw data, e.g. cleaning data

#### Pre-processing the images: Converting the format of the PNG images from
#### 1. RGB to Greyscale
#### 2. Greyscale to Binary

### This is to decrease the number of variables to work with, now that the images are composed of elements that are definitely either black or white. This pronounces the edges and curves of the signature by delineating clearly what is inked and what is not, making feature extraction more conclusive


In [19]:
def rgbgrey(img):
    # Converts rgb to grayscale
    greyimg = np.zeros((img.shape[0], img.shape[1]))
    for row in range(len(img)):
        for col in range(len(img[row])):
            greyimg[row][col] = np.average(img[row][col])
    return greyimg

In [20]:
def greybin(img):
    # Converts grayscale to binary
    blur_radius = 0.8
    img = ndimage.gaussian_filter(img, blur_radius)  # to remove small components or noise
#     img = ndimage.binary_erosion(img).astype(img.dtype)
    thres = threshold_otsu(img)
    binimg = img > thres
    binimg = np.logical_not(binimg)
    return binimg

In [21]:
def preproc(path, img=None, display=True):
    if img is None:
        img = mpimg.imread(path)
    if display:
        plt.imshow(img)
        plt.show()
    grey = rgbgrey(img) #rgb to grey
    if display:
        plt.imshow(grey, cmap = matplotlib.cm.Greys_r)
        plt.show()
    binimg = greybin(grey) #grey to binary
    if display:
        plt.imshow(binimg, cmap = matplotlib.cm.Greys_r)
        plt.show()
    r, c = np.where(binimg==1)
    # Now we will make a bounding box with the boundary as the position of pixels on extreme.
    # Thus we will get a cropped image with only the signature part.
    signimg = binimg[r.min(): r.max(), c.min(): c.max()]
    if display:
        plt.imshow(signimg, cmap = matplotlib.cm.Greys_r)
        plt.show()
    return signimg

## Statistical Description


### Feature extraction - Converting the preprocessed images into numbers and statistics that can be inputed into our CNN model. This data invokes the concept of clustering to classify data according to certain patterns. In this case, the patterns are the distinct features of a signature, namely:

#### ratio: The ratio of the width to height of the bounding box around the signature.
#### cent_y: The y-coordinate of the center of mass of the signature.
#### cent_x: The x-coordinate of the center of mass of the signature.
#### eccentricity: A measure of how elongated the signature is.
#### solidity: The ratio of the area of the signature to the area of the convex hull surrounding it.
#### skew_x: A measure of how asymmetric the signature is with respect to the x-axis.
#### skew_y: A measure of how asymmetric the signature is with respect to the y-axis.
#### kurt_x: A measure of how heavy the tails of the signature are with respect to the x-axis.
#### kurt_y: A measure of how heavy the tails of the signature are with respect to the y-axis.
#### The outputs '0' and '1' indicate whether the signature is genuine or forged, with '1' representing a genuine signature and '0' representing a forged signature.

### For our specific case, instead of classifying data into specific categories, we use numeric metrics to quantify how much of that particular feature that signature possesses. 

In [22]:
def Ratio(img):
    a = 0
    for row in range(len(img)):
        for col in range(len(img[0])):
            if img[row][col]==True:
                a = a+1
    total = img.shape[0] * img.shape[1]
    return a/total

In [23]:
def Centroid(img):
    numOfWhites = 0
    a = np.array([0,0])
    for row in range(len(img)):
        for col in range(len(img[0])):
            if img[row][col]==True:
                b = np.array([row,col])
                a = np.add(a,b)
                numOfWhites += 1
    rowcols = np.array([img.shape[0], img.shape[1]])
    centroid = a/numOfWhites
    centroid = centroid/rowcols
    return centroid[0], centroid[1]   # 1st row is the x-component, 2nd row is the y-component

In [24]:
def EccentricitySolidity(img):
    r = regionprops(img.astype("int8"))
    return r[0].eccentricity, r[0].solidity

In [25]:
def SkewKurtosis(img):
    h,w = img.shape
    x = range(w)  # columns value
    y = range(h)  # rows value
    #calculate projections along the x and y axes
    xp = np.sum(img,axis=0)
    yp = np.sum(img,axis=1)
    #centroid
    cx = np.sum(x*xp)/np.sum(xp)
    cy = np.sum(y*yp)/np.sum(yp)
    #standard deviation
    x2 = (x-cx)**2
    y2 = (y-cy)**2
    sx = np.sqrt(np.sum(x2*xp)/np.sum(img))
    sy = np.sqrt(np.sum(y2*yp)/np.sum(img))
    
    #skewness
    x3 = (x-cx)**3
    y3 = (y-cy)**3
    skewx = np.sum(xp*x3)/(np.sum(img) * sx**3)
    skewy = np.sum(yp*y3)/(np.sum(img) * sy**3)

    #Kurtosis
    x4 = (x-cx)**4
    y4 = (y-cy)**4
    # 3 is subtracted to calculate relative to the normal distribution
    kurtx = np.sum(xp*x4)/(np.sum(img) * sx**4) - 3
    kurty = np.sum(yp*y4)/(np.sum(img) * sy**4) - 3

    return (skewx , skewy), (kurtx, kurty)

### getFeatures function extracts the numerical values of the various features, and returns the data in the form of a tuple

### getCSVFeatures uses this tuple and converts it into a 2-dimensional matrix as input format for CNN



In [26]:
def getFeatures(path, img=None, display=False):
    if img is None:
        img = mpimg.imread(path)
    img = preproc(path, display=display)
    ratio = Ratio(img)
    centroid = Centroid(img)
    eccentricity, solidity = EccentricitySolidity(img)
    skewness, kurtosis = SkewKurtosis(img)
    retVal = (ratio, centroid, eccentricity, solidity, skewness, kurtosis)
    return retVal

In [27]:
def getCSVFeatures(path, img=None, display=False):
    if img is None:
        img = mpimg.imread(path)
    temp = getFeatures(path, display=display)
    features = (temp[0], temp[1][0], temp[1][1], temp[2], temp[3], temp[4][0], temp[4][1], temp[5][0], temp[5][1])  
    '''
    2D MATRIX, DATA FORMAT FOR CNN INPUT. Each row vector displays each feature variable. 
    For example:
    
            col 1:         col 2:
    row 0: [ratio               ]
    row 1: [cent_y        cent_x]
    row 2: [eccentricity        ]
    row 3: [solidity            ]
    row 4: [skew_y        skew_x]
    row 5: [kurt_x        kurt_y]
    ''' 
    return features

## Saving the features

#### Saves the numerical variables of the signatures into an Excel CSV file

In [28]:
def makeCSV(setType):
    # if the specified folder does not exist, create it to store the incoming CSV
    if not(os.path.exists('../Dataset/Features')):          
        os.mkdir('../Dataset/Features')   
        print('New folder "Features" created')
    if not(os.path.exists('../Dataset/Features/Training')):
        os.mkdir('../Dataset/Features/Training')
        print('New folder "Features/Training" created')
    if not(os.path.exists('../Dataset/Features/Testing')):
        os.mkdir('../Dataset/Features/Testing')
        print('New folder "Features/Testing" created')
   
    # Check what type
    if(setType != 'train' and setType != 'test'):
        print('Invalid type!')
        return
    
    if(setType == 'train'):
        gpath = genuine_image_paths_train
        fpath = forged_image_paths_train
        csvpath = '../Dataset/Features/Training/training_'
        setRange = range(1, 70)
    else:
        gpath = genuine_image_paths_test
        fpath = forged_image_paths_test
        csvpath = '../Dataset/Features/Testing/testing_'
        setRange = range(49, 70)
    

    # Extracting features based on type defined: 'train' ranges from 1-69, 'test' ranges from 49-69
    for person in tqdm(setRange, desc='Extracting Features...'):
        per = ('00'+str(person))[-3:]
               
        # In case folder is not iterative (i.e, 005 does not exist)
        if not(os.path.exists(os.path.join(gpath, per))):
            continue
    
        # In case folder is not iterative (i.e, 005 does not exist)
        if not(os.path.exists(os.path.join(fpath, per+'_forg'))):
            continue
    
        # every signature image will generate a unique CSV feature row specific to it, hence the concatenation
        with open(csvpath+per+'.csv', 'w') as handle:
            handle.write('ratio,cent_y,cent_x,eccentricity,solidity,skew_x,skew_y,kurt_x,kurt_y,output\n')

            # Genuine set
            i = 1
            fileNum = ('0' + str(i))[-2:]  # this takes care of single vs double digits
            filePath = os.path.join(gpath, per,  per + '_' + fileNum + '.png')
            while(os.path.exists(filePath)):
                source = filePath   # source image taken iteratively based on their own naming convention  
                features = getCSVFeatures(path=source)
                handle.write(','.join(map(str, features))+',1\n')  # 1 is set for genuine signatures 
                i += 1
                fileNum = ('0' + str(i))[-2:]  # this takes care of single vs double digits
                filePath = os.path.join(gpath, per,  per + '_' + fileNum + '.png')
                
            # Forged set
            i = 1
            fileNum = ('0' + str(i))[-2:]  # this takes care of single vs double digits
            filePath = os.path.join(fpath, per+'_forg',  per + '_forg_' + fileNum + '.png')
            while(os.path.exists(filePath)):
                source = filePath   # source image taken iteratively based on their own naming convention  
                features = getCSVFeatures(path=source)
                handle.write(','.join(map(str, features))+',0\n')  # 0 is set for genuine signatures 
                i += 1
                fileNum = ('0' + str(i))[-2:]  # this takes care of single vs double digits
                filePath = os.path.join(fpath, per+'_forg',  per + '_forg_' + fileNum + '.png')   

In [29]:
makeCSV('train')
makeCSV('test')

NameError: name 'tqdm' is not defined

# CNN Model via TensorFlow


#### ALL HELPER FUNCTIONS:

##### testing() function to make a new seperate CSV to store the features of the specific test image. Might be redundant if we are only using an existing test set, but this is to cover the scenario of custom external images.

In [46]:
def testing(path):    # path is the image specified
    feature = getCSVFeatures(path)   # extract features from a specific image into a tuple, suitable for storing into CSV 
    if not(os.path.exists('../Dataset/TestFeatures')):
        os.mkdir('../Dataset/TestFeatures')
    with open('../Dataset/TestFeatures/testcsv.csv', 'w') as handle:
        # create a new CSV to store the numerical features of the signature image, similar to makeCSV
        handle.write('ratio,cent_y,cent_x,eccentricity,solidity,skew_x,skew_y,kurt_x,kurt_y\n')
        handle.write(','.join(map(str, feature))+'\n')

def readCSV(train_path, test_path, type2=False):
    # Reading train data
    df = pd.read_csv(train_path, usecols=range(n_input))
    train_input = np.array(df.values)
    train_input = train_input.astype(np.float32, copy=False)  # Converting input to float_32
    df = pd.read_csv(train_path, usecols=(n_input,))
    temp = [elem[0] for elem in df.values]
    correct = np.array(temp)
    corr_train = keras.utils.to_categorical(correct,2)      # Converting to one hot
    
    # Reading test data
    df = pd.read_csv(test_path, usecols=range(n_input))
    test_input = np.array(df.values)
    test_input = test_input.astype(np.float32, copy=False)
    
    if not(type2):
        df = pd.read_csv(test_path, usecols=(n_input,))
        temp = [elem[0] for elem in df.values]
        correct = np.array(temp)
        corr_test = keras.utils.to_categorical(correct,2)      # Converting to one hot
        return train_input, corr_train, test_input, corr_test
    else:
        return train_input, corr_train, test_input

# Create model
def multilayer_perceptron(x):
    layer_1 = tf.tanh((tf.matmul(x, weights['h1']) + biases['b1']))
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
    out_layer = tf.tanh(tf.matmul(layer_1, weights['out']) + biases['out'])
    return out_layer

def evaluate(train_path, test_path, type2=False):   
    if not(type2):
        train_input, corr_train, test_input, corr_test = readCSV(train_path, test_path)
    else:
        train_input, corr_train, test_input = readCSV(train_path, test_path, type2)
    ans = 'Random'
    with tf.Session() as sess:
        sess.run(init)
        # Training cycle
        for epoch in range(training_epochs):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, cost = sess.run([train_op, loss_op], feed_dict={X: train_input, Y: corr_train})
            if cost < 0.0001:
                break
#              # Display logs per epoch step
#             if epoch % 999 == 0:
#                 print("Epoch:", '%04d' % (epoch+1), "cost={:.9f}".format(cost))
#                 print("Optimization Finished!")
#                 print()
        
#         # Finding accuracies
#         accuracy1 =  accuracy.eval({X: train_input, Y: corr_train})
#         print("Accuracy for train:", accuracy1)
        if type2 is False:
            accuracy2 =  accuracy.eval({X: test_input, Y: corr_test})
            print("Accuracy for test:", accuracy2)
            return accuracy1, accuracy2
        else:
            prediction = pred.eval({X: test_input})
            if prediction[0][1]>prediction[0][0]:
                print('Genuine Image: {0:.1f} %'.format(prediction[0][1]*100))
                return True
            else:
                print('Forged Image: {0:.1f} %'.format(prediction[0][0]*100))
                return False


def trainAndTest(rate=0.001, epochs=1700, neurons=7, display=False):    
    start = time()

    # Parameters
    global training_rate, training_epochs, n_hidden_1
    learning_rate = rate
    training_epochs = epochs

    # Network Parameters
    n_hidden_1 = neurons # 1st layer number of neurons
    n_hidden_2 = 7 # 2nd layer number of neurons
    n_hidden_3 = 30 # 3rd layer

    train_avg, test_avg = 0, 0
    n = 10
    for i in range(1,n+1):
        if display:
            print("Running for Person id",i)
        temp = ('0'+str(i))[-2:]
        train_score, test_score = evaluate(train_path.replace('01',temp), test_path.replace('01',temp))
        train_avg += train_score
        test_avg += test_score
    if display:
#         print("Number of neurons in Hidden layer-", n_hidden_1)
        print("Training average-", train_avg/n)
        print("Testing average-", test_avg/n)
        print("Time taken-", time()-start)
    return train_avg/n, test_avg/n, (time()-start)/n

In [51]:
n_input = 9
train_person_id = input("Enter person's id : ")  # eg: 049
# key in path of image that you want to check (eg: ../Dataset/test/forged/049_forg/049_forg_01.png):
test_image_path = input("Enter path of signature image : ")  
train_path = '../Dataset/Features/Training/training_'+train_person_id+'.csv' # specify path of train CSV for that requested person
testing(test_image_path)   # make a new seperate CSV to store the features of the specific test image 
test_path = '../Dataset/TestFeatures/testcsv.csv'  # specify the path of newly created test CSV

tf.compat.v1.reset_default_graph()
# Parameters
learning_rate = 0.001
training_epochs = 1000
display_step = 1

# Network Parameters
n_hidden_1 = 7 # 1st layer number of neurons
n_hidden_2 = 10 # 2nd layer number of neurons
n_hidden_3 = 30 # 3rd layer number of neurons
n_classes = 2 # no. of classes (genuine or forged)

# tf Graph input
tf.compat.v1.disable_eager_execution()
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_classes])

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1], seed=1)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes], seed=2))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1], seed=3)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_classes], seed=4))
}

# Construct model
logits = multilayer_perceptron(X)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.math.squared_difference(logits, Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op, var_list=None)

# For accuracies
pred = tf.nn.softmax(logits)  # Apply softmax to logits
correct_prediction = tf.equal(tf.argmax(pred,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

evaluate(train_path, test_path, type2=True)

Enter person's id : 069
Enter path of signature image : ../Dataset/test/forged/069_forg/069_forg_01.png
Forged Image: 73.8 %


False