In [1]:
import numpy as np
import pandas as pd
# from glob import glob
from typing import List
from dataclasses import dataclass
import pickle

import statistics
import math

In [2]:
@dataclass
class Sign:
  """One signature sample: owner ID, label, raw samples and extracted features."""
  # ID of the user the signature is filed under; SignaturesToUsers groups by this value.
  userID: int
  # presumably True for a genuine signature and False for a forgery - TODO confirm
  genuine: bool
  # Raw samples, one row per sample. FeatureExtraction reads the columns
  # 'x', 'y', 'pressure' and 'time'; stroke_detector adds 'velocity' and
  # 'acceleration' columns to this frame in place.
  data: pd.DataFrame
  # Filled in by FeatureExtraction.
  # NOTE(review): FeatureExtraction returns a plain list, not an np.ndarray.
  feature_vector: np.ndarray
  
@dataclass
class User:
  """All signatures that share one userID (built by SignaturesToUsers)."""
  # ID shared by every signature in `signatures`.
  userID: int
  # Signatures of this user; may be empty for an ID that has no samples.
  signatures: list[Sign]

In [3]:
# Deserialization of raw data
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load these files when they come from a trusted source.
# NOTE(review): presumably the pickles contain Sign objects, so the Sign
# dataclass must already be defined when this cell runs - confirm.
with open('rawDevSigns.pkl', 'rb') as in_file:
    devSigns = pickle.load(in_file)

with open('rawEvalSigns.pkl', 'rb') as in_file:
    evalSigns = pickle.load(in_file)

In [4]:
# Sanity check: number of loaded signatures in each set
# (the trailing comments record the values seen in the saved output).
print(len(devSigns))   # 19004
print(len(evalSigns))  # 15696

19004
15696


# Features:

1. **Duration:** The time taken to perform the signature, in seconds.
1. **Width/Length:** This is the horizontal distance measured between the two most extreme points in the **X** direction.
1. **Height:** This is the distance between the lowest point in a word and the highest point in a word. (**Y** coordinate)
1. **Aspect ratio:** This is the ratio of the writing length to the writing height.
1. **Pen-down ratio:** This is the ratio of the pen-down time to the total writing time. Calculation is performed by taking the ratio of the number of points with non-zero pressure to the total number of points.
1. **Number of gaps/pen-ups:** The number of times the pen is lifted while signing after the first contact with the tablet and excluding the final pen-lift.

1. **Component physical spacing:** Calculation involves taking the Euclidean distance between the last point sampled in a component and the first point sampled in the following component (if any). This value is calculated for each pen-up instance and averaged to obtain the final feature value.
1. **Component time spacing:** This is the average duration of the pen-up instances in a signature.

1. **Pressure**
> - **mean tip pressure**
> - **std of tip pressure**
> - **min tip pressure**
> - **max tip pressure**
    
10. **Velocity**
> - **Horizontal velocity:** This is the average velocity over the **X** direction. It measures how fast the signature moves horizontally. This feature is calculated as the ratio of the signature length to the duration of the signature.
> - **Max velocity:** Calculation is performed in terms of component velocities vx and vy, calculated as the first derivative of the x and y streams respectively.
> - **Average velocity**
> - **Std of velocity**
11. **Acceleration**
> - **Average absolute acceleration:** The average absolute acceleration captures the mean rate of velocity change in both positive and negative directions.
> - **Max acceleration**
> - **Mean acceleration**
> - **Std of acceleration**
    
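12. **Number of strokes:** This feature is indicative of how many segments or states the handwriting goes through during the signature’s production. Calculated as the number of segments obtained by splitting the pen trajectory wherever the dot product of two consecutive velocity vectors is non-positive, i.e. wherever the direction of movement turns by 90° or more (or the pen stops).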
1. **Curvature:** This is a measure of how “flat” or how “curved” the handwriting is. Curvature is calculated as the ratio of the signature path length to the word length. The path length is the sum of distances between each consecutive point in the sample (large number in the order of 10,000 pixels). The word length is the physical, or Euclidean, distance between the captured writing’s first and last point.
1. **Average curvature per stroke:** The *curvature* value is calculated for each individual stroke in the handwriting sample, then averaged.
1. **Cursitivity:** The ratio of the number of strokes to the number of pen-downs.
1. **Cursiveness:** The ratio of the horizontal length of the handwriting to the number of pen-downs.

1. **Top heaviness:** This is a measure of the proportion of the signature that lies above the vertical midpoint. This feature is calculated as the ratio of the number of points above the vertical midpoint to the total number of points. The midpoint can be calculated as mean and/or mode.
1. **Horizontal dispersion:** This is a measure of the horizontal spread of the signature. It is calculated as the ratio of the number of points right to the horizontal midpoint of the signature to the total number of points. The midpoint can be calculated as mean and/or mode.

Unused:  
1. **Stroke concavity:** This measures how close the average stroke is to being a straight line. Calculation is performed using linear regression on the points in the stroke to obtain the line-of-best-fit. This measures how well the points in the stroke “fit” or approximate that line.
1. **Mean ascender height:** This is the mean height of
“ascenders” in the handwriting. 
Ascenders are letters such
as ‘d’, ‘f’ and ‘t’ in which a part of the letter extends
above the main body of the sample.
 Formal detection of ascenders in the body of a signature involves computing
the mean of the data, as well as points at one quarter and
three quarters of the maximum height. The ascender’s
peaks are the local maxima in the y direction that are
above the three quarter mark. 
The distance between a local maximum and the y mean is found and this distance
is taken as the height of that ascender. The mean height
for all ascenders is used as the value for this feature.
1. **Descender depth:** Descenders are the opposite
of ascenders. They are letters such as ‘g’, ‘y’ and ‘j’ that
typically contain parts extending below the main body of
the sample. Finding the descender extremities is
done in a similar fashion to ascenders and uses the same
frequency histogram. The descender extremities are the
local minima in the y direction that fall below the lower
quarter of the sample. The depth value for each extremity
is measured as the distance between the local minimum
and the y mean expressed as a positive integer.

**Source:** Alan McCabe, Jarrod Trevathan and Wayne Read (2008) "Neural Network-based Handwritten Signature Verification"  
        https://www.researchgate.net/publication/235993403_Neural_Network-based_Handwritten_Signature_Verification

In [5]:
def stroke_detector(sign: pd.DataFrame):
    """Split a signature into strokes and attach velocity/acceleration columns.

    Velocity is the first difference of the 'x' and 'y' columns divided by 10,
    acceleration is the first difference of the velocity components; both are
    stored as magnitudes, with 0 for the first sample.

    A new stroke starts at sample ``i`` when the dot product of the velocity
    vectors at ``i`` and ``i-1`` is non-positive (the movement turned by 90
    degrees or more, or the pen did not move). The test is skipped for
    ``i == 1`` because sample 0 has no real velocity; comparing against the
    placeholder zero used to emit an empty leading stroke and drop sample 0.

    Side effect:
        Adds (or overwrites) the columns 'velocity' and 'acceleration' on
        ``sign`` in place.

    Args:
        sign: Samples of one signature with at least the columns 'x' and 'y'.

    Returns:
        List of strokes; each stroke is a list of row arrays taken from
        ``sign.to_numpy()`` as it was before the new columns were added.
        Every sample belongs to exactly one stroke. An empty frame yields [].
    """
    n_points = len(sign)
    # Snapshot the rows first so strokes do not contain the columns added below.
    sign_np = sign.to_numpy()

    if n_points == 0:
        sign['velocity'] = np.zeros(0)
        sign['acceleration'] = np.zeros(0)
        return []

    # Positional arrays: independent of the frame's index labels and much
    # faster than per-sample Series lookups.
    x = sign['x'].to_numpy(dtype=float)
    y = sign['y'].to_numpy(dtype=float)

    # NOTE(review): the divisor 10 is presumably the sampling interval - confirm.
    vx = np.concatenate(([0.0], np.diff(x) / 10))
    vy = np.concatenate(([0.0], np.diff(y) / 10))
    vel = np.sqrt(vx**2 + vy**2)

    ax = np.concatenate(([0.0], np.diff(vx)))
    ay = np.concatenate(([0.0], np.diff(vy)))
    acc = np.sqrt(ax**2 + ay**2)

    # direction[i] <= 0 means the movement direction changed between i-1 and i.
    direction = np.zeros(n_points)
    direction[1:] = vx[1:] * vx[:-1] + vy[1:] * vy[:-1]

    strokes = []
    stroke = [sign_np[0]]
    for i in range(1, n_points):
        if i > 1 and direction[i] <= 0:
            strokes.append(stroke)
            stroke = []
        stroke.append(sign_np[i])
    strokes.append(stroke)

    sign['velocity'] = vel
    sign['acceleration'] = acc

    return strokes

In [6]:
def Curvature(sign):
    """Return the ratio of travelled path length to end-to-end distance.

    Args:
        sign: Indexable sequence of points; elements 0 and 1 of every point
            are used as the two coordinates.

    Returns:
        Sum of the distances between consecutive points divided by the
        distance between the first and the last point. When the first and
        last point coincide the divisor is 0 and the result is inf or nan.
    """
    start, end = sign[0], sign[-1]
    dx_total = start[0] - end[0]
    dy_total = start[1] - end[1]
    word_length = np.sqrt(dx_total**2 + dy_total**2)

    path_length = 0
    for previous, current in zip(sign, sign[1:]):
        step_x = current[0] - previous[0]
        step_y = current[1] - previous[1]
        path_length += np.sqrt(step_x**2 + step_y**2)

    return (path_length / word_length)

In [7]:
""" Questions:
    Do we need a preprocess that removes dots, crossings and other fragments for some features?
    Rotation normalization??? baseline
"""

' Questions:\n    Do we need a preprocess that removes dots, crossings and other fragments for some features?\n    Rotation normalization??? baseline\n'

In [8]:
def FeatureExtraction(sign: pd.DataFrame):
    """Compute the 26 global features of one signature.

    Feature order in the returned list:
        0      duration (last 'time' value / 1000)
        1-3    width, height, aspect ratio (width / height)
        4      pen-down ratio (share of samples with non-zero pressure)
        5      number of pen-up transitions (gaps)
        6      mean distance between the last pen-down sample before a gap
               and the first pen-down sample after it (0 without such gaps)
        7      zero-pressure samples per gap (0 without gaps)
        8-11   mean, std, min, max of the pressure of pen-down samples
        12     horizontal velocity (width / duration)
        13-15  max, mean, std of the velocity magnitude
        16-18  max, mean, std of the acceleration magnitude
        19     number of strokes
        20     curvature (path length / distance between first and last sample)
        21     mean curvature per stroke
        22     cursitivity (strokes / pen-downs)
        23     cursiveness (width / pen-downs)
        24     top heaviness (share of pen-down samples above the mean y)
        25     horizontal dispersion (share of pen-down samples right of the mean x)

    Side effect:
        stroke_detector adds 'velocity' and 'acceleration' columns to ``sign``.

    Args:
        sign: Samples of one signature with the columns 'x', 'y', 'pressure'
            and 'time'.

    Returns:
        Plain list with the 26 feature values in the order given above.

    Raises:
        IndexError: If ``sign`` has no rows.
    """
    features = []  # np.ndarray?
    length = sign.shape[0]

    # Positional arrays: independent of the index labels and far cheaper than
    # the per-sample Series lookups inside the loop below.
    xs = sign['x'].to_numpy(dtype=float)
    ys = sign['y'].to_numpy(dtype=float)
    pressure = sign['pressure'].to_numpy()

    # 1 duration of the signature in seconds
    # NOTE(review): assumes 'time' is in milliseconds and starts at 0 - confirm
    duration = sign['time'].iat[-1]/1000
    features.append(duration)

    # 2-4 Width, Height, Aspect Ratio
    # NOTE(review): a perfectly flat signature (height 0) gives an infinite ratio
    width = sign.x.max() - sign.x.min()
    height = sign.y.max() - sign.y.min()
    aspectRatio = width / height
    features.append(width)
    features.append(height)
    features.append(aspectRatio)

    # 5 pen down ratio
    zeroes = int(np.count_nonzero(pressure == 0))
    penDownRatio = (length - zeroes) / length
    features.append(penDownRatio)

    # 6 number of gaps
    # It might be good to filter out small dots and straight lines here
    gaps = 0
    gap_lengths = []
    # Index of the last pen-down sample before the current gap. It must live
    # outside the loop: resetting it on every iteration made every gap
    # distance start at sample 0.
    begin = None
    for i in range(1, length):
        if pressure[i-1] > 0 and pressure[i] == 0:
            gaps += 1
            begin = i - 1
        elif pressure[i-1] == 0 and pressure[i] > 0 and begin is not None:
            # A leading hover (no pen-up before it) is not a gap, hence the
            # `begin is not None` guard.
            gap_lengths.append(
                np.sqrt((xs[i] - xs[begin])**2 + (ys[i] - ys[begin])**2))
            begin = None
    features.append(gaps)

    # 7 Component spacing: average over the gaps that were actually measured,
    # so a trailing pen-up without a following pen-down does not dilute it.
    features.append(sum(gap_lengths) / len(gap_lengths) if gap_lengths else 0)
    # 8 Component time spacing (in samples per gap)
    features.append(zeroes / gaps if gaps > 0 else 0)

    # 9 mean, std, min, max pressure (not sure whether all of them are needed)
    # Only rows where the pen touches the tablet
    signWOzeroes = sign[sign.pressure > 0]
    features.append(signWOzeroes.pressure.mean())
    features.append(signWOzeroes.pressure.std())
    features.append(signWOzeroes.pressure.min())
    features.append(signWOzeroes.pressure.max())

    # Calculating velocity, acceleration and strokes
    # (adds the 'velocity' and 'acceleration' columns to `sign`)
    strokes = stroke_detector(sign)

    # 10-11 Velocity and acceleration features
    # Horizontal velocity
    horizontal_v = width / duration
    features.append(horizontal_v)

    features.append(sign.velocity.max())
    features.append(sign.velocity.mean())
    features.append(sign.velocity.std())
    # The acceleration column holds magnitudes, so its mean is already the
    # average absolute acceleration.
    features.append(sign.acceleration.max())
    features.append(sign.acceleration.mean())
    features.append(sign.acceleration.std())

    # 12 Number of strokes
    nb_strokes = len(strokes)
    features.append(nb_strokes)

    # 13 Curvature
    # Path length is the sum of the distances between consecutive samples;
    # word length is the Euclidean distance between the first and last sample.
    # NOTE(review): a signature that ends where it started gives inf/nan here
    path_length = np.sqrt(np.diff(xs)**2 + np.diff(ys)**2).sum()
    word_length = np.sqrt((xs[0] - xs[-1])**2 + (ys[0] - ys[-1])**2)
    features.append(path_length / word_length)

    # 14 Average curvature per stroke
    # Strokes with fewer than 3 samples have no meaningful curvature:
    # two samples form a straight line (1), shorter ones count as 0.
    curvatures = []
    for s in strokes:
        if len(s) > 2:
            curvatures.append(Curvature(s))
        elif len(s) == 2:
            curvatures.append(1)
        else:
            curvatures.append(0)
    features.append(sum(curvatures) / len(curvatures))

    # 15 Cursitivity
    pendowns = gaps + 1
    features.append(nb_strokes / pendowns)
    # 16 Cursiveness
    features.append(width / pendowns)

    # 17-18 use only the pen-down samples, for the midpoint, the count and the
    # denominator alike (dividing by all samples mixed in the pen-down ratio).
    pen_down_count = signWOzeroes.shape[0]
    # 17 Top heaviness
    height_mean = signWOzeroes.y.mean()
    above = signWOzeroes[signWOzeroes.y > height_mean].shape[0]
    features.append(above / pen_down_count if pen_down_count else 0)
    # 18 Horizontal dispersion
    width_mean = signWOzeroes.x.mean()
    right = signWOzeroes[signWOzeroes.x > width_mean].shape[0]
    features.append(right / pen_down_count if pen_down_count else 0)

    # Not implemented:
    # 19 Stroke concavity: for every stroke, the square root of the summed
    #    squared distances between each sample and its closest point on the
    #    stroke's line of best fit.
    # 20 Mean ascender height: local maxima above sign.y.quantile(0.75)?
    # 21 Mean descender depth: local minima below sign.y.quantile(0.25)?

    return features

In [9]:
# Calculating feature vectors for all signatures
# NOTE: FeatureExtraction also adds 'velocity' and 'acceleration' columns to
# each sign.data frame (via stroke_detector), so the raw frames are modified.
for sign in devSigns:
    sign.feature_vector = FeatureExtraction(sign.data)

for sign in evalSigns:
    sign.feature_vector = FeatureExtraction(sign.data)

In [10]:
# Sanity check: length of one feature vector (26 in the saved output).
print(len(devSigns[0].feature_vector))

26


In [11]:
# Serialize signatures with feature vectors
# (the pickled objects include the sign.data frames with the added
# 'velocity' and 'acceleration' columns)
with open('extractedDevSigns.pkl', 'wb') as out_file:
    pickle.dump(devSigns, out_file)
    
with open('extractedEvalSigns.pkl', 'wb') as out_file:
    pickle.dump(evalSigns, out_file)

In [12]:
# Deserialization of signatures with feature vectors
# Reloads the files written by the previous cell, so the notebook can be
# resumed from here without recomputing the features.
# NOTE(security): pickle.load can execute arbitrary code; load trusted files only.
with open('extractedDevSigns.pkl', 'rb') as in_file:
    devSigns = pickle.load(in_file)

with open('extractedEvalSigns.pkl', 'rb') as in_file:
    evalSigns = pickle.load(in_file)

In [13]:
def SignaturesToUsers(inputSigns: List[Sign]) -> List[User]:
    """Group signatures into one User per user ID.

    A User is created for every ID between the smallest and the largest
    userID found in ``inputSigns``, including IDs without any signature
    (their ``signatures`` list is empty), so the position of a user in the
    result is ``userID - smallest userID``. Signatures keep their input order
    within each user.

    Args:
        inputSigns: Signatures to group; they do not have to be sorted.

    Returns:
        Users in ascending userID order; an empty list for empty input.
    """
    if len(inputSigns) == 0:
        return []

    # Single pass over the signatures; the previous version rescanned the
    # whole list once per user ID.
    signs_by_user = {}
    for sign in inputSigns:
        signs_by_user.setdefault(sign.userID, []).append(sign)

    first_id = min(signs_by_user)
    last_id = max(signs_by_user)
    return [
        User(userID=user_id, signatures=signs_by_user.get(user_id, []))
        for user_id in range(first_id, last_id + 1)
    ]

In [14]:
# Assign each sign to its user
# (one User per ID in the range covered by each set; see SignaturesToUsers)
devUsers = SignaturesToUsers(devSigns)
evalUsers = SignaturesToUsers(evalSigns)

In [15]:
# Sanity check: number of users per set and the number of signatures of two
# users picked by list position from each set.
print(len(devUsers))
print(len(devUsers[50].signatures))
print(len(devUsers[350].signatures))
print(len(evalUsers))
print(len(evalUsers[50].signatures))
print(len(evalUsers[120].signatures))


498
50
28
372
50
28


In [16]:
# Serialize users
# (each User carries its Sign objects, including raw data and feature vectors)
with open('devUsers.pkl', 'wb') as out_file:
    pickle.dump(devUsers, out_file)
    
with open('evalUsers.pkl', 'wb') as out_file:
    pickle.dump(evalUsers, out_file)