In [2]:
import os
from collections import Counter
from dataclasses import dataclass
from glob import glob
from typing import List

import numpy as np
import pandas as pd

In [2]:
@dataclass
class Sign:
    """One signature sample loaded from a single file."""
    userID: int          # numeric user ID parsed from the file name
    genuine: bool        # True when the file name carries the 'g' flag
    data: pd.DataFrame   # the samples: x, y, time and (stylus only) pressure columns


@dataclass
class User:
    """A user ID together with the signatures collected for that user."""
    userID: int
    # typing.List matches the annotations used by the loader functions and,
    # unlike the built-in generic list[...], also works before Python 3.9.
    signatures: List[Sign]

In [3]:
def DevelopmentStylus(path: str) -> List[Sign]:
    """Load the MCYT and BiosecureID stylus signatures of the development set.

    The user ID is read from characters 1-4 of each file's base name and the
    genuine flag from character 6 ('g' means genuine).

    Args:
        path: Glob pattern matching the signature ``.txt`` files.

    Returns:
        One ``Sign`` per loaded file, in sorted path order, each with the
        columns ``x``, ``y``, ``time`` (shifted to start at 0) and
        ``pressure``. Files with a user ID above 498 (e-BioSign DS1-2) are
        skipped.
    """
    signatures = []
    # sorted() makes the result independent of the directory listing order.
    for file in sorted(glob(path)):
        # Parse the base name so the offsets do not depend on the directory
        # depth or on the path separator the OS uses.
        name = os.path.basename(file)
        ID = int(name[1:5])
        real = name[6] == 'g'

        #e-BioSign DS1-2: not loaded. `continue` rather than `break`, so files
        # listed after the first skipped one are still read.
        if ID > 498:
            continue

        # The first line is skipped; the experiment cell reads it as one integer.
        df = pd.read_csv(file, sep=' ', skiprows=1, header=None)

        #MCYT:
        if 0 <= ID <= 230:
            df = df.drop(columns=df.columns[[3, 4]])
        #BiosecureID:
        elif 231 <= ID <= 498:
            df = df.drop(columns=df.columns[[3, 4, 5]])
        #pressure is the only extra info, so we don't drop anything else
        df.columns = ("x", "y", "time", "pressure")
        #start time from 0
        df["time"] = df["time"] - df["time"].iloc[0]

        signatures.append(Sign(userID=ID, genuine=real, data=df))
    return signatures

DevStylusSigns = DevelopmentStylus("DeepSignDB/DeepSignDB/Development/stylus/*.txt")
# Also bind `stylusDevSigns`, the name cells of this notebook have used for
# this list, so they do not raise NameError on a fresh kernel.
stylusDevSigns = DevStylusSigns

In [None]:
#MCYT Users with DevStylus signatures added
# Group the signatures by user in one pass instead of rescanning the whole
# list once per user.
devSignsByUser = {userID: [] for userID in range(231)}
for s in DevStylusSigns:
    if s.userID in devSignsByUser:
        devSignsByUser[s.userID].append(s)

devUsers = [User(userID=userID, signatures=signs) for userID, signs in devSignsByUser.items()]
# `users` is rebound by the evaluation cell; `devUsers` keeps this list reachable.
users = devUsers

# Print a one-line summary per user; printing the User objects themselves
# would dump every signature DataFrame.
for user in devUsers:
    print(f"user {user.userID}: {len(user.signatures)} signatures")

In [5]:
def EvalStylus(path: str) -> List[Sign]:
    """Load the stylus signatures of the evaluation set.

    The user ID is read from characters 1-4 of each file's base name and the
    genuine flag from character 6 ('g' means genuine).

    Args:
        path: Glob pattern matching the signature ``.txt`` files.

    Returns:
        One ``Sign`` per file, in sorted path order, each with the columns
        ``x``, ``y``, ``time`` (shifted to start at 0) and ``pressure``.
    """
    signatures = []
    # sorted() makes the result independent of the directory listing order.
    for file in sorted(glob(path)):
        # Parse the base name so the offsets do not depend on the directory
        # depth or on the path separator the OS uses.
        name = os.path.basename(file)
        ID = int(name[1:5])
        real = name[6] == 'g'

        df = pd.read_csv(file, sep=' ', skiprows=1, header=None)

        #MCYT:
        if 0 <= ID <= 100:
            df = df.drop(columns=df.columns[[3, 4]])
        #BiosecureID and BiosecureDS2:
        elif 101 <= ID <= 372:
            df = df.drop(columns=df.columns[[3, 4, 5]])
        #e-BioSign DS1-2:
        #pressure is the only extra info, so we don't drop anything else
        df.columns = ("x", "y", "time", "pressure")
        #start time from 0
        df["time"] = df["time"] - df["time"].iloc[0]

        signatures.append(Sign(userID=ID, genuine=real, data=df))
    return signatures

evalStylusSigns = EvalStylus("DeepSignDB/DeepSignDB/Evaluation/stylus/*.txt")

In [None]:
#MCYT Users with EvalStylus signatures added
# Group the signatures by user in one pass instead of rescanning the whole
# list once per user.
evalSignsByUser = {userID: [] for userID in range(101)}
for s in evalStylusSigns:
    if s.userID in evalSignsByUser:
        evalSignsByUser[s.userID].append(s)

evalUsers = [User(userID=userID, signatures=signs) for userID, signs in evalSignsByUser.items()]
# `users` is also assigned by the development cell; `evalUsers` gives this
# list a name of its own.
users = evalUsers

# Print a one-line summary per user; printing the User objects themselves
# would dump every signature DataFrame.
for user in evalUsers:
    print(f"user {user.userID}: {len(user.signatures)} signatures")

In [None]:
def DevelopmentFinger(path: str) -> List[Sign]:
    """Load the finger-drawn signatures of the development set.

    The user ID is read from characters 1-4 of each file's base name and the
    genuine flag from character 6 ('g' means genuine).

    Args:
        path: Glob pattern matching the signature ``.txt`` files.

    Returns:
        One ``Sign`` per file, in sorted path order, each with the columns
        ``x``, ``y`` and ``time`` (shifted to start at 0).
    """
    signatures = []
    # sorted() makes the result independent of the directory listing order.
    for file in sorted(glob(path)):
        df = pd.read_csv(file, sep=' ', skiprows=1, header=None)
        # Column 3 is discarded; only x, y and time are kept.
        df = df.drop(columns=df.columns[[3]])
        df.columns = ("x", "y", "time")
        #start time from 0
        df["time"] = df["time"] - df["time"].iloc[0]

        # Parse the base name so the offsets do not depend on the directory
        # depth or on the path separator the OS uses.
        name = os.path.basename(file)
        ID = int(name[1:5])
        real = name[6] == 'g'

        signatures.append(Sign(userID=ID, genuine=real, data=df))
    return signatures

devFingerSigns = DevelopmentFinger("DeepSignDB/DeepSignDB/Development/finger/*.txt")

In [None]:
def EvalFinger(path: str) -> List[Sign]:
    """Load the finger-drawn signatures of the evaluation set.

    The user ID is read from characters 1-4 of each file's base name and the
    genuine flag from character 6 ('g' means genuine).

    Args:
        path: Glob pattern matching the signature ``.txt`` files.

    Returns:
        One ``Sign`` per file, in sorted path order, each with the columns
        ``x``, ``y`` and ``time`` (shifted to start at 0).
    """
    signatures = []
    # sorted() makes the result independent of the directory listing order.
    for file in sorted(glob(path)):
        df = pd.read_csv(file, sep=' ', skiprows=1, header=None)
        # Column 3 is discarded; only x, y and time are kept.
        df = df.drop(columns=df.columns[[3]])
        df.columns = ("x", "y", "time")
        #start time from 0
        df["time"] = df["time"] - df["time"].iloc[0]

        # Parse the base name so the offsets do not depend on the directory
        # depth or on the path separator the OS uses.
        name = os.path.basename(file)
        ID = int(name[1:5])
        real = name[6] == 'g'

        signatures.append(Sign(userID=ID, genuine=real, data=df))
    return signatures

evalFingerSigns = EvalFinger("DeepSignDB/DeepSignDB/Evaluation/finger/*.txt")

In [26]:
# A little experimenting with the lengths of the signatures
files = glob("DeepSignDB/DeepSignDB/Development/stylus/*.txt")
length = []
for f in files:
    # Read the user ID from the base name rather than from a fixed offset into
    # the full path, which breaks as soon as the directory prefix changes.
    # `continue` (not `break`): glob order is unspecified, so files with a
    # smaller ID may still follow.
    if int(os.path.basename(f)[1:5]) > 498:
        continue
    # The first line of each file is parsed as one integer and collected as
    # the signature's length. `with` closes the handle on every path.
    with open(f, 'r') as file:
        length.append(int(file.readline()))
    

In [35]:
length.sort()
if length:
    # Counter counts every value in one pass; list.count per distinct value is
    # quadratic. Among equally frequent values the first one encountered wins,
    # which is the smallest because the list is sorted.
    mostCommonLength = Counter(length).most_common(1)[0][0]
    print("avg: " , sum(length)/len(length))
    print("modusz: ", mostCommonLength)
    # Upper median: for an even count this is the larger of the two middle values.
    print("median: ", length[len(length)//2])
else:
    print("no signature lengths collected")

avg:  699.9727952010103
modusz:  372
median:  528


In [15]:
# Min-max scaling will not be needed, because the pixel width/height has to be kept.
# A plain shift of the origin to 0 tidies things up, although it may be unnecessary.
# `DevStylusSigns` is the name the loader cell assigns.
for sign in DevStylusSigns:
    sign.data["x"] = sign.data["x"] - sign.data["x"].min()
    sign.data["y"] = sign.data["y"] - sign.data["y"].min()

In [None]:
# Plot signature 10 first, then every 50th signature from index 0 to 450.
# `DevStylusSigns` is the name the loader cell assigns.
for idx in [10] + [i*50 for i in range(10)]:
    DevStylusSigns[idx].data.plot(x='x', y='y', kind='line', title=f"signature #{idx}")

In [None]:
# Experimenting with the DataFrame

# COPY IS IMPORTANT if you don't want to delete original data
# `DevStylusSigns` is the name the loader cell assigns.
s = DevStylusSigns[6].data.copy()
s_del = s.copy()

s.plot(x='x', y='y', kind='line')

# Keep only the rows whose pressure is above 0.
s_filtered = s[s.pressure > 0]

# delete all rows where column pressure == 0
s_del = s_del.drop(index=s.loc[s.pressure == 0].index)
s_del.plot(x='x', y='y', kind='line')

# Features:

1. **Duration:** The time taken to perform the signature, in seconds.
1. **Width/Length:** This is the horizontal distance measured between the two most extreme points in the **X** direction.
1. **Height:** This is the distance between the lowest point in a word and the highest point in a word. (**Y** coordinate)
1. **Aspect ratio:** This is the ratio of the writing length to the writing height.
1. **Pen-down ratio:** This is the ratio of the pen-down time to the total writing time. Calculation is performed by taking the ratio of the number of points with non-zero pressure to the total number of points.
1. **Number of gaps/pen-ups:** The number of times the pen is lifted while signing after the first contact with the tablet and excluding the final pen-lift.

1. **Pressure**
> - **mean tip pressure**
> - **std of tip pressure**
> - **max tip pressure**
> - **min tip pressure**
    
8. **Velocity**
> - **Horizontal velocity:** This is the average velocity over the **X** direction. It measures how fast the signature moves horizontally. This feature is calculated as the ratio of the signature length to the duration of the signature.
> - **Max velocity:** Calculation is performed in terms of component velocities vx and vy, calculated as the first derivative of the x and y streams respectively.
> - **Average velocity**
> - **Std of velocity**
9. **Acceleration**
> - **Average absolute acceleration:** The average absolute acceleration captures the mean rate of velocity change in both positive and negative directions.
> - **Max acceleration**
> - **Std of acceleration**
> - **Max deceleration**
    
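10. **number of strokes:** This feature is indicative of how many segments or states the handwriting goes through during the signature’s production. Calculated as the number of segments obtained by splitting the trajectory wherever the direction of movement reverses, i.e. wherever the dot product of two consecutive velocity vectors is negative.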
1. **curvature:** This is a measure of how “flat” or how “curved” the handwriting is. Curvature is calculated as the ratio of the signature path length to the word length. The path length is the sum of distances between each consecutive point in the sample (large number in the order of 10,000 pixels). The word length is the physical, or Euclidean, distance between the captured writing’s first and last point.
1. **average curvature per stroke:** The *curvature* value is calculated for each individual stroke in the handwriting sample, then averaged.
1. **cursitivity:** The ratio of the number of strokes to the number of pen-downs.
1. **cursiveness:** The ratio of the horizontal length of the handwriting to the number of pen-downs.
1. **stroke concavity:** This measures how close the average stroke is to being a straight line.

1. **top heaviness:** This is a measure of the proportion of the signature that lies above the vertical midpoint. This feature is calculated as the ratio of the number of points above the vertical midpoint to the total number of points. The midpoint can be calculated as mean and/or mode.
1. **horizontal dispersion:** This is a measure of the horizontal spread of the signature. It is calculated as the ratio of the number of points to the left of the horizontal midpoint of the signature to the total number of points. The midpoint can be calculated as mean and/or mode.

1. **Component physical spacing:** Calculation involves taking the Euclidean distance between the last point sampled in a component and the first point sampled in the following component (if any). This value is calculated for each pen-up instance and averaged to obtain the final feature value.
1. **Component time spacing:** This is the average duration of the pen-up instances in a signature.

1. **mean ascender height:** This is the mean height of “ascenders” in the handwriting. Ascenders are letters such as ‘d’, ‘f’ and ‘t’ in which a part of the letter extends above the main body of the sample. Formal detection of ascenders in the body of a signature involves computing the mean of the data, as well as points at one quarter and three quarters of the maximum height. The ascender’s peaks are the local maxima in the y direction that are above the three quarter mark. The distance between a local maximum and the y mean is found and this distance is taken as the height of that ascender. The mean height for all ascenders is used as the value for this feature.
1. **descender depth:** Descenders are the opposite
of ascenders. They are letters such as ‘g’, ‘y’ and ‘j’ that
typically contain parts extending below the main body of
the sample. Finding the descender extremities is
done in a similar fashion to ascenders and uses the same
frequency histogram. The descender extremities are the
local minima in the y direction that fall below the lower
quarter of the sample. The depth value for each extremity
is measured as the distance between the local minimum
and the y mean expressed as a positive integer.

In [3]:
def _split_strokes(x_vel, y_vel):
    """Group row positions into strokes, cutting where the movement direction reverses."""
    if len(x_vel) == 0:
        return []
    strokes = []
    current = [0]
    for i in range(1, len(x_vel)):
        # A negative dot product of consecutive velocity vectors means the
        # direction changed; a non-negative one keeps the row in the same stroke.
        dot_product = x_vel[i] * x_vel[i-1] + y_vel[i] * y_vel[i-1]
        if dot_product < 0:
            strokes.append(current)
            # Start a fresh list so already stored strokes are not altered.
            current = []
        current.append(i)
    strokes.append(current)
    return strokes


def FeatureExtract(sign: pd.DataFrame) -> List[float]:
    """Compute the global features of one signature.

    Args:
        sign: Samples with ``x``, ``y``, ``time`` and ``pressure`` columns.
            The columns ``velocity`` and ``acceleration`` are added to this
            frame in place; pass a copy to keep the original untouched.

    Returns:
        A list of ten values, in this order: duration, width, height, aspect
        ratio, pen-down ratio, number of gaps, mean pressure, std of pressure,
        max pressure, min pressure.

    Raises:
        ValueError: If ``sign`` has no rows.
    """
    n_points = sign.shape[0]
    if n_points == 0:
        raise ValueError("cannot extract features from an empty signature")

    x = sign["x"].to_numpy(dtype=float)
    y = sign["y"].to_numpy(dtype=float)
    t = sign["time"].to_numpy(dtype=float)
    p = sign["pressure"].to_numpy(dtype=float)

    features = []
    #1 duration of the signature: elapsed time / 1000
    # (seconds if `time` is in milliseconds -- presumably; TODO confirm the unit)
    duration = float(t[-1] - t[0]) / 1000
    features.append(duration)
    #2-4 Width, Height, Aspect Ratio
    width = float(x.max() - x.min())
    height = float(y.max() - y.min())
    # A signature with zero height has no defined aspect ratio.
    aspect_ratio = width / height if height else float("nan")
    features.append(width)
    features.append(height)
    features.append(aspect_ratio)
    #5 pen down ratio: share of points with non-zero pressure
    zeroes = int((p == 0).sum())
    pen_down_ratio = (n_points - zeroes) / n_points
    features.append(pen_down_ratio)
    #6 number of gaps: transitions from positive pressure to zero pressure
    gaps = int(((p[:-1] > 0) & (p[1:] == 0)).sum())
    features.append(gaps)
    #7-10 mean, std, max, min pressure (not sure yet whether all of them are needed)
    features.append(float(sign["pressure"].mean()))
    features.append(float(sign["pressure"].std()))
    features.append(float(p.max()))
    features.append(float(p.min()))

    #Calculate velocity and acceleration
    # Difference between consecutive points divided by 10; the first row gets 0.
    # NOTE(review): whether dividing by 10 is right is an open question --
    # presumably it stands for the time between samples; confirm.
    step = 10
    x_vel = np.concatenate(([0.0], np.diff(x) / step))
    y_vel = np.concatenate(([0.0], np.diff(y) / step))
    vel = np.sqrt(x_vel**2 + y_vel**2)
    # Acceleration is the change between consecutive velocities; first row 0.
    x_acc = np.concatenate(([0.0], np.diff(x_vel)))
    y_acc = np.concatenate(([0.0], np.diff(y_vel)))
    acc = np.sqrt(x_acc**2 + y_acc**2)

    sign['velocity'] = vel
    sign['acceleration'] = acc

    # TODO: the stroke segmentation is computed but no stroke-based feature is
    # appended to `features` yet.
    strokes = _split_strokes(x_vel, y_vel)

    return features
    

In [None]:
    # Open questions for later work, written as a bare string expression;
    # nothing else runs in this cell.
    """
    Need a preprocess that removes dots, crossings and other fragments for some features?
    Rotation normalization??? baseline
    """
