In [1]:
import os
import pickle
from dataclasses import dataclass
from glob import glob
from typing import List, Optional

import numpy as np
import pandas as pd

In [2]:
@dataclass
class Sign:
    """A single signature recording together with its label.

    The loaders in this notebook create instances with ``feature_vector``
    unset, so the field is optional and defaults to ``None``.
    """

    # Numeric user identifier parsed from the signature file name.
    userID: int
    # True when the file is marked as genuine ('g'), False otherwise.
    genuine: bool
    # Time series of the recording; the loaders store the columns
    # "x", "y", "time" and "pressure".
    data: pd.DataFrame
    # Not computed at load time; the loaders leave it as None.
    feature_vector: Optional[np.ndarray] = None
  
@dataclass
class User:
    """All signatures that belong to one user."""

    # Numeric user identifier.
    userID: int
    # Signatures of this user. ``typing.List`` is used to match the loader
    # signatures in this file (the builtin generic ``list[...]`` needs
    # Python 3.9+); the element type is quoted so the annotation is not
    # evaluated when the class is created.
    signatures: List["Sign"]

In [3]:
# The feature vector needs pressure data, so only the stylus recordings are used.
# Example glob pattern: "DeepSignDB/DeepSignDB/Development/stylus/*.txt"
def DevelopmentStylus(path: str) -> List["Sign"]:
    """Load the stylus signatures of the DeepSignDB development partition.

    Every file matched by ``path`` is read as a space-separated table whose
    first line is skipped. Only the MCYT (user IDs 0-230) and BiosecureID
    (user IDs 231-498) subsets are kept; files with a higher user ID
    (e-BioSign DS1-2) are skipped. Each kept recording is reduced to the
    columns ``x``, ``y``, ``time`` and ``pressure``, leading zero-pressure
    samples are removed, and ``time`` is shifted so that it starts at 0.

    Args:
        path: Glob pattern matching the signature ``.txt`` files.

    Returns:
        One ``Sign`` per kept file, in sorted path order, with
        ``feature_vector`` left as ``None``.

    Raises:
        ValueError: If the user ID cannot be parsed from a file name.
    """
    signatures = []
    # glob() returns files in a filesystem-dependent order; sort so the
    # result (and anything pickled from it) is reproducible.
    for file in sorted(glob(path)):
        # Parse the bare file name instead of a fixed "/"-separated path
        # component, so the loader works for any directory depth and with
        # either path separator.
        # NOTE(review): assumes names look like "u<4-digit ID>_<g|s>_..." --
        # this is what the original fixed offsets imply; confirm against the
        # dataset documentation.
        name = os.path.basename(file)
        ID = int(name[1:5])
        real = name[6] == 'g'

        # e-BioSign DS1-2: skip the file but keep scanning the rest, so the
        # result does not depend on the order in which files are visited.
        if ID > 498:
            continue

        df = pd.read_csv(file, sep=' ', skiprows=1, header=None)

        # Pressure is the only extra information that is kept.
        if ID <= 230:
            # MCYT
            unused_columns = [3, 4]
        else:
            # BiosecureID
            unused_columns = [3, 4, 5]
        df = df.drop(columns=df.columns[unused_columns])
        df.columns = ["x", "y", "time", "pressure"]

        # There is one signature with zero pressure at the start: drop the
        # leading samples up to the first non-zero pressure value.
        if df["pressure"].iloc[0] == 0:
            first_non_zero_pressure_index = df["pressure"].ne(0).idxmax()
            df = df.loc[first_non_zero_pressure_index:].reset_index(drop=True)

        # Start time from 0.
        df["time"] = df["time"] - df["time"].iloc[0]

        signatures.append(Sign(userID=ID, genuine=real, data=df, feature_vector=None))
    return signatures

In [4]:
# Example glob pattern: "DeepSignDB/DeepSignDB/Evaluation/stylus/*.txt"
def EvalStylus(path: str) -> List["Sign"]:
    """Load the stylus signatures of the DeepSignDB evaluation partition.

    Every file matched by ``path`` is read as a space-separated table whose
    first line is skipped. Only the MCYT (user IDs 0-100) and
    BiosecureID / BiosecureDS2 (user IDs 101-372) subsets are kept; files
    with a higher user ID (e-BioSign DS1-2) are skipped. Each kept recording
    is reduced to the columns ``x``, ``y``, ``time`` and ``pressure`` and
    ``time`` is shifted so that it starts at 0. Recordings whose timestamps
    are still 0 at the sixth sample get synthetic timestamps in steps of 10.

    Args:
        path: Glob pattern matching the signature ``.txt`` files.

    Returns:
        One ``Sign`` per kept file, in sorted path order, with
        ``feature_vector`` left as ``None``.

    Raises:
        ValueError: If the user ID cannot be parsed from a file name.
    """
    signatures = []
    # glob() returns files in a filesystem-dependent order; sort so the
    # result (and anything pickled from it) is reproducible.
    for file in sorted(glob(path)):
        # Parse the bare file name instead of a fixed "/"-separated path
        # component, so the loader works for any directory depth and with
        # either path separator.
        # NOTE(review): assumes names look like "u<4-digit ID>_<g|s>_..." --
        # this is what the original fixed offsets imply; confirm against the
        # dataset documentation.
        name = os.path.basename(file)
        ID = int(name[1:5])
        real = name[6] == 'g'

        # e-BioSign DS1-2: skip the file but keep scanning the rest, so the
        # result does not depend on the order in which files are visited.
        if ID > 372:
            continue

        df = pd.read_csv(file, sep=' ', skiprows=1, header=None)

        # Pressure is the only extra information that is kept.
        if ID <= 100:
            # MCYT
            unused_columns = [3, 4]
        else:
            # BiosecureID and BiosecureDS2
            unused_columns = [3, 4, 5]
        df = df.drop(columns=df.columns[unused_columns])
        df.columns = ["x", "y", "time", "pressure"]

        # Start time from 0.
        df["time"] = df["time"] - df["time"].iloc[0]

        # Fix signatures recorded with 0 time: if the sixth sample still has
        # time 0, replace the column with evenly spaced timestamps.
        # The length guard keeps very short recordings from raising.
        # NOTE(review): the step of 10 presumably means 10 ms per sample -- confirm.
        if len(df) > 5 and df["time"].iloc[5] == 0:
            df["time"] = range(0, len(df) * 10, 10)

        signatures.append(Sign(userID=ID, genuine=real, data=df, feature_vector=None))
    return signatures

In [5]:
# Read the stylus recordings of the development partition ...
devSigns = DevelopmentStylus(path="DeepSignDB/DeepSignDB/Development/stylus/*.txt")
# ... and of the evaluation partition.
evalSigns = EvalStylus(path="DeepSignDB/DeepSignDB/Evaluation/stylus/*.txt")

In [6]:
# Number of loaded signatures per partition, one per line.
# Counts seen in the recorded run: development 19004, evaluation 15696.
print(len(devSigns), len(evalSigns), sep="\n")

19004
15696


In [7]:
# Serialize the raw signatures: one pickle file per partition
for pkl_name, sign_list in (('rawDevSigns.pkl', devSigns), ('rawEvalSigns.pkl', evalSigns)):
    with open(pkl_name, 'wb') as out_file:
        pickle.dump(sign_list, out_file)