In [1]:
import pandas as pd
import copy
import math
import random

In [2]:
# Name used to pick the owner's columns out of the distributions table.
OWNER_NAME = "Mariia Turchina"
# Text that the key presses of a valid entry have to spell.
PASSWORD = "bridport20"

In [3]:
class EntryDivider:
    """
    Separates and cleans each entry from person's dataset

    An entry is kept only when every recorded key belongs to the password
    and its key presses, ordered by time, spell the password exactly.
    """
    
    def __init__(self, df, password = None):
        """
        Parameters
        ----------
        df : DataFrame
            Recorded events with the columns "entry", "time", "key_char"
            and "event".
        password : str, optional
            Password the entries are validated against. When omitted the
            notebook-wide PASSWORD constant is used.
        """
        self.df = df
        # Only fall back to the global when no explicit password was given.
        self.password = PASSWORD if password is None else password
        self.nr_entries = self._count_entries()
        self.entries = self.divide_entries()
        
    def _count_entries(self):
        """
        Returns the highest entry number plus one, or 0 for an empty dataset
        """
        highest = self.df["entry"].max() if len(self.df) else None
        if highest is None or pd.isna(highest):
            return 0
        # int() keeps range() usable even when the column was read as floats.
        return int(highest) + 1
        
    def divide_entries(self):
        """
        Separates each entry from person's dataset

        Returns the list of clean entries (one DataFrame each, sorted by time
        and re-timed to start at 0). Rejected entry numbers are printed.
        """
        entries = []
        for entry_i in range(self.nr_entries):
            curr_entry = self.df[self.df["entry"] == entry_i]
            curr_entry = curr_entry.sort_values(by = ["time"]).reset_index(drop = True)
            curr_entry = self._reset_time(curr_entry)
            curr_entry, is_still_clean = self._remove_faulty(curr_entry)
            if (is_still_clean and self._is_correct(curr_entry)):
                entries.append(curr_entry)
            else:
                print("not clean data, entry = ", entry_i)
            
        return entries
    
    def _reset_time(self, entry):
        """
        Resets times of the entry to their relative times

        Returns a new frame; the frame passed in is not modified.
        """
        return entry.assign(time = entry["time"] - entry["time"].min())
    
    def _remove_faulty(self, entry):
        """
        Checks if entry doesnt have any other pressed keys

        Returns a copy of the entry together with a flag that is True when
        every key_char is a single character of the password. Nothing is
        removed from the entry itself.
        """
        clean_entry = entry.copy()
        
        # Comparing against the individual characters (instead of a substring
        # test on the password) rejects missing values, empty strings and
        # multi-character key names without raising.
        allowed_keys = set(self.password)
        is_clean = bool(clean_entry["key_char"].isin(allowed_keys).all())
        return clean_entry, is_clean
    
    def _is_correct(self, entry):
        """
        Checks if entry corresponds to password
        """
        presses = entry[entry["event"] == "KeyPress"]
        presses = presses.sort_values(by = ["time"]).reset_index(drop = True)
        entered_text = "".join(list(presses["key_char"]))
        return entered_text == self.password
    

In [4]:
class EventTransformer:
    """
    Log-transforms the event times of every entry

    Each entry is re-ordered by key character and then by time. The frames
    passed in are left untouched; the results are new frames.
    """
    
    def __init__(self, entries): #[DataFrame]
        self.original_entries = entries
        self.transformed_entries = self.transform_data()

    
    def transform_data(self):
        """
        Transforms entries

        Returns one transformed DataFrame per original entry, in the same order.
        """
        return [self._transform_entry(entry) for entry in self.original_entries]
    
    
    def _transform_entry(self, entry):
        """
        Sorts a single entry and replaces its "time" column by the transformed times
        """
        # sort_values already returns a new frame, so no extra copy is needed.
        ordered = entry.sort_values(by = ["key_char", "time"]).reset_index(drop = True)
        # Replacing the whole column at once avoids writing floats cell by cell
        # into a column that may hold integers.
        ordered["time"] = ordered["time"].map(self._transform)
        return ordered
    
    
    def _transform(self, time):
        """
        Natural log of the time, shifted by 0.01 so that a time of 0 stays finite
        """
        return math.log(time + 0.01)

In [5]:
class FeatureConstructor:
    """
    Builds the timing features of a single entry

    Two dictionaries are produced: how long each key was held down
    (presstimes) and how much time passed between consecutive key presses
    (between_keypresses). Keys of both have the form "<index>: <label>".
    """
    
    def __init__(self, check_entry, password = None):
        """
        Parameters
        ----------
        check_entry : DataFrame
            Events of one entry with the columns "key_char", "time" and "event".
        password : str, optional
            Password the feature names are derived from. When omitted the
            notebook-wide PASSWORD constant is used.
        """
        self.check_entry = check_entry
        # Only fall back to the global when no explicit password was given.
        self.password = PASSWORD if password is None else password
        self.presstimes = self._get_presstimes()
        self.between_keypresses = self._get_between_keypresses()
        
    
    def _get_presstimes(self):
        """
        Calculates the time each key was pressed

        Events are ordered by key character and time, and every two
        consecutive rows are treated as one press/release pair. Returns None
        (after printing a message) when the number of events is odd.
        """
        # sort_values returns a new frame, so the entry itself is not modified.
        entry = self.check_entry.sort_values(by = ["key_char", "time"]).reset_index(drop = True)
        
        if (len(entry) % 2 != 0):
            print("we've got uneven pressing/releasing for some reason \n")
            return
        
        # Every expected feature starts with an empty-list placeholder that is
        # overwritten below when the entry contains the matching pair.
        sorted_password = ''.join(sorted(self.password))
        elapsed_times = {self._key_index(i, char): [] for i, char in enumerate(sorted_password)}
        
        times = entry["time"]
        key_chars = entry["key_char"]
        for pair_i in range(len(entry) // 2):
            first, second = 2 * pair_i, 2 * pair_i + 1
            elapsed_time = times.iloc[second] - times.iloc[first]
            elapsed_times[self._key_index(pair_i, key_chars.iloc[first])] = elapsed_time
            
        return elapsed_times
        
        
    def _get_between_keypresses(self):
        """
        Calculate times between key presses

        Only "KeyPress" events are used, ordered by time.
        """
        presses = self.check_entry[self.check_entry["event"] == "KeyPress"]
        presses = presses.sort_values(by = ["time"]).reset_index(drop = True)
        
        # Placeholders for every transition the password contains.
        between_times = {}
        for i, (char1, char2) in enumerate(zip(self.password, self.password[1:])):
            between_times[self._key_index(i, char1 + "->" + char2)] = []
            
        times = presses["time"]
        key_chars = presses["key_char"]
        for event_i in range(len(presses) - 1):
            between_time = times.iloc[event_i + 1] - times.iloc[event_i]
            transition = key_chars.iloc[event_i] + "->" + key_chars.iloc[event_i + 1]
            between_times[self._key_index(event_i, transition)] = between_time
            
        return between_times
        
        
    def _key_index(self, index, char):
        """
        Formats a feature name as "<index>: <char>"
        """
        return str(index) + ": " + char

In [6]:
class OwnerData:
    """
    Extracts the owner's statistics from the distributions table

    The press-time statistics are read from the column named after the owner
    and the between-keypress statistics from the column "<owner>.1". Every
    cell is expected to hold a bracketed, comma-separated list of numbers
    stored as text, e.g. "[0.1, 0.2, 0.3]".
    """
    
    def __init__(self, distributions, owner_name = None):
        """
        Parameters
        ----------
        distributions : DataFrame
            Table whose index holds the feature names.
        owner_name : str, optional
            Column name of the owner. When omitted the notebook-wide
            OWNER_NAME constant is used.
        """
        self.distributions = distributions
        # Only fall back to the global when no explicit name was given.
        self.owner_name = OWNER_NAME if owner_name is None else owner_name
        self.stats_presstimes = self._get_presstimes()
        self.stats_between_keypresses = self._get_between_keypresses()
        
    def _get_presstimes(self):
        """
        Statistics of the owner's key press durations, keyed by feature name
        """
        return self._column_stats(self.owner_name)
    
    def _get_between_keypresses(self):
        """
        Statistics of the owner's times between key presses, keyed by feature name
        """
        return self._column_stats(self.owner_name + ".1")
    
    def _column_stats(self, column):
        """
        Reads one column, skipping its first row and all missing cells
        """
        data = self.distributions[column]
        # NOTE(review): the first row is dropped unconditionally - presumably a
        # sub-header row of the csv; confirm against distributions.csv.
        data = data.drop(data.index[0]).dropna().to_dict()
        return self._float_convert(data)
    
    def _float_convert(self, d):
        """
        Turns every "[a, b, c]" text value of d into a list of floats

        Returns a new dictionary; d itself is not modified. Whitespace around
        the brackets and the numbers is ignored.
        """
        converted = {}
        for key, arr in d.items():
            inner = arr.strip()[1:-1]
            if not inner.strip():
                # "[]" is an empty list, not a list holding an empty number.
                converted[key] = []
                continue
            # float() ignores surrounding whitespace, so splitting on the bare
            # comma accepts both "1,2" and "1, 2".
            converted[key] = [float(nr) for nr in inner.split(",")]
        return converted

In [7]:
class ScoreCalculator:
    """
    Calculates the score!!!

    Every checked feature contributes weight / (number of standard
    deviations between the checked value and the owner's mean), so values
    close to the owner's mean give a high score. The statistics of a feature
    are read as [weight, mean, std].
    """
    
    def __init__(self, 
                 stats_presstimes, stats_between_keypresses, 
                 check_presstimes, check_between_keypresses):
        """
        Parameters
        ----------
        stats_presstimes, stats_between_keypresses : dict
            Feature name -> [weight, mean, std] of the owner.
        check_presstimes, check_between_keypresses : dict
            Feature name -> measured value of the entry being checked.
        """
        
        self.stats_presstimes = stats_presstimes
        self.stats_between_keypresses = stats_between_keypresses
        self.check_presstimes = check_presstimes
        self.check_between_keypresses = check_between_keypresses
        
        self.score = self.calculate_score()
        
    def calculate_score(self):
        """
        Sum of the press-time score and the between-keypress score
        """
        return self._get_presstimes_score() + self._get_between_keypresses_score()
        
    def _get_presstimes_score(self):
        return self._weighted_score(self.stats_presstimes, self.check_presstimes)
    
    def _get_between_keypresses_score(self):
        return self._weighted_score(self.stats_between_keypresses, self.check_between_keypresses)
    
    def _weighted_score(self, stats, checks):
        """
        Adds up the contributions of all checked features of one feature group
        """
        score = 0
        
        for key, point in checks.items():
            weight = stats[key][0]
            mean = stats[key][1]
            std = stats[key][2]
            score += self._contribution(weight, self._get_nr_stdevs_away(mean, std, point))
            
        return score
    
    def _contribution(self, weight, nr_stdevs):
        """
        weight / nr_stdevs, with an exact match treated as the limit of an
        ever closer point instead of a division by zero
        """
        if nr_stdevs == 0:
            # A feature without weight contributes nothing, however close it is.
            return math.copysign(math.inf, weight) if weight else 0.0
        return weight / nr_stdevs
        
    def _get_nr_stdevs_away(self, mean, std, point):
        """
        Distance between point and mean, measured in standard deviations
        """
        deviation = abs(point - mean)
        if std == 0:
            # Without any spread only an exact match is "close"; everything
            # else is infinitely far away.
            return 0.0 if deviation == 0 else math.inf
        return deviation / std
        
        

In [8]:
# People whose recorded datasets can be drawn from.
names = [
    "Mariia Turchina",
    "Sander Steeghs",
    "Finn Devlin",
    "Christian van den Berg",
    "Sanderijn van Loosdrecht",
]

def generate_entries_of(name):
    """
    Loads, cleans and transforms the recorded entries of one person

    Reads the workbook "<name>.xlsx", splits it into clean entries with
    EntryDivider and returns those entries after EventTransformer has
    transformed their times.
    """
    # read_excel has no text-encoding option; passing one raises a TypeError
    # on current pandas, so the workbook is read without it.
    raw_events = pd.read_excel(name + ".xlsx")
    
    name_entries = EntryDivider(raw_events).entries
    name_entries = EventTransformer(name_entries).transformed_entries
            
    return name_entries

In [9]:
def get_random_entry():
    """
    Draws a random person, loads that person's entries and returns one of
    them at random, printing which entry was picked
    """
    chosen_name = random.choice(names)
    candidates = generate_entries_of(chosen_name)
    picked_i = random.randrange(0, len(candidates))
    picked_entry = candidates[picked_i]
    
    print("checking entry {} of {}".format(picked_i, chosen_name))
    
    return picked_entry

In [10]:
# The owner's statistics do not depend on the entry being checked, so the
# distributions file is read and parsed once instead of on every iteration.
distributions_df = pd.read_csv("distributions.csv", encoding = "utf8", index_col = [0])
owner_data = OwnerData(distributions_df)

# Score 100 randomly drawn entries against the owner's statistics.
for _ in range(100):

    check_entry = get_random_entry()

    check_features = FeatureConstructor(check_entry)

    score = ScoreCalculator(owner_data.stats_presstimes, owner_data.stats_between_keypresses,
                            check_features.presstimes, check_features.between_keypresses).score
    print("score =", score, "\n")

checking entry 153 of Finn Devlin
score = 29.90671166008096 

checking entry 59 of Sanderijn van Loosdrecht
score = 18.135591418733952 

checking entry 165 of Mariia Turchina
score = 37.379573729570794 

checking entry 44 of Sander Steeghs
score = 9.558225770116426 

checking entry 28 of Sanderijn van Loosdrecht
score = 25.71975610725762 

checking entry 18 of Sander Steeghs
score = 21.603997578912917 

checking entry 38 of Sander Steeghs
score = 1485.7319307887635 

checking entry 68 of Christian van den Berg
score = 104.07887741234907 

checking entry 138 of Sanderijn van Loosdrecht
score = 16.13953973829474 

checking entry 51 of Mariia Turchina
score = 132.2647098747205 

checking entry 10 of Sander Steeghs
score = 72.99593075186735 

checking entry 36 of Sander Steeghs
score = 25.53771664824824 

checking entry 171 of Mariia Turchina
score = 48.90011638552663 

checking entry 34 of Sander Steeghs
score = 40.72662646648437 

checking entry 185 of Finn Devlin
score = 14.588470037156