In [1]:
import pandas as pd
import copy
import math
import random
from collections import defaultdict
from scipy.stats import norm

In [2]:
# Text every recorded entry is validated against: EntryDivider rejects entries
# containing other key characters or whose key presses do not spell it, and
# FeatureConstructor derives its feature keys from its characters.
PASSWORD = "bridport20"
# Column name looked up in the distributions table by OwnerData
# (OWNER_NAME for press times, OWNER_NAME + ".1" for between-press times).
OWNER_NAME = "Finn Devlin"

In [None]:
# Probability level whose standard-normal quantile is used as Z_value.
percentile = 0.8
# norm.ppf is the inverse of the normal CDF; Z_value is read by
# ScoreCalculator._get_nr_stdevs_away_threshold, so this cell has to run
# before that method is called.
Z_value = norm.ppf(percentile) #inverse normal

In [3]:
class EntryDivider:
    """
    Separates and cleans each entry from a person's dataset.

    The frame passed in must provide the columns "entry", "time", "event" and
    "key_char" (the only ones read here). Entry ids are scanned from 0 up to
    the highest value found in "entry".

    Attributes:
        df: the DataFrame given to the constructor.
        nr_entries: number of entry ids that are scanned (0 for an empty frame).
        entries: accepted entries, each a time-sorted DataFrame whose "time"
            column is relative to the first event of that entry.
    """

    def __init__(self, df):
        self.df = df
        self.nr_entries = self._count_entries()
        self.entries = self.divide_entries()

    def _count_entries(self):
        """
        Returns how many entry ids have to be scanned (highest id + 1)
        """
        highest = self.df["entry"].max()
        # max() of an empty column is NaN, and range() needs a plain int
        if pd.isna(highest):
            return 0
        return int(highest) + 1

    def divide_entries(self):
        """
        Separates each entry from person's dataset.

        Returns the list of entries that passed both checks; for every other
        entry id a "not clean data" line is printed.
        """
        entries = []
        for entry_i in range(self.nr_entries):
            curr_entry = self.df[self.df["entry"] == entry_i]
            curr_entry = self._reset_time(curr_entry.sort_values(by = ["time"]).reset_index(drop = True))
            curr_entry, is_still_clean = self._remove_faulty(curr_entry)
            if is_still_clean and self._is_correct(curr_entry):
                entries.append(curr_entry)
            else:
                print("not clean data, entry = ", entry_i)

        return entries

    def _reset_time(self, entry):
        """
        Resets times of the entry to their relative times.

        The "time" column of the given frame is overwritten and the same
        frame is returned.
        """
        start_time = entry["time"].min()
        entry["time"] = entry["time"] - start_time

        return entry

    def _remove_faulty(self, entry):
        """
        Checks that the entry has no pressed keys other than password characters.

        Nothing is removed: the result is (copy of entry, is_clean). A key
        counts as valid only when it is a string equal to one single character
        of PASSWORD, so empty strings, longer strings and missing values (NaN)
        mark the entry as not clean.
        """
        clean_entry = entry.copy()

        # set membership instead of `in PASSWORD`: the latter is a substring
        # test ("" always matches) and raises TypeError for non-string cells
        allowed_chars = set(PASSWORD)
        is_clean = all(
            isinstance(key_char, str) and key_char in allowed_chars
            for key_char in clean_entry["key_char"]
        )
        return clean_entry, is_clean

    def _is_correct(self, entry):
        """
        Checks if the KeyPress events, read in time order, spell PASSWORD
        """
        presses = entry[entry["event"] == "KeyPress"]
        presses = presses.sort_values(by = ["time"]).reset_index(drop = True)
        entered_text = "".join(presses["key_char"])
        return entered_text == PASSWORD
    

In [4]:
class EventTransformer:
    """
    Log-transforms the event times of every entry.

    No outliers are detected or removed here; each entry is only re-sorted by
    ("key_char", "time") and its "time" column replaced by log(time + 0.01).

    Attributes:
        original_entries: the list of DataFrames passed in (left untouched).
        transformed_entries: list of new DataFrames, one per original entry.
    """

    def __init__(self, entries): #[DataFrame]
        self.original_entries = entries
        self.transformed_entries = self.transform_data()


    def transform_data(self):
        """
        Transforms entries.

        Returns a new list; the frames in original_entries are not modified
        because sort_values/reset_index produce new frames.
        """
        transformed = []

        for entry in self.original_entries:
            ordered = entry.sort_values(by = ["key_char", "time"]).reset_index(drop = True)
            # One column-wide replacement instead of a per-cell .loc write:
            # much faster, and it does not depend on pandas silently upcasting
            # an integer "time" column when a float is written into it.
            ordered["time"] = ordered["time"].map(self._transform).astype(float)
            transformed.append(ordered)

        return transformed


    def _transform(self, time):
        """
        Natural log of the time; the 0.01 offset keeps a time of exactly 0
        inside log's domain.
        """
        return math.log(time + 0.01)

In [5]:
class FeatureConstructor:
    """
    Derives the timing features of one entry.

    presstimes: how long each key was held, keyed "<pair number>: <char>"
        (None when the entry has an odd number of events).
    between_keypresses: time between consecutive key presses, keyed
        "<press number>: <char>-><next char>".
    """

    def __init__(self, check_entry):
        self.check_entry = check_entry
        self.presstimes = self._get_presstimes()
        self.between_keypresses = self._get_between_keypresses()


    def _get_presstimes(self):
        """
        Works out how long every key was held down
        """
        events = copy.deepcopy(self.check_entry)
        events = events.sort_values(by = ["key_char", "time"]).reset_index(drop = True)

        nr_events = len(events)
        if nr_events % 2:
            # press/release rows cannot be paired up
            print("we've got uneven pressing/releasing for some reason \n")
            return None

        # one placeholder per password character, in sorted-character order
        hold_times = {}
        for position, char in enumerate(sorted(PASSWORD)):
            hold_times[self._key_index(position, char)] = []

        # after the sort, rows 2k and 2k+1 are treated as one press/release pair
        for pair_nr, first_row in enumerate(range(0, nr_events - 1, 2)):
            pressed = events.iloc[first_row]
            released = events.iloc[first_row + 1]
            hold_times[self._key_index(pair_nr, pressed["key_char"])] = released["time"] - pressed["time"]

        return hold_times


    def _get_between_keypresses(self):
        """
        Works out the time that passed between consecutive key presses
        """
        presses = copy.deepcopy(self.check_entry)
        only_presses = presses["event"] == "KeyPress"
        presses = presses[only_presses].sort_values(by = ["time"]).reset_index(drop = True)

        # one placeholder per pair of neighbouring password characters
        gaps = {}
        for position, (left, right) in enumerate(zip(PASSWORD, PASSWORD[1:])):
            gaps[self._key_index(position, left + "->" + right)] = []

        for row in range(len(presses) - 1):
            gap = presses.loc[row + 1, "time"] - presses.loc[row, "time"]
            from_char = presses.iloc[row]["key_char"]
            to_char = presses.iloc[row + 1]["key_char"]
            gaps[self._key_index(row, from_char + "->" + to_char)] = gap

        return gaps


    def _key_index(self, index, char):
        """
        Builds the dictionary key "<index>: <char>"
        """
        return ": ".join((str(index), char))

In [6]:
class OwnerData:
    """
    Extracts the owner's reference statistics from the distributions table.

    Two columns of the table are read: OWNER_NAME (press times) and
    OWNER_NAME + ".1" (times between key presses). Every remaining cell is a
    text of the form "[a, b, c]" that is parsed into a list of floats.

    Attributes:
        distributions: the DataFrame passed in.
        stats_presstimes: dict mapping row label -> list of floats.
        stats_between_keypresses: dict mapping row label -> list of floats.
    """

    def __init__(self, distributions):
        self.distributions = distributions
        self.stats_presstimes = self._get_presstimes()
        self.stats_between_keypresses = self._get_between_keypresses()

    def _get_presstimes(self):
        """Statistics stored in the owner's first column."""
        return self._load_column(OWNER_NAME)

    def _get_between_keypresses(self):
        """Statistics stored in the owner's second column (suffix ".1")."""
        return self._load_column(OWNER_NAME + ".1")

    def _load_column(self, column):
        """
        Reads one column, drops its first row and all missing cells, and
        parses what is left into {row label: [floats]}.
        """
        data = self.distributions[column]
        # NOTE(review): the first row is discarded unconditionally - presumably
        # a sub-header row in the csv; confirm against distributions.csv
        data = data.drop(data.index[0]).dropna().to_dict()
        return self._float_convert(data)

    def _float_convert(self, d):
        """
        Replaces every "[a, b, c]" text value of `d` by a list of floats.

        The dict is updated in place and also returned. Whitespace around the
        brackets and around the commas is ignored and "[]" gives an empty
        list; any other malformed number still raises ValueError.
        """
        for key, arr in d.items():
            inner = arr.strip()[1:-1]
            if inner.strip():
                # float() tolerates surrounding blanks, so splitting on a bare
                # comma accepts "1,2", "1, 2" and "1 , 2" alike
                d[key] = [float(nr) for nr in inner.split(",")]
            else:
                d[key] = []
        return d

In [7]:
class ScoreCalculator:
    """
    Scores a checked entry against the owner's statistics.

    Every stats dict maps a feature key to a sequence whose first three items
    are read as weight, mean and std; every check dict maps the same keys to
    the measured value. The final score is stored in `score`.
    """

    def __init__(self,
                 stats_presstimes, stats_between_keypresses,
                 check_presstimes, check_between_keypresses):

        self.stats_presstimes = stats_presstimes
        self.stats_between_keypresses = stats_between_keypresses
        self.check_presstimes = check_presstimes
        self.check_between_keypresses = check_between_keypresses

        self.score = self.calculate_score()

    def calculate_score(self):
        """Press-time score plus between-key-press score."""
        presstimes_part = self._get_presstimes_score()
        between_part = self._get_between_keypresses_score()
        return presstimes_part + between_part

    def _get_presstimes_score(self):
        """Score contribution of the key hold times."""
        return self._weighted_score(self.stats_presstimes, self.check_presstimes)

    def _get_between_keypresses_score(self):
        """Score contribution of the times between key presses."""
        return self._weighted_score(self.stats_between_keypresses, self.check_between_keypresses)

    def _weighted_score(self, stats, checks):
        """
        Adds up weight / distance over all measured features, in the
        iteration order of `checks`.
        """
        total = 0
        for feature, measured in checks.items():
            reference = stats[feature]
            weight, mean, std = reference[0], reference[1], reference[2]
            total += weight / self._get_nr_stdevs_away(mean, std, measured)
        return total

    def _get_nr_stdevs_away(self, mean, std, point):
        """
        Absolute distance of `point` from `mean`, plus 0.1.
        `std` is accepted but not used (the division by it is switched off).
        """
        distance = abs(point - mean)
        return distance + 0.1 #/ std

    def _get_nr_stdevs_away_threshold(self, mean, std):
        """Z_value times `std`, plus 0.1; `mean` is not used."""
        scaled = Z_value * std
        return scaled + 0.1
        

In [8]:
# People whose recordings can be drawn; generate_entries_of reads
# "<name>.xlsx" for whichever of them is picked.
names = [
    "Mariia Turchina",
    "Sander Steeghs",
    "Finn Devlin",
    "Christian van den Berg",
    "Sanderijn van Loosdrecht",
]

def generate_entries_of(name):
    """
    Loads one person's recordings and returns their usable entries.

    Reads "<name>.xlsx" from the working directory, keeps the entries that
    EntryDivider accepts and returns them after EventTransformer has been
    applied.

    Args:
        name: file name without the ".xlsx" extension.

    Returns:
        list of DataFrames (EventTransformer.transformed_entries).
    """
    # read_excel has no `encoding` parameter (a workbook carries its own
    # encoding); passing one raises TypeError on current pandas releases.
    raw_df = pd.read_excel(name + ".xlsx")

    name_entries = EntryDivider(raw_df).entries
    name_entries = EventTransformer(name_entries).transformed_entries

    return name_entries

In [9]:
def get_random_entry():
    """
    Draws a random person from `names`, then a random entry of that person.

    Prints which entry was drawn and returns [entry, person name].
    """
    chosen_name = random.choice(names)
    candidates = generate_entries_of(chosen_name)
    position = random.randrange(0, len(candidates))

    print("checking entry {} of {}".format(position, chosen_name))

    return [candidates[position], chosen_name]

In [12]:
# An entry passes when its score is above this value
THRESHOLD = 180

# Per-person accumulators, keyed by name
scores, mean_scores, pass_rate = (defaultdict(float) for _ in range(3))
nr_attempts, nr_passes = (defaultdict(int) for _ in range(2))

In [13]:
# The owner's reference statistics are the same for every draw, so the
# distributions file is read and parsed once instead of on each iteration.
distributions_df = pd.read_csv("distributions.csv", encoding = "utf8", index_col = [0])
owner_data = OwnerData(distributions_df)

for _ in range(100):

    check_entry = get_random_entry()  # [entry DataFrame, person name]
    entry_df, person = check_entry

    check_features = FeatureConstructor(entry_df)

    # FeatureConstructor leaves presstimes as None (and prints why) when the
    # entry has an odd number of events; such an entry cannot be scored.
    if check_features.presstimes is None:
        continue

    score = ScoreCalculator(owner_data.stats_presstimes, owner_data.stats_between_keypresses,
                            check_features.presstimes, check_features.between_keypresses).score
    print("score =", score, "\n")

    if score > THRESHOLD:
        nr_passes[person] += 1

    scores[person] += score
    nr_attempts[person] += 1



checking entry 37 of Christian van den Berg
score = 140.7278379786615 

checking entry 240 of Finn Devlin
score = 196.63903263377676 

checking entry 74 of Christian van den Berg
score = 129.81573430580545 

checking entry 174 of Sanderijn van Loosdrecht
score = 167.7557284298264 

checking entry 11 of Sanderijn van Loosdrecht
score = 149.5431305392038 

checking entry 29 of Sander Steeghs
score = 147.45054811312832 

checking entry 144 of Mariia Turchina
score = 133.0437931295606 

checking entry 87 of Sanderijn van Loosdrecht
score = 165.0401834466364 

checking entry 337 of Finn Devlin
score = 171.0114753594886 

checking entry 208 of Finn Devlin
score = 199.08396304488292 

checking entry 23 of Sanderijn van Loosdrecht
score = 163.49393441707056 

checking entry 39 of Sander Steeghs
score = 172.35129588157304 

checking entry 64 of Sander Steeghs
score = 168.55861184265444 

checking entry 31 of Sander Steeghs
score = 159.30147641729872 

checking entry 59 of Mariia Turchina
score 

checking entry 11 of Sander Steeghs
score = 157.57989579842354 

checking entry 11 of Sanderijn van Loosdrecht
score = 149.5431305392038 

checking entry 57 of Sander Steeghs
score = 156.40197135206353 

checking entry 5 of Sander Steeghs
score = 158.64235545177135 

checking entry 171 of Mariia Turchina
score = 136.6287752083237 

checking entry 20 of Sanderijn van Loosdrecht
score = 155.12834127887888 

checking entry 43 of Sander Steeghs
score = 164.50545481726857 

checking entry 288 of Finn Devlin
score = 200.612661083908 

checking entry 14 of Sanderijn van Loosdrecht
score = 159.85863948513224 

checking entry 53 of Sander Steeghs
score = 152.7185756394741 

checking entry 304 of Finn Devlin
score = 182.61926926614177 

checking entry 97 of Christian van den Berg
score = 134.28194850326514 

checking entry 56 of Mariia Turchina
score = 134.56004690210523 

checking entry 47 of Sander Steeghs
score = 163.9488538316611 

checking entry 63 of Sanderijn van Loosdrecht
score = 172.56

checking entry 380 of Finn Devlin
score = 183.4741741365015 

checking entry 170 of Mariia Turchina
score = 133.85283765005607 

checking entry 54 of Sanderijn van Loosdrecht
score = 166.43540092128163 

checking entry 38 of Christian van den Berg
score = 130.9375002982168 

checking entry 76 of Sanderijn van Loosdrecht
score = 175.1003596555034 

checking entry 133 of Mariia Turchina
score = 147.2125871242862 

checking entry 27 of Sanderijn van Loosdrecht
score = 152.69951703954132 

checking entry 124 of Sanderijn van Loosdrecht
score = 163.10812967982568 

checking entry 341 of Finn Devlin
score = 194.25687254684175 

checking entry 25 of Christian van den Berg
score = 118.30145999528939 

checking entry 77 of Christian van den Berg
score = 121.4024674736698 

checking entry 147 of Christian van den Berg
score = 133.14978629622124 

checking entry 150 of Christian van den Berg
score = 138.29607492142975 

checking entry 23 of Mariia Turchina
score = 135.97639200805622 

checking en

checking entry 21 of Sander Steeghs
score = 169.74280992951435 

checking entry 25 of Sander Steeghs
score = 155.63707290536934 

checking entry 64 of Mariia Turchina
score = 123.75249723630893 

checking entry 51 of Mariia Turchina
score = 129.05308936598212 

checking entry 67 of Mariia Turchina
score = 143.44090016148863 

checking entry 155 of Sanderijn van Loosdrecht
score = 155.17478934358718 

checking entry 146 of Christian van den Berg
score = 138.1308201428029 

checking entry 88 of Finn Devlin
score = 196.67147747389288 

checking entry 6 of Sander Steeghs
score = 169.4437358490705 

checking entry 77 of Christian van den Berg
score = 121.4024674736698 

checking entry 82 of Sanderijn van Loosdrecht
score = 175.74895992424752 

checking entry 378 of Finn Devlin
score = 169.38999437772975 

checking entry 250 of Finn Devlin
score = 197.58070727050398 

checking entry 16 of Finn Devlin
score = 150.3318206666987 

checking entry 37 of Sander Steeghs
score = 167.16586764879509 


checking entry 11 of Sander Steeghs
score = 157.57989579842354 

checking entry 211 of Finn Devlin
score = 185.0164704560795 

checking entry 62 of Sander Steeghs
score = 160.47597898871615 

checking entry 45 of Mariia Turchina
score = 128.70238711763113 



KeyboardInterrupt: 

In [14]:

# Mean score and pass rate per person, from the totals gathered by the
# evaluation loop.
for name in names:
    attempts = nr_attempts.get(name, 0)
    if attempts == 0:
        # No entry of this person was drawn (e.g. the loop was interrupted
        # early); dividing would raise ZeroDivisionError, so leave them out.
        continue
    mean_scores[name] = scores[name] / attempts
    pass_rate[name] = nr_passes[name] / attempts


print(nr_attempts)
print(nr_passes)

print(mean_scores)
print(pass_rate)

defaultdict(<class 'int'>, {'Christian van den Berg': 88, 'Finn Devlin': 103, 'Sanderijn van Loosdrecht': 101, 'Sander Steeghs': 98, 'Mariia Turchina': 96})
defaultdict(<class 'int'>, {'Finn Devlin': 90, 'Sanderijn van Loosdrecht': 6, 'Mariia Turchina': 0, 'Sander Steeghs': 0, 'Christian van den Berg': 0})
defaultdict(<class 'float'>, {'Mariia Turchina': 134.40006932419806, 'Sander Steeghs': 159.2318875307986, 'Finn Devlin': 193.31704290182878, 'Christian van den Berg': 129.73140762416097, 'Sanderijn van Loosdrecht': 160.1379268563778})
defaultdict(<class 'float'>, {'Mariia Turchina': 0.0, 'Sander Steeghs': 0.0, 'Finn Devlin': 0.8737864077669902, 'Christian van den Berg': 0.0, 'Sanderijn van Loosdrecht': 0.0594059405940594})


In [None]:
#1000 tests for Finn
# threshold = 180, TESTS_WEIGHT = 0.8, STD_WEIGHT = 1, distance_weight = 0.1, distances divided by std
'''average score after 1000:
Mariia = 185
Finn = 130
Sander = 123
Sanderijn = 130
Christian = 150

Pass rate:
Mariia = 0.87
Finn = 0
Sander = 0 
Sanderijn = 0
Christian = 0.03
'''

In [None]:
#1000 tests
# threshold = 170, TESTS_WEIGHT = 0.8, STD_WEIGHT = 1, distance_weight = 0.1, distances divided by std
'''average score after 1000:
Mariia = 185
Finn = 130
Sander = 123
Sanderijn = 130
Christian = 150

Pass rate:
Mariia = 0.87
Finn = 0
Sander = 0 
Sanderijn = 0
Christian = 0.03
'''

In [None]:
#Threshold = 100, TESTS_WEIGHT = 0.8, STD_WEIGHT = 1
'''average score:
Mariia = 135, 144
Finn = 57, 68
Sander = 105, 49, 
Sanderijn = 68, 34
Christian = 61, 52

Pass rate:
Mariia = 0.37
Finn = 0.1
Sander = 0.13 
Sanderijn = 0.07
Christian = 0.1
'''

In [None]:
#threshold = 6,  TESTS_WEIGHT = 0.8, STD_WEIGHT = 1, distance_weight = 0.1
'''average score:
Mariia = 6.4
Finn = 4.3
Sander = 5.1 
Sanderijn = 4.6
Christian = 4.6

Pass rate:
Mariia = 0.70
Finn = 0.24
Sander = 0 
Sanderijn = 0.06
Christian = 0
'''