In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import copy
from scipy import stats
import statistics
import math

In [2]:
# Notebook-wide configuration.
# OWNER_NAME is intentionally left disabled here: the plotting cell at the
# bottom of the notebook assigns it itself, once per person, before building
# the entries (Entry.__init__ reads it at construction time).
#OWNER_NAME = "Mariia Turchina"
# Characters an entry may contain; EntryDivider rejects an entry holding any
# other key_char, and the per-key dictionaries are keyed by sorted(PASSWORD).
PASSWORD = "bridport20"
# A time whose absolute z-score exceeds this value is treated as an outlier.
Z_THRESHOLD = 2

In [3]:
class EntryDivider:
    """
    Separates and cleans each entry from person's dataset

    Reads the columns "entry", "time" and "key_char" of the given frame.

    Attributes:
        df          -- the frame handed to the constructor
        nr_entries  -- highest value of the "entry" column plus one
                       (0 when the frame is empty)
        entries     -- list of DataFrames, one per entry that passed the
                       cleanliness check
    """

    def __init__(self, df):
        self.df = df
        # max() of an empty column is NaN, which range() cannot consume, so an
        # empty frame is treated as "no entries" instead of crashing
        self.nr_entries = 0 if self.df.empty else int(self.df["entry"].max()) + 1
        self.entries = self.divide_entries()

    def divide_entries(self):
        """
        Separates each entry from person's dataset

        Every entry id in range(nr_entries) is cut out of the frame, ordered by
        time, shifted so that it starts at time 0 and kept only when
        _remove_faulty reports it as clean. Rejected entries are reported on
        stdout.

        Returns:
            list of DataFrames, each with a fresh 0..n-1 index
        """
        entries = []
        for entry_i in range(self.nr_entries):
            curr_entry = self.df[self.df["entry"] == entry_i]
            curr_entry = curr_entry.sort_values(by = ["time"]).reset_index(drop = True)
            curr_entry, is_still_clean = self._remove_faulty(self._reset_time(curr_entry))
            if is_still_clean:
                entries.append(curr_entry)
            else:
                print("not clean data, entry = ", entry_i)

        return entries

    def _reset_time(self, entry):
        """
        Resets times of the entry to their relative times

        The smallest value of the "time" column is subtracted from every row.
        The frame is modified in place and also returned.
        """
        start_time = entry["time"].min()
        entry["time"] -= start_time

        return entry

    def _remove_faulty(self, entry):
        """
        Checks if entry is clean

        An entry is clean when every value of its "key_char" column is a string
        equal to one single character of PASSWORD. Membership is tested against
        the set of characters (not with a substring test), so "" or a
        multi-character fragment such as "br" is rejected, and non-string
        values (e.g. NaN from a blank cell) mark the entry as faulty instead of
        raising TypeError.

        Returns:
            (deep copy of entry, True if the entry is clean else False)
        """
        clean_entry = copy.deepcopy(entry)
        allowed_chars = set(PASSWORD)

        is_clean = all(
            isinstance(key_char, str) and key_char in allowed_chars
            for key_char in clean_entry["key_char"]
        )
        return clean_entry, is_clean
    
    

In [4]:
class Event_OutlierHandler_Transformer:
    """
    Detects and removes outliers from a specific person

    Each entry is sorted by ("key_char", "time"); rows 2k and 2k+1 of the
    sorted entry are then treated as the press and release of key slot k.
    Slots are named "<k>: <char>" (see _key_index), with the initial set of
    slots taken from sorted(PASSWORD).

    Attributes:
        original_entries -- list of DataFrames as passed in (not modified)
        clean_entries    -- list of processed DataFrames, same order
    """

    def __init__(self, entries): #[DataFrame]
        self.original_entries = entries
        self.clean_entries = self.handle_event_outliers_and_transform()


    def handle_event_outliers_and_transform(self):
        """
        Removes outliers from entries

        A press/release time whose absolute z-score (taken over all entries for
        the same key slot) exceeds Z_THRESHOLD is replaced by the mean of that
        slot's times. Afterwards every processed time is passed through
        _transform.

        Returns:
            list of new DataFrames sorted by ("key_char", "time")
        Raises:
            ValueError: if an entry has an odd number of rows
        """
        entries = copy.deepcopy(self.original_entries)
        keypress_times, keyrelease_times = self._get_events_dicts()

        z_keypress_times = self._get_z_scores(keypress_times)
        z_keyrelease_times = self._get_z_scores(keyrelease_times)

        # The replacement value depends only on the slot, so compute it once
        # per slot instead of once per detected outlier
        mean_keypress = {key: statistics.mean(times) for key, times in keypress_times.items() if times}
        mean_keyrelease = {key: statistics.mean(times) for key, times in keyrelease_times.items() if times}

        for entry_i in range(len(entries)):
            entry = entries[entry_i].sort_values(by = ["key_char", "time"]).reset_index(drop = True)
            # means and logarithms are floats; make the column able to hold them
            entry["time"] = entry["time"].astype(float)

            for event_i in range(0, len(entry)-1, 2):
                curr_key_char = entry.iloc[event_i]["key_char"]
                key = self._key_index(event_i // 2, curr_key_char)

                if (abs(z_keypress_times[key][entry_i]) > Z_THRESHOLD):
                    entry.loc[event_i, "time"] = mean_keypress[key]
                if (abs(z_keyrelease_times[key][entry_i]) > Z_THRESHOLD):
                    entry.loc[event_i+1, "time"] = mean_keyrelease[key]

                # comment if you dont want to transform
                entry.loc[event_i, "time"] = self._transform(entry.loc[event_i, "time"])
                entry.loc[event_i+1, "time"] = self._transform(entry.loc[event_i+1, "time"])

            entries[entry_i] = entry

        return entries

    def _transform(self, time):
        """
        Log-transforms a time; 0.01 is added first so that a time of 0 is valid
        """
        return math.log(time + 0.01)


    def _get_events_dicts(self):
        """
        Constructs dicts of press and release times for events

        Returns:
            (keypress_times, keyrelease_times): two dicts mapping a slot name
            to the list of times collected for it, one value per entry in
            entry order
        Raises:
            ValueError: if an entry has an odd number of rows, i.e. presses
                and releases cannot be paired. (Previously a message was
                printed and None returned, which made the caller fail with an
                unrelated unpacking error.)
        """
        keypress_times = {}
        keyrelease_times = {}

        for i, char in enumerate(sorted(PASSWORD)):
            key = self._key_index(i, char)
            keypress_times[key] = []
            keyrelease_times[key] = []

        for entry_i, entry in enumerate(self.original_entries):
            if (len(entry) % 2 != 0):
                raise ValueError(
                    "we've got uneven pressing/releasing for some reason "
                    "(entry at position {} has {} rows)".format(entry_i, len(entry))
                )

            # sort_values returns a new frame, so the original is left untouched
            curr_entry = entry.sort_values(by = ["key_char", "time"]).reset_index(drop = True)

            for event_i in range(0, len(curr_entry)-1, 2):
                curr_key_char = curr_entry.iloc[event_i]["key_char"]
                key = self._key_index(event_i // 2, curr_key_char)

                keypress_times[key].append(curr_entry.iloc[event_i]["time"])
                keyrelease_times[key].append(curr_entry.iloc[event_i+1]["time"])

        return keypress_times, keyrelease_times


    def _key_index(self, index, char):
        """
        Builds the slot name "<index>: <char>" used as dictionary key
        """
        return str(index) + ": " + char


    def _get_z_scores(self, arr_dict):
        """
        Calculates z-scores for arrays in dict with array keys

        Returns a dict with the same keys whose values are the result of
        scipy.stats.zscore applied to the corresponding list.
        """
        return {key: stats.zscore(arr) for key, arr in arr_dict.items()}

In [5]:
class Feature_OutlierHandler_Transformer:
    """
    Replaces outliers of a single feature array by the array's mean

    Attributes:
        original_arr -- the sequence passed in (not modified)
        clean_arr    -- deep copy of original_arr with outliers replaced
    """

    def __init__(self, feature_arr):
        self.original_arr = feature_arr
        self.clean_arr = self.handle_ouliers()

    def handle_ouliers(self):
        """
        Returns a copy of original_arr in which every outlier is replaced

        The replacement is the mean of the values as they were before any
        replacement, computed once. Recomputing it after each substitution
        (as was done before) made the value written for later outliers depend
        on the earlier ones.
        """
        arr = copy.deepcopy(self.original_arr)
        outlier_indices = self._get_outlier_indices(arr)

        # only touch statistics.mean when there is something to replace; it
        # raises on an empty sequence
        if outlier_indices:
            replacement = statistics.mean(arr)
            for i in outlier_indices:
                arr[i] = replacement

        return arr

    def _get_outlier_indices(self, arr):
        """
        Returns the positions whose absolute z-score exceeds Z_THRESHOLD
        """
        z_scores_arr = stats.zscore(arr)
        return [i for i in range(len(arr)) if (abs(z_scores_arr[i]) > Z_THRESHOLD)]
    

In [6]:
class Entry:
    """
    Provides information about each entry (its data and whether it's owner's)

    Attributes:
        is_owner -- True when `name` equals the owner's name
        data     -- the entry object passed in, stored as given
    """

    def __init__(self, name, entry, owner_name = None):
        """
        Args:
            name: name of the person the entry belongs to
            entry: the entry's data
            owner_name: name to compare against; when omitted, the notebook
                global OWNER_NAME is used (it must then already be defined
                at construction time)
        """
        if owner_name is None:
            owner_name = OWNER_NAME
        self.is_owner = name == owner_name
        self.data = entry

In [7]:
class EntriesData:
    """
    Holds all the entry datas together

    Each element of `entries` is expected to expose `.data` (a DataFrame with
    the columns "key_char", "time" and, for the event plots, "event") and
    `.is_owner`.
    """

    # string annotation: Entry does not have to be defined before this class
    def __init__(self, entries: "list[Entry]"):
        self.entries = entries


    @staticmethod
    def _collect_event_times(entries, key_chars, event):
        """
        Gathers the time of `event` for every character of `key_chars`

        For the k-th occurrence of a character in `key_chars`, the k-th row (in
        frame order) matching that character and event is used, so repeated
        characters are served by successive rows. Nothing is modified.

        Returns:
            list with one inner list per character of `key_chars`; each inner
            list holds one time per element of `entries`
        Raises:
            IndexError: if an entry has no matching row for an occurrence
        """
        seen = {}
        times_per_key = []
        for key_char in key_chars:
            occurrence = seen.get(key_char, 0)
            seen[key_char] = occurrence + 1

            times = []
            for entry in entries:
                data = entry.data
                matches = data[(data["key_char"] == key_char) & (data["event"] == event)]["time"]
                times.append(matches.iloc[occurrence])
            times_per_key.append(times)

        return times_per_key


    def overview_plot_event_times(self, event):
        """
        Plots an event of each key of entry of each individual

        One scatter column per PASSWORD character. The stored entries are only
        read; earlier versions overwrote each entry's data while plotting,
        which emptied self.entries after the first call.
        """
        fig, ax = plt.subplots(figsize = (15, 15))

        times_per_key = self._collect_event_times(self.entries, PASSWORD, event)
        for xval, times in enumerate(times_per_key):
            ax.scatter(np.linspace(xval, xval, len(times)), times)

        ax.set_title("{}'s of each key of entry of each individual".format(event))
        plt.xticks(np.arange(len(PASSWORD)), [c for c in PASSWORD])


    def owner_event_distribution(self, event):
        """
        Plots density plot for the times of an event of each key

        Only entries flagged as owner's are used; one subplot row per PASSWORD
        character.
        """
        owner_entries = self._get_owner()

        fig, ax = plt.subplots(nrows = len(PASSWORD), figsize = (15, 6 * len(PASSWORD)))
        plt.subplots_adjust(hspace = 0.2)
        plt.suptitle("Distribution on owner's {}'s'".format(event))

        times_per_key = self._collect_event_times(owner_entries, PASSWORD, event)
        for row, (key_char, times) in enumerate(zip(PASSWORD, times_per_key)):
            sns.distplot(times, ax = ax[row])
            ax[row].set_title(key_char, color = "red")


    def owner_presstime_distribution(self):
        """
        Plots density plot for the pressing times of each key

        For every owner entry the rows are sorted by ("key_char", "time") and
        the difference between rows 2k+1 and 2k is recorded for key slot k.
        Each slot's values are cleaned with Feature_OutlierHandler_Transformer
        and drawn on its own subplot row.

        Relies on notebook globals: `fig` and `ax` (the figure drawn on),
        OWNER_NAME (legend label) and, optionally, COLOR (falls back to "blue"
        when undefined). Prints a message and returns without plotting when an
        owner entry has an odd number of rows.
        """
        owner_entries = [owner_entry.data for owner_entry in self._get_owner()]

        # uncomment when plotting for one owner only
        #fig, ax = plt.subplots(nrows = len(PASSWORD), figsize = (15, 6 * len(PASSWORD)))
        plt.subplots_adjust(hspace = 0.2)
        plt.suptitle("Distribution on owner's time of pressing each key")

        elapsed_times = {}
        for i, char in enumerate(sorted(PASSWORD)):
            elapsed_times[self._key_index(i, char)] = []

        for owner_entry in owner_entries:
            if (len(owner_entry) % 2 != 0):
                print("we've got uneven pressing/releasing for some reason \n")
                return

            curr_entry = owner_entry.sort_values(by = ["key_char", "time"]).reset_index(drop = True)

            for event_i in range(0, len(curr_entry)-1, 2):
                elapsed_time = curr_entry.iloc[event_i+1]["time"] - curr_entry.iloc[event_i]["time"]
                curr_key_char = curr_entry.iloc[event_i]["key_char"]
                elapsed_times[self._key_index(event_i // 2, curr_key_char)].append(elapsed_time)

        # COLOR is an optional global; only its absence should trigger the default
        try:
            c = COLOR
        except NameError:
            c = "blue"

        row = 0
        for key, press_times in elapsed_times.items():

            clean_press_times = Feature_OutlierHandler_Transformer(press_times).clean_arr

            sns.distplot(clean_press_times, ax = ax[row], color = c, label = OWNER_NAME + " -> " + key)
            # NOTE(review): called once per key, so legends accumulate on the
            # figure - confirm whether a single call after the loop was meant
            fig.legend()
            ax[row].set_title(key, color = "red")
            row += 1


    def _get_owner(self):
        """
        Returns deep copies of the entries whose is_owner flag is set
        """
        copy_entries = copy.deepcopy(self.entries)
        return [entry for entry in copy_entries if (entry.is_owner)]


    def _key_index(self, index, char):
        """
        Builds the slot name "<index>: <char>" used as dictionary key
        """
        return str(index) + ": " + char
        
        
            

In [8]:
# People whose recordings are analysed; each name is also the stem of that
# person's spreadsheet file.
names = [
    "Mariia Turchina",
    "Sander Steeghs",
    "Finn Devlin",
    "Christian van den Berg",
    "Sanderijn van Loosdrecht",
]

# Plot colour for each person, matched to `names` by position.
colors = "chocolate blue red green yellow".split()

def generate_entries():
    """
    Generating entries from all datasets

    For every name in the global `names`, the workbook "<name>.xlsx" is read,
    split into entries by EntryDivider, passed through
    Event_OutlierHandler_Transformer and wrapped into Entry objects.

    Returns:
        flat list of Entry objects, grouped by person in the order of `names`
    """
    all_entries = []

    for name in names:
        # no `encoding` argument: it has no meaning for Excel workbooks and
        # current pandas rejects it as an unexpected keyword
        raw_frame = pd.read_excel(name + ".xlsx")

        name_entries = EntryDivider(raw_frame).entries
        name_entries = Event_OutlierHandler_Transformer(name_entries).clean_entries

        all_entries.extend(Entry(name, name_entry) for name_entry in name_entries)

    return all_entries

In [None]:
# Overlay every person's press-duration distributions on one shared figure
# (one subplot row per PASSWORD character, one colour per person).
i = 0
# fig and ax are read as globals by EntriesData.owner_presstime_distribution
fig, ax = plt.subplots(nrows = len(PASSWORD), figsize = (15, 15 * len(PASSWORD)))
for name in names:
    # OWNER_NAME is read by Entry.__init__ and for the legend label;
    # COLOR is read by owner_presstime_distribution
    OWNER_NAME = name
    COLOR = colors[i]
    # generate_entries() re-reads every workbook on each pass, because the
    # owner flag of an Entry is fixed when it is constructed
    entries_data = EntriesData(generate_entries())
    entries_data.owner_presstime_distribution()
    i += 1

  return (a - mns) / sstd


In [None]:
#entries_data.owner_event_distribution("KeyPress")

In [None]:
#entries_data.owner_event_distribution("KeyRelease")