In [1]:
import numpy as np
import pandas as pd
import datetime

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
source_name = './data/full_keyboard_data.csv'
# Load the whole CSV into `df`; later cells select rows from it by `user_id`.
df = pd.read_csv(source_name)

In [4]:
# Rows whose user_id equals 10; used further down as a single-user input for proc_df.
test_user_df = df[df['user_id'] == 10]

In [34]:
# Current local timestamp. generate_parts overwrites the time of day, so only the
# calendar date of this value ends up in the generated boundaries.
now = datetime.datetime.now()

In [36]:
# Slot boundaries for today in 30-minute steps (10:00 up to 18:00).
# NOTE: generate_parts is defined in a later cell of this file, so that cell has to
# be executed before this one on a fresh kernel.
parts = generate_parts(now, 30)

In [37]:
# Print every boundary with a 1-based position so the slot layout can be checked by eye.
for i, part in enumerate(parts):
    print(i+1, ' ', part)

1   2021-05-28 10:00:00
2   2021-05-28 10:30:00
3   2021-05-28 11:00:00
4   2021-05-28 11:30:00
5   2021-05-28 12:00:00
6   2021-05-28 12:30:00
7   2021-05-28 13:00:00
8   2021-05-28 13:30:00
9   2021-05-28 14:00:00
10   2021-05-28 14:30:00
11   2021-05-28 15:00:00
12   2021-05-28 15:30:00
13   2021-05-28 16:00:00
14   2021-05-28 16:30:00
15   2021-05-28 17:00:00
16   2021-05-28 17:30:00
17   2021-05-28 18:00:00


In [29]:
def to_time(date_time, h, m):
    """Return ``date_time`` moved to ``h``:``m`` on the same day.

    Seconds and microseconds are reset to zero. The input object is left
    untouched; ``replace`` produces a new value.

    Args:
        date_time: object offering a ``datetime``-style ``replace`` method.
        h: hour to set.
        m: minute to set.

    Returns:
        The new timestamp at exactly ``h``:``m``:00.
    """
    return date_time.replace(hour=h, minute=m, second=0, microsecond=0)


def generate_parts(date_time, step):
    """Return the slot boundaries for the day of ``date_time``.

    The first boundary is 10:00 on that day. Further boundaries follow every
    ``step`` minutes until one of them is at or past 18:00, so when ``step``
    does not divide the 10:00-18:00 window evenly the last boundary lies
    after 18:00.

    Args:
        date_time: timestamp whose calendar date is used for the boundaries.
        step: distance between two boundaries, in minutes. Must be positive.

    Returns:
        List of timestamps in ascending order, starting at 10:00.

    Raises:
        ValueError: if ``step`` is not greater than zero. Without this check
            the loop below would never reach 18:00 and would run forever.
    """
    if not step > 0:
        raise ValueError(f'step must be a positive number of minutes, got {step!r}')

    day_end = to_time(date_time, 18, 0)
    # The stride never changes, so build it once instead of on every iteration.
    stride = datetime.timedelta(seconds=60 * step)

    boundaries = [to_time(date_time, 10, 0)]
    while boundaries[-1] < day_end:
        boundaries.append(boundaries[-1] + stride)
    return boundaries

def get_part_idx(date_time, time_step):
    """Return the index of the time slot that ``date_time`` falls into.

    Slot boundaries come from ``generate_parts(date_time, time_step)``.
    Slot ``i`` covers ``parts[i] <= date_time < parts[i + 1]``.

    Args:
        date_time: timestamp to classify.
        time_step: slot width in minutes, forwarded to ``generate_parts``.

    Returns:
        * ``0`` for timestamps before the first boundary,
        * ``len(parts) - 1`` for timestamps at or after the last boundary,
        * the matching slot index otherwise.
        ``None`` is returned only when no comparison holds at all
        (presumably the case for missing timestamps such as NaT).
    """
    parts = generate_parts(date_time, time_step)

    if date_time < parts[0]:
        return 0
    # ">=" rather than ">": a timestamp exactly on the last boundary belongs to
    # the trailing slot. With ">" it matched no branch and produced None.
    if date_time >= parts[-1]:
        return len(parts) - 1

    for idx in range(len(parts) - 1):
        if parts[idx] <= date_time < parts[idx + 1]:
            return idx
    return None

    
def _parse_typing_time(column):
    """Convert one raw timestamp column to datetimes.

    ``pd.to_datetime`` is tried on the values as they are. If parsing fails,
    every value is treated as a string: ``'_'`` is replaced by a space,
    everything from the first ``'+'`` onwards is cut off, and parsing is
    retried on the cleaned strings.
    """
    try:
        return pd.to_datetime(column)
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures trigger the fallback; KeyboardInterrupt and
        # unrelated errors are not swallowed.
        cleaned = column.map(lambda x: x.replace('_', ' ').split('+')[0])
        return pd.to_datetime(cleaned)


def proc_df(df):
    """Build per-session features for one user's typing sessions.

    The input frame is not modified; all work happens on a copy.

    Steps:
        1. Drop the columns ``'Unnamed: 0'`` and ``'Unnamed: 0.1'`` when present.
        2. Parse ``start_typing_time`` and ``end_typing_time`` to datetimes.
        3. Add ``date``, ``day``, ``month``, ``session_duration`` (seconds) and
           ``part_of_day`` (``get_part_idx`` with a 30-minute step).
        4. Sort by month, day and start time; keep rows with a positive duration.
        5. For ``speed``, ``overlaps``, ``holding_time``, ``errors`` and
           ``capital_symbols`` add ``d_<name>`` (difference to the previous row
           of the same date) and ``d_d_<name>`` (difference of ``d_<name>``).
        6. For each of those 15 columns add ``std_``, ``mean_``, ``q25_``,
           ``q50_`` and ``q75_`` columns holding the statistic over all rows
           that share the same ``date`` and ``part_of_day``.

    Args:
        df: DataFrame with at least the timestamp columns and the five metric
            columns named above.

    Returns:
        A new DataFrame with the added feature columns.
    """
    df = df.copy()
    # Presumably index columns left over from earlier CSV round-trips; frames
    # that lack them are accepted instead of failing with KeyError.
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

    df['start_typing_time'] = _parse_typing_time(df['start_typing_time'])
    df['end_typing_time'] = _parse_typing_time(df['end_typing_time'])

    # Element-wise access is kept (instead of the .dt accessor) so columns that
    # were parsed to plain datetime objects keep working.
    df['date'] = df['start_typing_time'].map(lambda x: x.date())
    df['day'] = df['start_typing_time'].map(lambda x: x.day)
    df['month'] = df['start_typing_time'].map(lambda x: x.month)
    df['session_duration'] = df['end_typing_time'] - df['start_typing_time']
    df['session_duration'] = df['session_duration'].map(lambda x: x.total_seconds())
    df['part_of_day'] = df['start_typing_time'].map(lambda x: get_part_idx(x, 30))
    df = df.sort_values(['month', 'day', 'start_typing_time'])
    # Explicit copy: the columns added below must not be written into a slice.
    df = df[df['session_duration'] > 0].copy()

    diff_names = [
        'speed',
        'overlaps',
        'holding_time',
        'errors',
        'capital_symbols'
    ]
    d_diff_names = [f'd_{name}' for name in diff_names]
    dd_diff_names = [f'd_d_{name}' for name in diff_names]

    if df.empty:
        # No rows left: only the derivative columns are added; there is no
        # date/slot combination to aggregate statistics over.
        for col in d_diff_names + dd_diff_names:
            df[col] = 0.
        return df

    # First differences within each date. The first row of every date has no
    # predecessor and therefore gets NaN.
    for name, d_name in zip(diff_names, d_diff_names):
        df[d_name] = df.groupby('date')[name].diff()

    # Second differences: the same operation applied to the d_ columns.
    for d_name, dd_name in zip(d_diff_names, dd_diff_names):
        df[dd_name] = df.groupby('date')[d_name].diff()

    full_names = diff_names + d_diff_names + dd_diff_names

    # One grouped pass per statistic; transform broadcasts each group's value
    # back onto the rows of that group.
    slots = df.groupby(['date', 'part_of_day'])[full_names]
    slot_stats = [
        ('std', slots.transform('std')),
        ('mean', slots.transform('mean')),
        ('q25', slots.transform('quantile', q=0.25)),
        ('q50', slots.transform('quantile', q=0.50)),
        ('q75', slots.transform('quantile', q=0.75)),
    ]
    # Column order: all five statistics of one metric, then the next metric.
    for name in full_names:
        for prefix, values in slot_stats:
            df[f'{prefix}_{name}'] = values[name]
    return df
            

In [23]:
# Run the feature pipeline on the single-user frame built earlier in the notebook.
proced_df = proc_df(test_user_df)

In [30]:
# Run the feature pipeline separately for every distinct user_id and write one CSV per user.
for user_id in df['user_id'].unique():
    # Boolean-mask selection of this user's rows from the full frame.
    target_user_df = df[df['user_id'] == user_id]
    proced_target_df = proc_df(target_user_df)
    # Relative file name, so the file lands in the current working directory.
    # to_csv is called with its defaults, which also writes the frame's index
    # as an extra leading column.
    proced_target_df.to_csv(f'user_{user_id}_keyboard_stats.csv')