# Data

### Data can be downloaded using the package


In [None]:
# Download the raw keystroke archive using the package's own helper.
# Presumably this fetches DATA_URL and stores it at ZIP_FILEPATH (argument roles
# inferred from the names — confirm against keystrokes.data.download_utils).
# The archive is large (the directory listing further down shows 1.5G), so expect
# this cell to take a while.
from keystrokes.data.download_utils import download_data
from keystrokes.utils.path_utils import DATA_URL, ZIP_FILEPATH
download_data(DATA_URL, ZIP_FILEPATH)

In [3]:
# Default working directory: ~/.keystrokes
# Set the environment variable KEYSTROKES_WORKING_FOLDER to change the working directory.
# Data will be saved at {KEYSTROKES_WORKING_FOLDER}/data/raw
# NOTE: the listing below is hard-coded to the default location; adjust the path
# if KEYSTROKES_WORKING_FOLDER is set to something else.
!ls -hs ~/.keystrokes/data/raw/

total 1.5G
1.5G data.zip


# Keystrokes Raw Data Sample

In [17]:
from keystrokes.data.data_utils import (
    list_keystroke_files_in_zip,
    read_csv_from_zip,
)
# Imported here as well so this cell runs on a fresh kernel even when the slow
# download cell is skipped because the archive is already on disk.
from keystrokes.utils.path_utils import ZIP_FILEPATH

# Table describing the keystroke files inside the archive; the `filename`
# column is used below to pick one file.
user_files_df = list_keystroke_files_in_zip(zip_filepath=ZIP_FILEPATH)

# Read the file from the row labelled 0 as a sample of the raw data.
df_csv = read_csv_from_zip(
    zip_filepath=ZIP_FILEPATH,
    file_within_zip=user_files_df.loc[0, "filename"],
)
df_csv

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,PRESS_TIME,RELEASE_TIME,KEYCODE
0,5,7,1471934383592,1471934383760,16
1,5,7,1471934383701,1471934383760,84
2,5,7,1471934383838,1471934383910,72
3,5,7,1471934383910,1471934383991,69
4,5,7,1471934384054,1471934384138,32
...,...,...,...,...,...
625,5,83,1471934546993,1471934547098,70
626,5,83,1471934547207,1471934547267,69
627,5,83,1471934547361,1471934547433,69
628,5,83,1471934547495,1471934547647,16


# Note about Data Used for Modeling

While the raw data is a full-resolution keystroke data set, modeling only needs a reduced version containing the median times for a subset of (keystroke, next keystroke) pairs.

In [19]:
from sklearn.pipeline import Pipeline

from keystrokes.transformers.key_code_median_transformer import KeyCodeMedianTransformer
from keystrokes.transformers.keyboard_event_transformer import KeyboardEventTransformer

# Two-step reduction of the raw sample. For this input the displayed result has
# one row per (KEYCODE, NEXT_KEYCODE) pair with the median press-to-press time
# and the median hold time.
keystroke_step = ("keystroke", KeyboardEventTransformer())
median_step = ("median", KeyCodeMedianTransformer())

pipeline = Pipeline(steps=[keystroke_step, median_step])

# transform() is called directly, with no preceding fit().
pipeline.transform(df_csv)

Unnamed: 0,KEYCODE,NEXT_KEYCODE,MEDIAN_PRESS_PRESS_TIME,MEDIAN_HOLD_TIME
0,8,8,137.0,61.0
1,8,32,170.0,59.0
2,8,66,208.0,83.0
3,8,67,106.0,60.0
4,8,68,132.0,117.0
...,...,...,...,...
235,191,8,401.0,135.0
236,191,16,3469.0,127.0
237,222,76,166.0,104.0
238,222,83,123.0,92.0
