# Writing Process Task
## Phase 1 Data cleaning

In [118]:
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import ftfy

In [105]:
trainlog = pd.read_csv('data/train_logs.csv')
trainscore = pd.read_csv('data/train_scores.csv')
testlog = pd.read_csv('data/test_logs.csv')
trainlog

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240


In [106]:
trainlog['id'].unique()

array(['001519c8', '0022f953', '0042269b', ..., 'ffccd6fd', 'ffec5b38',
       'fff05981'], shape=(2471,), dtype=object)

In [107]:
trainlog['down_event'].unique()

array(['Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter',
       'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"',
       'ArrowUp', 'ArrowDown', 'Rightclick', '=', 'CapsLock', 'Control',
       'c', 'v', '/', 'Delete', ':', 'z', '[', '$', '(', ')', '+', 'Home',
       'End', '\\', 'Meta', '*', '&', 'AudioVolumeMute', 'x', '!',
       'Insert', 'MediaPlayPause', 'NumLock', '%', 'V', '>', 'Alt',
       'AudioVolumeUp', 'ContextMenu', 'AudioVolumeDown', 'a', '<',
       'PageDown', ']', 'Middleclick', '@', 'F12', 'j', '\x96', 'Dead',
       't', 's', 'n', 'y', '{', 'ScrollLock', '¿', 'Process', '}',
       'MediaTrackPrevious', 'MediaTrackNext', 'F3', '^', 'Unidentified',
       'Cancel', '2', 'i', 'd', 'r', 'e', '`', '\x9b', 'm', '#', '~',
       'PageUp', 'T', 'A', 'b', 'S', 'ModeChange', '_', 'Escape', 'F11',
       'Unknownclick', 'AltGraph', 'F10', 'h', 'F15', 'Clear', 'OS', 'F',
       'C', 'o', 'Ä±', 'f', 'u', 'w', 'p', 'g', 'M', 'l', '|',
       'â\x80\x

In [108]:
trainlog['action_time'].max()

447470

In [109]:
def label_encoding(df, col="id"):
    label_encoder = LabelEncoder()
    label_encoder.fit(df[col])
    df[col + "_encoded"] = label_encoder.transform(df[col])
    return df


# remove time that the author havent start writing or is resting
# reference: remove_margin for https://www.kaggle.com/code/tomooinubushi/1st-place-solution-training-and-inference-code

def remove_procrastination_time(df, start_margin=2*60*1000, end_margin=2*60*1000):
    df = df[df['up_event'] != 'Unidentified'].reset_index(drop=True)
    result_df = []
    grouped_df = df.groupby('id_encoded')

    for _, log in tqdm(grouped_df):
        valid_events = log[(log.activity != 'Nonproduction') & (
            log.up_event != 'Shift') & (log.up_event != 'CapsLock')].down_time.values
        if len(valid_events) == 0:
            continue
        log = log[(log.down_time > valid_events.min() - start_margin)
                  & (log['down_time'] <= valid_events.max() + end_margin)].copy()
        log['event_id'] = range(len(log))
        result_df.append(log)

    result = pd.concat(result_df, ignore_index=True)

    return result




In [110]:

train_log_df = label_encoding(trainlog)
train_log_df = remove_procrastination_time(train_log_df)
train_log_df.head(15)


100%|██████████| 2471/2471 [00:03<00:00, 736.55it/s]


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,001519c8,0,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,0
1,001519c8,1,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,0
2,001519c8,2,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,0
3,001519c8,3,106686,106777,91,Input,q,q,q,1,1,0
4,001519c8,4,107196,107323,127,Input,q,q,q,2,1,0
5,001519c8,5,107296,107400,104,Input,q,q,q,3,1,0
6,001519c8,6,107469,107596,127,Input,q,q,q,4,1,0
7,001519c8,7,107659,107766,107,Input,q,q,q,5,1,0
8,001519c8,8,107743,107852,109,Input,q,q,q,6,1,0
9,001519c8,9,107840,107978,138,Input,Space,Space,,7,1,0


## Remove Nonproduction rows

In [111]:
# remove Nonproduction rows

train_log_df = train_log_df[train_log_df['activity'] != 'Nonproduction' ].reset_index(drop=True)
train_log_df
train_log_df.down_event.unique()

array(['q', 'Space', 'Backspace', '.', ',', 'Enter', "'", ';',
       'Leftclick', '-', '?', '"', '=', 'v', '/', 'Delete', ':',
       'Rightclick', 'z', '[', '$', '(', ')', '+', '\\', '*', '&', 'x',
       '!', '%', 'V', '>', '<', ']', '@', '\x96', 't', 's', 'n', '{', '¿',
       '}', '^', 'i', 'a', 'd', 'r', 'e', '`', '\x9b', '#', '~', 'S', '_',
       'h', 'c', 'Ä±', 'u', 'o', 'p', '|', 'â\x80\x93', '\x97', 'Ë\x86',
       '¡', 'm', '\x80', 'Â´', 'Å\x9f', 'ä'], dtype=object)

## Fix timestamp Errors

In [115]:
df_sel = train_log_df.loc[
    (train_log_df['id'] == 'a0c24719') &
    (train_log_df['event_id'].between(64, 71))
]
df_sel

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
4684048,a0c24719,64,59994.0,60078.0,84,Input,q,q,q,63,10,1518
4684049,a0c24719,65,60166.0,60296.0,130,Input,q,q,q,64,10,1518
4684050,a0c24719,66,64594.0,64735.0,141,Input,Space,Space,,65,10,1518
4684051,a0c24719,67,66231.0,66337.0,106,Remove/Cut,Backspace,Backspace,,64,10,1518
4684052,a0c24719,68,66834.16,66970.16,136,Input,.,.,.,65,10,1518
4684053,a0c24719,69,66967.16,67088.16,121,Input,Space,Space,,66,10,1518
4684054,a0c24719,71,71655.16,71748.16,93,Input,q,q,q,67,11,1518


In [116]:
# According to 1st place notebook there are several bugs about the training data
# sometimes time will go back and resume
df_temp = train_log_df[train_log_df['id'] == 'a0c24719']
intervals = []
for i in range(0,df_temp.shape[0]-1):
    if df_temp.iloc[i]['activity'] == 'Remove/Cut' and df_temp.iloc[i + 1]['activity'] == 'Input':
        intervals.append(int(df_temp.iloc[i+1]['down_time'] - df_temp.iloc[i]['down_time']))
intervals = pd.Series(intervals)
intervals_mean = intervals[intervals.between(0,10000)].mean()
intervals_mean

np.float64(603.1578947368421)

**notice the timejump between event 67 and 68**

In [113]:
diff = intervals_mean + 66231 - 17831 # literals taken from above
mask = (train_log_df['id'] == 'a0c24719') & (train_log_df['event_id'] >= 68)
train_log_df.loc[mask, 'down_time'] = train_log_df.loc[mask, 'down_time'] + diff
train_log_df.loc[mask, 'up_time'] = train_log_df.loc[mask, 'up_time'] + diff

train_log_df.loc[
    (train_log_df['id'] == 'a0c24719') &
    (train_log_df['event_id'].between(64, 71))]


  train_log_df.loc[mask, 'down_time'] = train_log_df.loc[mask, 'down_time'] + diff
  train_log_df.loc[mask, 'up_time'] = train_log_df.loc[mask, 'up_time'] + diff


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
4684048,a0c24719,64,59994.0,60078.0,84,Input,q,q,q,63,10,1518
4684049,a0c24719,65,60166.0,60296.0,130,Input,q,q,q,64,10,1518
4684050,a0c24719,66,64594.0,64735.0,141,Input,Space,Space,,65,10,1518
4684051,a0c24719,67,66231.0,66337.0,106,Remove/Cut,Backspace,Backspace,,64,10,1518
4684052,a0c24719,68,66834.16,66970.16,136,Input,.,.,.,65,10,1518
4684053,a0c24719,69,66967.16,67088.16,121,Input,Space,Space,,66,10,1518
4684054,a0c24719,71,71655.16,71748.16,93,Input,q,q,q,67,11,1518


In [121]:
train_log_df.loc[train_log_df['action_time'] >= 60 *1000]

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
1131499,258ecfdc,1601,1476397.0,1554636.0,78239,Remove/Cut,Backspace,Backspace,,1287,222,365
4688145,a0c24719,4200,1858564.16,1931287.16,72723,Input,Space,Space,,2970,313,1518


## Fix unicode errors

In [123]:
cols = ['down_event', 'up_event', 'text_change']

train_log_df.loc[:, cols] = train_log_df.loc[:, cols].apply(
    lambda s: s.astype('string').map(lambda x: ftfy.fix_text(x) if x is not pd.NA else x)
)
train_log_df['down_event'].unique()

array(['q', 'Space', 'Backspace', '.', ',', 'Enter', "'", ';',
       'Leftclick', '-', '?', '"', '=', 'v', '/', 'Delete', ':',
       'Rightclick', 'z', '[', '$', '(', ')', '+', '\\', '*', '&', 'x',
       '!', '%', 'V', '>', '<', ']', '@', '–', 't', 's', 'n', '{', '¿',
       '}', '^', 'i', 'a', 'd', 'r', 'e', '`', '›', '#', '~', 'S', '_',
       'h', 'c', 'ı', 'u', 'o', 'p', '|', '—', 'ˆ', '¡', 'm', '€', '´',
       'ş', 'ä'], dtype=object)

## Discard events
### Discard those 'unidentified' and mouse clicks.

In [128]:
train_log_df['activity'].unique()

array(['Input', 'Remove/Cut', 'Replace',
       'Move From [284, 292] To [282, 290]',
       'Move From [287, 289] To [285, 287]',
       'Move From [460, 461] To [465, 466]', 'Paste',
       'Move From [905, 1314] To [907, 1316]',
       'Move From [565, 743] To [669, 847]',
       'Move From [669, 847] To [565, 743]',
       'Move From [1041, 1121] To [1496, 1576]',
       'Move From [1455, 1557] To [1323, 1425]',
       'Move From [2268, 2275] To [2247, 2254]',
       'Move From [213, 302] To [902, 991]',
       'Move From [0, 158] To [234, 392]',
       'Move From [460, 465] To [925, 930]',
       'Move From [810, 906] To [816, 912]',
       'Move From [186, 187] To [184, 185]',
       'Move From [140, 272] To [299, 431]',
       'Move From [114, 140] To [272, 298]',
       'Move From [1386, 1450] To [1445, 1509]',
       'Move From [442, 524] To [296, 378]',
       'Move From [408, 414] To [390, 396]',
       'Move From [1144, 1147] To [1142, 1145]',
       'Move From [218, 220] T

In [126]:
train_log_df['down_event'].unique()

array(['q', 'Space', 'Backspace', '.', ',', 'Enter', "'", ';',
       'Leftclick', '-', '?', '"', '=', 'v', '/', 'Delete', ':',
       'Rightclick', 'z', '[', '$', '(', ')', '+', '\\', '*', '&', 'x',
       '!', '%', 'V', '>', '<', ']', '@', '–', 't', 's', 'n', '{', '¿',
       '}', '^', 'i', 'a', 'd', 'r', 'e', '`', '›', '#', '~', 'S', '_',
       'h', 'c', 'ı', 'u', 'o', 'p', '|', '—', 'ˆ', '¡', 'm', '€', '´',
       'ş', 'ä'], dtype=object)

In [130]:
drop_events = ['LeftClick','RightClick']
train_log_df = train_log_df[~train_log_df['down_event'].isin(drop_events)]
train_log_df['event_id'] = train_log_df.groupby('id').cumcount() + 1 # reset event_id
train_log_df.reset_index(inplace=True,drop=True)
train_log_df

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,001519c8,1,106686.0,106777.0,91,Input,q,q,q,1,1,0
1,001519c8,2,107196.0,107323.0,127,Input,q,q,q,2,1,0
2,001519c8,3,107296.0,107400.0,104,Input,q,q,q,3,1,0
3,001519c8,4,107469.0,107596.0,127,Input,q,q,q,4,1,0
4,001519c8,5,107659.0,107766.0,107,Input,q,q,q,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7702042,fff05981,2772,2057097.0,2057176.0,79,Input,",",",",",",1436,240,2470
7702043,fff05981,2773,2058353.0,2058413.0,60,Input,.,.,.,1491,240,2470
7702044,fff05981,2774,2062417.0,2062555.0,138,Replace,q,q,q => q,1268,240,2470
7702045,fff05981,2775,2064657.0,2064765.0,108,Replace,q,q,q => q,1031,240,2470


In [131]:
train_log_df.to_csv('data/train_logs_clean.csv')