# Writing Process Task
## Phase 1 Data cleaning

In [38]:
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import ftfy

In [39]:
trainlog = pd.read_csv('../data/train_logs_clean.csv')
trainscore = pd.read_csv('../data/train_scores.csv')
testlog = pd.read_csv('../data/test_logs.csv')
trainlog

Unnamed: 0.1,Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0,0
1,1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0,0
2,2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1,1
3,3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1,1
4,4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0,2
5,5,4444cccc,2,184996,185052,56,Input,q,q,q,1,1,2


In [40]:
trainlog['id'].unique()

array(['0000aaaa', '2222bbbb', '4444cccc'], dtype=object)

In [41]:
trainlog['down_event'].unique()

array(['Space', 'q'], dtype=object)

In [42]:
trainlog['action_time'].max()

np.int64(94)

In [43]:
def label_encoding(df, col="id"):
    label_encoder = LabelEncoder()
    label_encoder.fit(df[col])
    df[col + "_encoded"] = label_encoder.transform(df[col])
    return df


# remove time that the author havent start writing or is resting
# reference: remove_margin for https://www.kaggle.com/code/tomooinubushi/1st-place-solution-training-and-inference-code

def remove_procrastination_time(df, start_margin=2*60*1000, end_margin=2*60*1000):
    df = df[df['up_event'] != 'Unidentified'].reset_index(drop=True)
    result_df = []
    grouped_df = df.groupby('id_encoded')

    for _, log in tqdm(grouped_df):
        valid_events = log[(log.activity != 'Nonproduction') & (
            log.up_event != 'Shift') & (log.up_event != 'CapsLock')].down_time.values
        if len(valid_events) == 0:
            continue
        log = log[(log.down_time > valid_events.min() - start_margin)
                  & (log['down_time'] <= valid_events.max() + end_margin)].copy()
        log['event_id'] = range(len(log))
        result_df.append(log)

    result = pd.concat(result_df, ignore_index=True)

    return result


In [44]:

train_log_df = label_encoding(trainlog)
train_log_df = remove_procrastination_time(train_log_df)
train_log_df.head(15)


100%|██████████| 3/3 [00:00<00:00, 1718.98it/s]


Unnamed: 0.1,Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,0,0000aaaa,0,338433,338518,85,Input,Space,Space,,0,0,0
1,1,0000aaaa,1,760073,760160,87,Input,Space,Space,,1,0,0
2,2,2222bbbb,0,711956,712023,67,Input,q,q,q,0,1,1
3,3,2222bbbb,1,290502,290548,46,Input,q,q,q,1,1,1
4,4,4444cccc,0,635547,635641,94,Input,Space,Space,,0,0,2
5,5,4444cccc,1,184996,185052,56,Input,q,q,q,1,1,2


## Remove Nonproduction rows

In [45]:
# remove Nonproduction rows

train_log_df = train_log_df[train_log_df['activity'] != 'Nonproduction' ].reset_index(drop=True)
train_log_df
train_log_df.down_event.unique()

array(['Space', 'q'], dtype=object)

## Fix timestamp Errors

In [46]:
df_sel = train_log_df.loc[
    (train_log_df['id'] == 'a0c24719') &
    (train_log_df['event_id'].between(64, 71))
]
df_sel

Unnamed: 0.1,Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded


In [47]:
# According to 1st place notebook there are several bugs about the training data
# sometimes time will go back and resume
df_temp = train_log_df[train_log_df['id'] == 'a0c24719']
intervals = []
for i in range(0,df_temp.shape[0]-1):
    if df_temp.iloc[i]['activity'] == 'Remove/Cut' and df_temp.iloc[i + 1]['activity'] == 'Input':
        intervals.append(int(df_temp.iloc[i+1]['down_time'] - df_temp.iloc[i]['down_time']))
intervals = pd.Series(intervals)
intervals_mean = intervals[intervals.between(0,10000)].mean()
intervals_mean

nan

**notice the timejump between event 67 and 68**

In [48]:
diff = intervals_mean + 66231 - 17831 # literals taken from above
mask = (train_log_df['id'] == 'a0c24719') & (train_log_df['event_id'] >= 68)
train_log_df.loc[mask, 'down_time'] = train_log_df.loc[mask, 'down_time'] + diff
train_log_df.loc[mask, 'up_time'] = train_log_df.loc[mask, 'up_time'] + diff

train_log_df.loc[
    (train_log_df['id'] == 'a0c24719') &
    (train_log_df['event_id'].between(64, 71))]


Unnamed: 0.1,Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded


In [49]:
train_log_df.loc[train_log_df['action_time'] >= 60 *1000]

Unnamed: 0.1,Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded


## Fix unicode errors

In [50]:
cols = ['down_event', 'up_event', 'text_change']

train_log_df.loc[:, cols] = train_log_df.loc[:, cols].apply(
    lambda s: s.astype('string').map(lambda x: ftfy.fix_text(x) if x is not pd.NA else x)
)
train_log_df['down_event'].unique()

array(['Space', 'q'], dtype=object)

## Discard events
### Discard those 'unidentified' and mouse clicks.

In [51]:
train_log_df['activity'].unique()

array(['Input'], dtype=object)

In [52]:
train_log_df['down_event'].unique()

array(['Space', 'q'], dtype=object)

In [53]:
drop_events = ['LeftClick','RightClick']
train_log_df = train_log_df[~train_log_df['down_event'].isin(drop_events)]
train_log_df['event_id'] = train_log_df.groupby('id').cumcount() + 1 # reset event_id
train_log_df.reset_index(inplace=True,drop=True)
train_log_df

Unnamed: 0.1,Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,id_encoded
0,0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0,0
1,1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0,0
2,2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1,1
3,3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1,1
4,4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0,2
5,5,4444cccc,2,184996,185052,56,Input,q,q,q,1,1,2


In [None]:
train_log_df.to_csv('../data/train_logs_clean.csv')