In [1]:
import os

import numpy as np
import pandas as pd
from m4_feats_functions import *

In [2]:
# Locations of the competition inputs and of the on-disk feature store.
INPUT_DIR = 'kaggle/input/linking-writing-processes-to-writing-quality'
FEAT_STORE_DIR = 'feature_store'

# DataFrame.to_pickle does not create missing parent directories, so make sure
# every feature-store folder this notebook writes to exists before any cell saves.
for _sub_dir in ('', 'train', 'test'):
    os.makedirs(os.path.join(FEAT_STORE_DIR, _sub_dir), exist_ok=True)

# Competition CSVs: train/test logs, training scores and the sample submission.
train_logs = pd.read_csv(f'{INPUT_DIR}/train_logs.csv')
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
test_logs = pd.read_csv(f'{INPUT_DIR}/test_logs.csv')
ss_df = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

In [7]:
def countvectorize_one_one(train_logs, test_logs, train_feats, test_feats):
    """Build n-gram count features for the train and test essays.

    The vectorizer (unigrams and bigrams) is fitted on the train essays only
    and the learned vocabulary is re-used for the test essays, so that column
    ``tok_i`` refers to the same n-gram in both returned frames.

    Parameters
    ----------
    train_logs, test_logs
        Log frames handed to ``getEssays`` to rebuild the essays.
    train_feats, test_feats
        Frames whose ``id`` column labels the rows of the output.

    Returns
    -------
    tuple
        ``(train_feats, test_feats)``; each is the ``id`` column followed by
        the ``tok_*`` count columns.
    """
    c_vect = CountVectorizer(ngram_range=(1, 2))

    essays = getEssays(train_logs)
    # toarray() yields a plain ndarray (todense() would give the legacy np.matrix).
    tr_toks = c_vect.fit_transform(essays['essay']).toarray()
    tok_cols = [f'tok_{i}' for i in range(tr_toks.shape[1])]
    tr_toks_df = pd.DataFrame(columns=tok_cols, data=tr_toks)
    # NOTE(review): ids are attached by index alignment; this assumes the rows
    # of getEssays() follow the same order/index as train_feats - confirm.
    train_feats = pd.concat([train_feats['id'], tr_toks_df], axis=1)

    test_essay = getEssays(test_logs)
    # transform (not fit_transform): refitting on the test essays would learn a
    # different vocabulary, making the tok_* columns incomparable with train.
    ts_toks = c_vect.transform(test_essay['essay']).toarray()
    ts_toks_df = pd.DataFrame(columns=tok_cols, data=ts_toks)
    test_feats = pd.concat([test_feats['id'], ts_toks_df], axis=1)
    return train_feats, test_feats

# Load the cached base features (only their `id` column is used by the
# vectorizer helper) and build the token-count frames.
# The path is built from FEAT_STORE_DIR so it stays in sync with the config cell.
train_feats = pd.read_pickle(f'{FEAT_STORE_DIR}/base_feats/train_base_feats_1.pkl')
test_feats = pd.read_pickle(f'{FEAT_STORE_DIR}/base_feats/test_base_feats_1.pkl')
train_, test_ = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)

100%|██████████| 2471/2471 [00:04<00:00, 552.56it/s]
100%|██████████| 3/3 [00:00<00:00, 2875.44it/s]


In [8]:
def countvectorize_one_one(train_logs, test_logs, train_feats, test_feats):
    """Unigram count features computed over train and test essays together.

    The train and test logs are stacked, essays are rebuilt with
    ``getEssays`` and a single ``CountVectorizer`` is fitted on all of them.
    The resulting ``tok_*`` columns are placed next to the stacked ``id``
    column and the rows are then split back by id membership.

    Returns a ``(train, test)`` pair of frames holding ``id`` plus ``tok_*``.
    """
    train_ids, test_ids = train_feats.id, test_feats.id

    all_logs = pd.concat([train_logs, test_logs], axis=0)
    all_ids = pd.concat([train_feats['id'], test_feats['id']], axis=0).reset_index(drop=True)

    essays = getEssays(all_logs)
    vectorizer = CountVectorizer(ngram_range=(1, 1))
    counts = vectorizer.fit_transform(essays['essay']).todense()
    token_columns = [f'tok_{col}' for col in range(counts.shape[1])]
    toks_df = pd.DataFrame(data=counts, columns=token_columns).reset_index(drop=True)
    # Shape check: both objects should have the same number of rows.
    print(toks_df.shape, all_ids.shape)

    # Column-wise concat aligns on the (reset) positional index.
    combined = pd.concat([all_ids, toks_df], axis=1)

    is_train = combined['id'].isin(train_ids)
    is_test = combined['id'].isin(test_ids)
    return combined[is_train], combined[is_test]

# Load the cached base features (only their `id` column is used by the
# vectorizer helper) and build the token-count frames.
# The path is built from FEAT_STORE_DIR so it stays in sync with the config cell.
train_feats = pd.read_pickle(f'{FEAT_STORE_DIR}/base_feats/train_base_feats_1.pkl')
test_feats = pd.read_pickle(f'{FEAT_STORE_DIR}/base_feats/test_base_feats_1.pkl')
train_, test_ = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)

100%|██████████| 2474/2474 [00:04<00:00, 572.73it/s]


(2474, 27) (2474,)


In [9]:
# Display the current `train_` frame (pandas truncates the rendering of large frames).
train_

Unnamed: 0,id,tok_0,tok_1,tok_2,tok_3,tok_4,tok_5,tok_6,tok_7,tok_8,...,tok_17,tok_18,tok_19,tok_20,tok_21,tok_22,tok_23,tok_24,tok_25,tok_26
0,001519c8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0022f953,53,46,36,37,21,19,13,19,8,...,0,0,0,0,0,0,0,0,0,0
2,0042269b,61,88,67,42,23,12,5,1,4,...,0,0,0,0,0,0,0,0,0,0
3,0059420b,64,70,61,49,29,37,31,32,6,...,0,0,0,0,0,0,0,0,0,0
4,0075873a,44,31,43,19,22,23,2,2,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,78,49,45,37,27,20,17,26,10,...,0,0,0,0,0,0,0,0,0,0
2467,ffbef7e5,83,69,48,34,26,24,9,11,13,...,0,0,0,0,0,0,0,0,0,0
2468,ffccd6fd,72,56,56,47,23,21,14,13,8,...,0,0,0,0,0,0,0,0,0,0
2469,ffec5b38,49,39,55,37,19,26,13,11,3,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Display the current `test_` frame.
# NOTE(review): the recorded output lists base-feature columns (event_id_max, ...)
# that the countvectorize_one_one definitions in this notebook do not return, and
# the execution count is older than the neighbouring cells - the output looks
# stale; re-run after a kernel restart.
test_

Unnamed: 0,id,event_id_max,up_time_max,action_time_max,action_time_min,action_time_mean,action_time_std,action_time_quantile,action_time_sem,action_time_sum,...,tok_17,tok_18,tok_19,tok_20,tok_21,tok_22,tok_23,tok_24,tok_25,tok_26


In [8]:
# Persist the token-count frames currently bound to `train_` / `test_`.
# NOTE(review): the file name says "bigrams", but the most recent definition of
# countvectorize_one_one in this notebook uses ngram_range=(1, 1) (unigrams only)
# - confirm which variant these pickles are meant to hold.
file_name = 'count_vectorized_bigrams'

train_.to_pickle(f'{FEAT_STORE_DIR}/train_{file_name}.pkl')
test_.to_pickle(f'{FEAT_STORE_DIR}/test_{file_name}.pkl')

In [3]:
# Build the merged feature table step by step. Every step computes a per-id
# frame for train and test (temporarily bound to `train_` / `test_`) and
# left-merges it onto `train_feats` / `test_feats` on `id`.

# Base features; columns that contain any NaN in TRAIN are dropped from both splits.
preprocessor = Preprocessor(seed=42)
train_feats = preprocessor.make_feats(train_logs)
test_feats = preprocessor.make_feats(test_logs)
nan_cols = train_feats.columns[train_feats.isna().any()].tolist()
train_feats = train_feats.drop(columns=nan_cols)
test_feats = test_feats.drop(columns=nan_cols)

# Features from process_action_time_activity.
train_, test_ = process_action_time_activity(train_logs, test_logs)
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')

# Word-length features computed on the reconstructed essays.
train_essay = getEssays(train_logs)
test_essay = getEssays(test_logs)
train_ = create_word_length_features(train_essay, 'essay_words', 'id', 'words')
test_ = create_word_length_features(test_essay, 'essay_words', 'id', 'words')
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')

# Sentence-level aggregations.
train_sent_df = split_essays_into_sentences(train_essay)
train_ = compute_sentence_aggregations(train_sent_df)
test_ = compute_sentence_aggregations(split_essays_into_sentences(test_essay))
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')

# Paragraph-level aggregations.
train_paragraph_df = split_essays_into_paragraphs(train_essay)
train_ = compute_paragraph_aggregations(train_paragraph_df)
test_ = compute_paragraph_aggregations(split_essays_into_paragraphs(test_essay))
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')
##### feat_1
# Features from process_feats_time_gap_activity.
train_, test_ = process_feats_time_gap_activity(train_logs, test_logs)
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')

# Features from process_feats_action_time_gap.
train_, test_ = process_feats_action_time_gap(train_logs, test_logs)
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')
##### feat_2
# Last expression: show the (rows, columns) of the merged train table.
train_feats.shape

Engineering time data
Engineering cursor position data
Engineering word count data
Engineering statistical summaries for features


100%|██████████| 33/33 [01:29<00:00,  2.72s/it, column=word_count_change100, method=kurt]         


Engineering activity counts data


100%|██████████| 2471/2471 [00:00<00:00, 12514.69it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering event counts data


100%|██████████| 2471/2471 [00:00<00:00, 13338.93it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)
100%|██████████| 2471/2471 [00:00<00:00, 9903.07it/s] 
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering text change counts data


100%|██████████| 2471/2471 [00:00<00:00, 13499.24it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering punctuation counts data


100%|██████████| 2471/2471 [00:00<00:00, 13254.54it/s]


Engineering input words data


  feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
  feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
  feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
  feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']


Engineering ratios data
Engineering time data
Engineering cursor position data
Engineering word count data
Engineering statistical summaries for features


100%|██████████| 33/33 [00:01<00:00, 22.36it/s, column=word_count_change100, method=kurt]         


Engineering activity counts data


100%|██████████| 3/3 [00:00<00:00, 39568.91it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering event counts data


100%|██████████| 3/3 [00:00<00:00, 28992.88it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)
100%|██████████| 3/3 [00:00<00:00, 26944.14it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering text change counts data


100%|██████████| 3/3 [00:00<00:00, 37008.56it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering punctuation counts data


100%|██████████| 3/3 [00:00<00:00, 23087.91it/s]
  feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
  feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
  feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
  feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']


Engineering input words data
Engineering ratios data


100%|██████████| 2471/2471 [00:04<00:00, 564.51it/s]
100%|██████████| 3/3 [00:00<00:00, 2775.23it/s]


(2471, 375)

In [9]:
# Left-merge whatever is currently bound to `train_` / `test_` onto the feature tables.
# NOTE(review): `train_` / `test_` are rebound by many cells, so the result depends
# on which cell ran last (hidden kernel state); re-running this cell merges the
# same frame again. Bind the intended frame to an explicit name before merging.
train_feats = train_feats.merge(train_, on='id', how='left')
test_feats = test_feats.merge(test_, on='id', how='left')
train_feats.shape

(2471, 495)

In [10]:
# Persist the merged feature tables as the "feats_2_1" snapshot.
train_feats.to_pickle(f'{FEAT_STORE_DIR}/train_feats_2_1.pkl')
test_feats.to_pickle(f'{FEAT_STORE_DIR}/test_feats_2_1.pkl')

In [5]:
# Scratch cell: rebuilds the essays and computes three feature sets in a row.
# NOTE(review): each assignment below rebinds `train_` / `test_`, so only the
# result of the LAST call (process_feats_time_gap_activity) survives this cell;
# the first two results are discarded. Nothing is saved here; `file_name` is
# only set for use by a later cell.
file_name = 'essay_words_feats'

train_essay = getEssays(train_logs)
test_essay = getEssays(test_logs)
train_ = create_word_length_features(train_essay, 'essay_words', 'id', 'words')
test_ = create_word_length_features(test_essay, 'essay_words', 'id', 'words')

train_, test_ = process_feats_action_time_gap(train_logs, test_logs)

train_, test_ = process_feats_time_gap_activity(train_logs, test_logs)


100%|██████████| 2471/2471 [00:03<00:00, 630.93it/s]
100%|██████████| 3/3 [00:00<00:00, 3413.70it/s]


In [None]:
# Compute the base features with Preprocessor and cache them in the feature store.
file_name = 'base_feats'

preprocessor = Preprocessor(seed=42)
train_feats = preprocessor.make_feats(train_logs)
test_feats = preprocessor.make_feats(test_logs)
# Columns with any NaN in TRAIN are removed from both splits so the schemas match.
nan_cols = train_feats.columns[train_feats.isna().any()].tolist()
train_feats = train_feats.drop(columns=nan_cols)
test_feats = test_feats.drop(columns=nan_cols)

train_feats.to_pickle(f'{FEAT_STORE_DIR}/train_{file_name}.pkl')
test_feats.to_pickle(f'{FEAT_STORE_DIR}/test_{file_name}.pkl')

Engineering time data
Engineering cursor position data
Engineering word count data
Engineering statistical summaries for features


100%|██████████| 33/33 [01:24<00:00,  2.57s/it, column=word_count_change100, method=kurt]         


Engineering activity counts data


100%|██████████| 2471/2471 [00:00<00:00, 14360.23it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering event counts data


100%|██████████| 2471/2471 [00:00<00:00, 12737.11it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)
100%|██████████| 2471/2471 [00:00<00:00, 11716.33it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering text change counts data


100%|██████████| 2471/2471 [00:00<00:00, 14317.88it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering punctuation counts data


100%|██████████| 2471/2471 [00:00<00:00, 13897.94it/s]


Engineering input words data


  feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
  feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
  feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
  feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']


Engineering ratios data
Engineering time data
Engineering cursor position data
Engineering word count data
Engineering statistical summaries for features


100%|██████████| 33/33 [00:01<00:00, 23.38it/s, column=word_count_change100, method=kurt]         


Engineering activity counts data


100%|██████████| 3/3 [00:00<00:00, 18808.54it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering event counts data


100%|██████████| 3/3 [00:00<00:00, 38956.38it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)
100%|██████████| 3/3 [00:00<00:00, 36054.19it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering text change counts data


100%|██████████| 3/3 [00:00<00:00, 31855.47it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)


Engineering punctuation counts data


100%|██████████| 3/3 [00:00<00:00, 38362.54it/s]
  feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
  feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
  feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
  feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']


Engineering input words data
Engineering ratios data


In [8]:
# Word-length features of the reconstructed essays, cached in the feature store.
# `file_name` is set here explicitly: relying on the value left behind by an
# earlier cell is fragile, and when the notebook is run top to bottom that value
# is 'base_feats', which would overwrite the base-feature pickles.
file_name = 'essay_words_feats'
train_essay_words = create_word_length_features(train_essay, 'essay_words', 'id', 'words')
test_essay_words = create_word_length_features(test_essay, 'essay_words', 'id', 'words')
train_essay_words.to_pickle(f'{FEAT_STORE_DIR}/train_{file_name}.pkl')
test_essay_words.to_pickle(f'{FEAT_STORE_DIR}/test_{file_name}.pkl')

In [8]:
# Features from action_time_by_bucket_feats, cached directly under FEAT_STORE_DIR.
file_name = 'at_by_bucket'
train_action_buckets, test_action_buckets = action_time_by_bucket_feats(train_logs, test_logs)
train_action_buckets.to_pickle(f'{FEAT_STORE_DIR}/train_{file_name}.pkl')
test_action_buckets.to_pickle(f'{FEAT_STORE_DIR}/test_{file_name}.pkl')

In [9]:
# Features from process_action_time_activity, cached directly under FEAT_STORE_DIR.
file_name = 'at_by_activ'
train_at_by_act, test_at_by_act = process_action_time_activity(train_logs, test_logs)
train_at_by_act.to_pickle(f'{FEAT_STORE_DIR}/train_{file_name}.pkl')
test_at_by_act.to_pickle(f'{FEAT_STORE_DIR}/test_{file_name}.pkl')

In [11]:
# Features from process_adjusted_eff_time.
# From this cell on, pickles go to the train/ and test/ sub-folders of the feature store.
file_name = 'adj_eff_time'
train_adj_eff_time, test_adj_eff_time = process_adjusted_eff_time(train_logs, test_logs)
train_adj_eff_time.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_adj_eff_time.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

In [13]:
# Features from process_re_cut_essays, cached per split.
file_name = 'rep_cut'
train_rep_cut, test_rep_cut = process_re_cut_essays(train_logs, test_logs)
train_rep_cut.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_rep_cut.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

100%|██████████| 2470/2470 [00:00<00:00, 4126.84it/s]


In [15]:
# Features from process_feats_action_time_gap, cached per split.
file_name = 'action_time_gap'
train_at_gap, test_at_gap = process_feats_action_time_gap(train_logs, test_logs)
train_at_gap.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_at_gap.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

In [17]:
# Features from process_feats_time_gap_activity, cached per split.
# NOTE(review): this rebinds `train_feats` / `test_feats`, the names other cells
# use for the merged feature tables - later cells see these frames instead.
file_name = 'action_time_gap_by_acti'
train_feats, test_feats = process_feats_time_gap_activity(train_logs, test_logs)
train_feats.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_feats.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

In [3]:
# Per-id pause features: calculate_pause_features is applied to each id's log
# rows, and reset_index() turns the group key back into a regular `id` column.
file_name = 'IKI'
train_IKI = train_logs.groupby(['id']).apply(calculate_pause_features).reset_index()
test_IKI = test_logs.groupby(['id']).apply(calculate_pause_features).reset_index()
train_IKI.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_IKI.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

In [3]:
# Features from create_feats_wc_change, computed separately per split.
# NOTE(review): 'train_wc_chage' looks like a typo for 'wc_change' and already
# carries a 'train_' prefix, so the files are written as train_train_wc_chage.pkl
# and test_train_wc_chage.pkl - confirm with downstream loaders before renaming.
file_name = 'train_wc_chage'
train_feats = create_feats_wc_change(train_logs)
test_feats = create_feats_wc_change(test_logs)
train_feats.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_feats.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

In [4]:
# Features from wpm_feats, cached per split (rebinds train_feats / test_feats).
file_name = 'wpm_feats'
train_feats, test_feats = wpm_feats(train_logs, test_logs)
train_feats.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_feats.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

In [7]:
# Features from essay_paste_words, cached per split (rebinds train_feats / test_feats).
file_name = 'essay_paste_words'
train_feats, test_feats = essay_paste_words(train_logs, test_logs)
train_feats.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_feats.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

100%|██████████| 317/317 [00:00<00:00, 16520.80it/s]


In [6]:
# Rebuild the essays once per split. The train essays used to be rebuilt a
# second time further down, repeating the slowest step of this cell.
train_essays = getEssays(train_logs)
test_essays = getEssays(test_logs)

# Sentence features for train dataset
train_sent_df = split_essays_into_sentences(train_essays)
train_sent_agg_df = compute_sentence_aggregations(train_sent_df)

# Paragraph features for train dataset
train_paragraph_df = split_essays_into_paragraphs(train_essays)
train_paragraph_agg_df = compute_paragraph_aggregations(train_paragraph_df)

# Features for test dataset
test_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(test_essays))
test_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(test_essays))

# Cache the sentence-level aggregations per split.
file_name = 'essay_sen'
train_sent_agg_df.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_sent_agg_df.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

# Cache the paragraph-level aggregations per split.
file_name = 'essay_par'
train_paragraph_agg_df.to_pickle(f'{FEAT_STORE_DIR}/train/train_{file_name}.pkl')
test_paragraph_agg_df.to_pickle(f'{FEAT_STORE_DIR}/test/test_{file_name}.pkl')

100%|██████████| 2471/2471 [00:04<00:00, 578.22it/s]
100%|██████████| 3/3 [00:00<00:00, 3821.11it/s]
100%|██████████| 2471/2471 [00:04<00:00, 613.59it/s]
