In [1]:
import pandas as pd
import numpy as np
import ast

from m5_sb_models import *
from m3_model_params import xgb_params_2 as xgb_params
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Paths to the train and test directories
# INPUT_DIR = 'kaggle/input/linking-writing-processes-to-writing-quality'
FEATURE_STORE = 'feature_store'
# train_dir = 'feature_store/train'
# test_dir = 'feature_store/test'

# train_logs = pd.read_csv(f'{INPUT_DIR}/train_logs.csv')
# train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
# test_logs = pd.read_csv(f'{INPUT_DIR}/test_logs.csv')
# ss_df = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

train_feats = pd.read_pickle('feature_selection/train_feats.pkl')
test_feats = pd.read_pickle('feature_selection/test_feats.pkl')
train_feats = train_feats.sort_values('id')

lgb_params_1 = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 350,
    'verbosity': -1
    }

In [3]:
# LGBM Average RMSE over 50 folds: 0.603452
# Features: ('action_time_gap_by_acti', 'action_time_gap').

max_combination_length = 4 # You can adjust this to test different combination lengths
min_combination_length = 4
results_comb = compare_feature_combinations(base_dir=FEATURE_STORE, 
                                       base_train_feats=train_feats,
                                       base_test_feats=test_feats,
                                       params = lgb_params_1,
                                       baseline_metrics=0.604068, #(categorical_nunique, r_burst_feats, get_keys_...
                                       max_combination_length=max_combination_length,
                                       min_combination_length=min_combination_length)

results_comb.sort_values('Improvement', ascending=False).head(5)

Number of combinations: 330
Feature set: ('p_burst_feats', 'get_keys_pressed_per_second', 'word_counts_rate_of_change', 'categorical_nunique')
Train feats shape : (2471, 86)
Train feats shape : (2471, 87)
Train feats shape : (2471, 100)
Train feats shape : (2471, 103)
(2471, 103) (3, 102)


KeyboardInterrupt: 

In [None]:
# LGBM Average RMSE over 50 folds: 0.603452
# Features: ('action_time_gap_by_acti', 'action_time_gap').

max_combination_length = 3 # You can adjust this to test different combination lengths
min_combination_length = 2
results_comb = compare_feature_combinations(base_dir=FEATURE_STORE, 
                                       base_train_feats=train_feats,
                                       base_test_feats=test_feats,
                                       params = lgb_params_1,
                                       baseline_metrics=0.604472,
                                       max_combination_length=max_combination_length,
                                       min_combination_length=min_combination_length)

results_comb.sort_values('Improvement', ascending=False).head(5)

Number of combinations: 220
Feature set: ('categorical_nunique', 'essay_words')
Train feats shape : (2471, 82)
Train feats shape : (2471, 92)
(2471, 92) (3, 91)


Features: ('categorical_nunique', 'essay_words'). RMSE: 0.606690, Improvement: -0.002218
Feature set: ('categorical_nunique', 'r_burst_feats')
Train feats shape : (2471, 82)
Train feats shape : (2471, 89)
(2471, 89) (3, 88)
Features: ('categorical_nunique', 'r_burst_feats'). RMSE: 0.604206, Improvement: 0.000266
Feature set: ('categorical_nunique', 'get_keys_pressed_per_second')
Train feats shape : (2471, 82)
Train feats shape : (2471, 83)
(2471, 83) (3, 82)
Features: ('categorical_nunique', 'get_keys_pressed_per_second'). RMSE: 0.604646, Improvement: -0.000174
Feature set: ('categorical_nunique', 'vector_two_gram')
Train feats shape : (2471, 82)
Train feats shape : (2471, 98)
(2471, 98) (3, 97)
Features: ('categorical_nunique', 'vector_two_gram'). RMSE: 0.604515, Improvement: -0.000043
Feature set: ('categorical_nunique', 'count_of_activities')
Train feats shape : (2471, 82)
Train feats shape : (2471, 87)
(2471, 87) (3, 86)
Features: ('categorical_nunique', 'count_of_activities'). RMS

Unnamed: 0,Feature Combination,Metric,Improvement
64,"(categorical_nunique, r_burst_feats, get_keys_...",0.604068,0.000404
69,"(categorical_nunique, r_burst_feats, p_burst_f...",0.604074,0.000398
9,"(categorical_nunique, word_count_acceleration)",0.604175,0.000297
1,"(categorical_nunique, r_burst_feats)",0.604206,0.000266
7,"(categorical_nunique, p_burst_feats)",0.604268,0.000204


In [3]:
results = compare_with_baseline(
    base_dir=FEATURE_STORE, 
    base_train_feats=train_feats,
    base_test_feats=test_feats,
    params = lgb_params_1,
    baseline_metrics=0.604321)

results.sort_values('Improvement', ascending=False).head(5)

['word_counts_rate_of_change', 'essay_sentences', 'cursor_pos_rate_of_change', 'count_of_activities', 'p_burst_feats', 'events_counts_acceleration', 'events_counts_baseline', 'down_events_counts', 'action_time_baseline_stats', 'action_time_by_activity', 'events_counts_rate_of_change', 'essay_words', 'events_counts_time_based', 'product_to_keys', 'input_text_change_feats', 'word_count_time_based', 'cursor_pos_time_based', 'categorical_nunique']
Set of features to test: word_counts_rate_of_change
(2471, 128)
Final RMSE over 50: 0.605701. Std 0.8240
RMSE by fold 0.605587. Std 0.0118
Features: word_counts_rate_of_change. RMSE: 0.605701, Improvement: -0.001380
Set of features to test: essay_sentences
(2471, 134)
Final RMSE over 50: 0.605301. Std 0.8271
RMSE by fold 0.605178. Std 0.0123
Features: essay_sentences. RMSE: 0.605301, Improvement: -0.000980
Set of features to test: cursor_pos_rate_of_change
(2471, 127)
Final RMSE over 50: 0.604853. Std 0.8221
RMSE by fold 0.604735. Std 0.0120
Feat

Unnamed: 0,Feature Set,Metric,Improvement
7,down_events_counts,0.603829,0.000492
4,p_burst_feats,0.604314,7e-06
17,categorical_nunique,0.604352,-3.1e-05
8,action_time_baseline_stats,0.604592,-0.000271
2,cursor_pos_rate_of_change,0.604853,-0.000532


In [3]:
#train_feats = train_feats.drop(columns=['score_x','score_x'])train_feats.rename(columns={'score_x': 'score'})
#train_feats.rename(columns={'score_y': 'score'}, inplace=True)
#train_feats.to_pickle('feature_selection/train_feats_1.pkl')

In [4]:
# LGBM Average RMSE over 50 folds: 0.603452
# Features: ('action_time_gap_by_acti', 'action_time_gap').

max_combination_length = 3 # You can adjust this to test different combination lengths
min_combination_length = 3
results_comb = compare_feature_combinations(base_dir=f'{FEATURE_STORE}', 
                                       base_train_feats=train_feats,
                                       base_test_feats=test_feats,
                                       params = lgb_params_1,
                                       baseline_metrics=0.603377,
                                       max_combination_length=max_combination_length,
                                       min_combination_length=min_combination_length)

Number of combinations: 816
Feature set: ('action_time_by_activity', 'word_counts_rate_of_change', 'count_of_activities')
Base train size: (2471, 107)
(2471, 130) (3, 129)
Final RMSE over 50: 0.604869. Std 0.8272
RMSE by fold 0.604788. Std 0.0103
Features: ('action_time_by_activity', 'word_counts_rate_of_change', 'count_of_activities'). RMSE: 0.604869, Improvement: -0.001492
Feature set: ('action_time_by_activity', 'word_counts_rate_of_change', 'essay_words')
Base train size: (2471, 107)
(2471, 135) (3, 134)
Final RMSE over 50: 0.606621. Std 0.8310
RMSE by fold 0.606537. Std 0.0105
Features: ('action_time_by_activity', 'word_counts_rate_of_change', 'essay_words'). RMSE: 0.606621, Improvement: -0.003244
Feature set: ('action_time_by_activity', 'word_counts_rate_of_change', 'events_counts_acceleration')
Base train size: (2471, 107)
(2471, 137) (3, 136)
Final RMSE over 50: 0.604837. Std 0.8273
RMSE by fold 0.604731. Std 0.0116
Features: ('action_time_by_activity', 'word_counts_rate_of_cha

In [6]:
results_comb.sort_values('Improvement', ascending=False)

Unnamed: 0,Feature Combination,Metric,Improvement
651,"(get_keys_pressed_per_second, p_burst_feats, r...",0.602696,0.000681
559,"(cursor_pos_rate_of_change, p_burst_feats, cat...",0.602937,0.000440
82,"(action_time_by_activity, get_keys_pressed_per...",0.602960,0.000417
320,"(count_of_activities, p_burst_feats, product_t...",0.602973,0.000404
95,"(action_time_by_activity, p_burst_feats, produ...",0.602979,0.000398
...,...,...,...
431,"(essay_words, action_time_baseline_stats, even...",0.607957,-0.004580
382,"(essay_words, cursor_pos_rate_of_change, curso...",0.608018,-0.004641
434,"(essay_words, action_time_baseline_stats, word...",0.608051,-0.004674
366,"(essay_words, events_counts_acceleration, even...",0.608322,-0.004945


In [1]:
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')


tr_count_vector = pd.read_pickle('feature_store/train/train_count_vectorized.pkl')
ts_count_vector = pd.read_pickle('feature_store/test/test_count_vectorized.pkl')
train_feats = train_feats.merge(tr_count_vector, on='id', how='left')
train_feats = train_feats.merge(train_scores, on='id', how='left')
feats_feats = test_feats.merge(ts_count_vector, on='id', how='left')

alpha = 450

_, oof_2, rmse, model1 = lgb_pipeline(train_feats, test_feats, params)

_, oof_1, rmse, model1 = xgb_pipeline(train_feats=train_feats, 
                                        test_feats=test_feats, 
                                        xgb_params=xgb_params)

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')

tr_count_vector = pd.read_pickle('feature_store/train/train_count_vectorized.pkl')
ts_count_vector = pd.read_pickle('feature_store/test/test_count_vectorized.pkl')
train_feats = train_feats.merge(tr_count_vector, on='id', how='left')
feats_feats = test_feats.merge(ts_count_vector, on='id', how='left')

train_feats = preprocess_feats(train_feats, PowerTransformer('yeo-johnson'))
test_feats = preprocess_feats(test_feats, PowerTransformer('yeo-johnson'))
train_feats = train_feats.merge(train_scores, on='id', how='left')      


ridge_params = {'alpha': alpha}  # Create a dictionary with alpha
print(f'Alpha {alpha}')
_, _, ridge_oof_preds, _ = ridge_cv_pipeline(train_feats, test_feats, ridge_params, seed=42, n_repeats=n_repeats, n_splits=n_splits)
                                        
blend = pd.concat([oof_1, oof_2, ridge_oof_preds], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

NameError: name 'pd' is not defined

In [8]:
results_comb.sort_values('Metric').head(6)

Unnamed: 0,Feature Combination,Metric,Improvement
245,"(count_vectorized, action_time_gap, wpm_feats)",0.601765,0.002549
244,"(count_vectorized, action_time_gap, pause)",0.601846,0.002468
272,"(count_vectorized, rep_cut, action_time_gap_by...",0.601978,0.002336
90,"(at_by_activity, count_vectorized_bigrams, act...",0.602142,0.002172
24,"(count_vectorized_bigrams, action_time_gap)",0.602142,0.002172
199,"(count_vectorized_bigrams, count_vectorized, a...",0.602142,0.002172


In [7]:
results.sort_values('Improvement', ascending=False)

Unnamed: 0,Feature Set,Metric,Improvement
6,count_vectorized,0.602782,0.001532
1,action_time_gap,0.604004,0.00031
2,rep_cut,0.604101,0.000213
8,action_time_gap_by_acti,0.604244,7e-05
7,IKI,0.604575,-0.000261
3,at_by_bucket,0.605283,-0.000969
5,wpm_feats,0.605465,-0.001151
4,wc_chage,0.605597,-0.001283
0,adj_eff_time,0.605755,-0.001441


In [None]:
# Combinations of 3
results_comb.sort_values('Metric').head(6)

Unnamed: 0,Feature Combination,Metric,Improvement
245,"(count_vectorized, action_time_gap, wpm_feats)",0.601765,0.002549
244,"(count_vectorized, action_time_gap, pause)",0.601846,0.002468
272,"(count_vectorized, rep_cut, action_time_gap_by...",0.601978,0.002336
90,"(at_by_activity, count_vectorized_bigrams, act...",0.602142,0.002172
24,"(count_vectorized_bigrams, action_time_gap)",0.602142,0.002172
199,"(count_vectorized_bigrams, count_vectorized, a...",0.602142,0.002172


In [10]:
best_params = {'reg_alpha': 0.007678095440286993, 
               'reg_lambda': 0.34230534302168353, 
               'colsample_bytree': 0.627061253588415, 
               'subsample': 0.854942238828458, 
               'learning_rate': 0.038697981947473245, 
               'num_leaves': 22, 
               'max_depth': 37, 
               'min_child_samples': 18}

n_repeats = 5
n_splits = 10

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')    
#train_feats, test_feats = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)

_, oof_2, rmse, model1 = cv_pipeline(train_feats, test_feats, best_params, 'gbdt')

LGBM Average RMSE over 50 folds: 0.612893


In [5]:
from m3_model_params import lgb_params_1, xgb_params_2

n_repeats = 5
n_splits = 6

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')

#train_feats, test_feats = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)
_, oof_1, rmse, model1 = xgb_cv_pipeline(train_feats=train_feats, 
                                        test_feats=test_feats, 
                                        xgb_params=xgb_params_2, 
                                        seed=seed, 
                                        n_repeats=n_repeats, 
                                        n_splits=n_splits)
                                        
_, oof_2, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_2, lgb_params_1['boosting_type'])

blend = pd.concat([oof_1, oof_2], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

Iterations: 100%|██████████| 5/5 [06:05<00:00, 73.09s/it]


XGB Average RMSE over 30 folds: 0.607183
LGBM Average RMSE over 50 folds: 0.604164
Blend RMSE 0.603108


feats_1 - No countvectorizer
XGB Average RMSE over 30 folds: 0.605292
LGBM Average RMSE over 50 folds: 0.605479
Blend RMSE 0.603148

In [3]:
params_df = pd.read_csv('params.csv')
params_df = params_df.head(2)
params_df['params'] = params_df['params'].apply(ast.literal_eval)

for i in range(2):
    params = params_df.loc[i]['params']
    params.pop('verbose')
    params.pop('random_state')
    params['n_estimators'] = 2000


In [5]:
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')

_, oof_1, rmse, model1 = xgb_cv_pipeline(train_feats=train_feats, 
                                        test_feats=test_feats, 
                                        xgb_params=xgb_params_2, 
                                        seed=seed, 
                                        n_repeats=n_repeats, 
                                        n_splits=n_splits)

_, oof_2, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_2, lgb_params_1['boosting_type'])


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Iterations: 100%|██████████| 5/5 [01:46<00:00, 21.27s/it]

XGB Average RMSE over 50 folds: 0.601289





In [17]:
blend = pd.concat([oof_2, oof_4], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

Blend RMSE 0.600613


In [13]:
lgb_params_1 = params_df.loc[0]['params']
lgb_params_2 = params_df.loc[0]['params']

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')

train_feats, test_feats = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)
_, oof_3, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_1, lgb_params_1['boosting_type'])
_, oof_1, rmse, model1 = xgb_cv_pipeline(train_feats=train_feats, 
                                        test_feats=test_feats, 
                                        xgb_params=xgb_params, 
                                        seed=seed, 
                                        n_repeats=n_repeats, 
                                        n_splits=n_splits)

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')

train_feats, test_feats = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)
_, oof_4, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_1, lgb_params_1['boosting_type'])


blend = pd.concat([oof_1, oof_2, oof_3, oof_4], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

100%|██████████| 2471/2471 [00:04<00:00, 523.60it/s]
100%|██████████| 3/3 [00:00<00:00, 2966.27it/s]


LGBM Average RMSE over 50 folds: 0.602144


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Iterations: 100%|██████████| 5/5 [06:53<00:00, 82.64s/it] 


XGB Average RMSE over 50 folds: 0.604448


100%|██████████| 2471/2471 [00:05<00:00, 487.36it/s]
100%|██████████| 3/3 [00:00<00:00, 2452.81it/s]


KeyError: "None of [Index(['score'], dtype='object')] are in the [columns]"

In [14]:
### TO DELETE

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')

train_feats, test_feats = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)
_, oof_4, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_1, lgb_params_1['boosting_type'])
_, oof_2, rmse, model1 = xgb_cv_pipeline(train_feats=train_feats, 
                                        test_feats=test_feats, 
                                        xgb_params=xgb_params, 
                                        seed=seed, 
                                        n_repeats=n_repeats, 
                                        n_splits=n_splits)


blend = pd.concat([oof_1, oof_2, oof_3, oof_4], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

100%|██████████| 2471/2471 [00:04<00:00, 521.34it/s]
100%|██████████| 3/3 [00:00<00:00, 1652.82it/s]


LGBM Average RMSE over 50 folds: 0.601574


Iterations: 100%|██████████| 5/5 [02:51<00:00, 34.39s/it]

XGB Average RMSE over 50 folds: 0.603451
Blend RMSE 0.601227





In [19]:
from m3_model_params import lgb_params_1
_, oof_5, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_1, lgb_params_1['boosting_type'])

blend = pd.concat([oof_4, oof_3, oof_5], axis=0) # , oof_3, oof_4, oof_5, oof_6
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

LGBM Average RMSE over 50 folds: 0.602618
Blend RMSE 0.600350


In [8]:
# TEST HYPERPARAMS from search .csv
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')
params_df = pd.read_csv('params.csv')
params_df['params'] = params_df['params'].apply(ast.literal_eval)

for i in range(3):
    params = params_df.loc[i]['params']
    params.pop('verbose')
    params.pop('random_state')
    params['n_estimators'] = 2000
    _, oof_1, rmse, model1 = cv_pipeline(train_feats, test_feats, params, params['boosting_type'])

LGBM Average RMSE over 50 folds: 0.603961
LGBM Average RMSE over 50 folds: 0.604117
LGBM Average RMSE over 50 folds: 0.604946


In [None]:
# 0.602618 with feats_2 and vektorizer for 1,1 n-grams
# 0.602250 with feats_1 and vektorizer for 1,1 n-grams

In [9]:
print((f'The shape of train_feats is {train_feats.shape}'))

# lgb_params_3 = params_df.loc[1]['params']

_, oof_1, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_1, lgb_params_1['boosting_type'])
_, oof_2, rmse, model2 = cv_pipeline(train_feats, test_feats, lgb_params_2, lgb_params_2['boosting_type'])
_, oof_3, rmse, model3 = cv_pipeline(train_feats, test_feats, lgb_params_3, lgb_params_3['boosting_type'])

blend = pd.concat([oof_1, oof_2, oof_3], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse}')

The shape of train_feats is (2471, 446)
LGBM Average RMSE over 50 folds: 0.602250
LGBM Average RMSE over 50 folds: 0.602758
LGBM Average RMSE over 50 folds: 0.603808
Blend RMSE 0.601391683184638


In [10]:
print((f'The shape of train_feats is {train_feats.shape}'))

# lgb_params_3 = params_df.loc[1]['params']

_, oof_1, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_1, lgb_params_1['boosting_type'])
_, oof_2, rmse, model2 = cv_pipeline(train_feats, test_feats, lgb_params_2, lgb_params_2['boosting_type'])

blend = pd.concat([oof_1, oof_2], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse}')

The shape of train_feats is (2471, 446)
LGBM Average RMSE over 50 folds: 0.602250
LGBM Average RMSE over 50 folds: 0.602758
Blend RMSE 0.6014560217995425


New LGBM_params2 - Blend RMSE 0.602450743997947
Original 3 - Blend RMSE 0.602521633922091

Percentage removed 0.25 - Full pred 0.602333. Top pred 0.602194
Percentage removed 0.25 - Full pred 0.602161. Top pred 0.602109

feats_2 - blend LGBM 2 - 0.6026870286911109
feats_2 - blend LGBM 3 - 0.602521633922091

feats_1 - blend LGBM 3 - 0.6030926120372674

In [7]:
pct_to_remv = 0.8 
#Percentage removed 0.8 - Full pred 0.603179. Top pred 0.601455

params = [lgb_params_1, lgb_params_2, lgb_params_3]
test_ids = test_feats.id
test_bl_results, train_bl_results = pd.DataFrame(), pd.DataFrame()
blend_scores = pd.read_pickle('blend_scores_ft2.pkl')

for pct_to_remv in [0.80, 0.85, 0.95]: # 1.050000 BEST

    for i, p in enumerate(params):

        bal_scores = create_specific_balanced_datasets(train_scores, 
                                                        scores_to_split=[3, 3.5, 4, 4.5], 
                                                        pct_to_remv=pct_to_remv,
                                                        n_datasets=5,
                                                        seed=seed+i)
        
        for ds in bal_scores:
                    
                    ids = ds.id.unique()
                    test_preds, oof_results, rmse = cv_balanced_pipeline(train_feats=train_feats,
                                                                        test_feats=test_feats,
                                                                        lgb_params=p,
                                                                        balanced_dataset_ids=ids,
                                                                        boosting_type=p['boosting_type']
                                                                        )
                    
                    data = {'id': test_ids, 'prediction': test_preds}
                    test_tmp = pd.DataFrame(data=data)
                    test_bl_results = pd.concat([test_bl_results, test_tmp], axis=0)
                    train_bl_results = pd.concat([train_bl_results, oof_results], axis=0)

    train_avg_blc = train_bl_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    test_avg_blc = test_bl_results.groupby(['id'])['prediction'].mean().reset_index()

    train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] < 2.5) | (train_avg_blc['prediction'] > 4.5)]
    train_bal_preds.to_pickle('bal_scores_ft2_full.pkl')
    train_concat_results = pd.concat([blend_scores, train_bal_preds], axis=0)
    train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    pred_full = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

    train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] > 4.5)]
    train_bal_preds.to_pickle('bal_scores_ft2_top.pkl')
    train_concat_results = pd.concat([blend_scores, train_bal_preds], axis=0)
    train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    pred_top = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))
    print(f'Percentage removed {pct_to_remv} - Full pred {pred_full:.6f}. Top pred {pred_top:.6f}')

LGBM Average RMSE over 50 folds: 0.649328
LGBM Average RMSE over 50 folds: 0.646151
LGBM Average RMSE over 50 folds: 0.650191
LGBM Average RMSE over 50 folds: 0.649058
LGBM Average RMSE over 50 folds: 0.645658
LGBM Average RMSE over 50 folds: 0.656797
LGBM Average RMSE over 50 folds: 0.652602
LGBM Average RMSE over 50 folds: 0.652666
LGBM Average RMSE over 50 folds: 0.650218
LGBM Average RMSE over 50 folds: 0.654699
LGBM Average RMSE over 50 folds: 0.651360
LGBM Average RMSE over 50 folds: 0.646451
LGBM Average RMSE over 50 folds: 0.649629
LGBM Average RMSE over 50 folds: 0.657803
LGBM Average RMSE over 50 folds: 0.649285
Percentage removed 0.8 - Full pred 0.603179. Top pred 0.601455
LGBM Average RMSE over 50 folds: 0.658478
LGBM Average RMSE over 50 folds: 0.657686
LGBM Average RMSE over 50 folds: 0.660616
LGBM Average RMSE over 50 folds: 0.662334
LGBM Average RMSE over 50 folds: 0.657820
LGBM Average RMSE over 50 folds: 0.666143
LGBM Average RMSE over 50 folds: 0.664742
LGBM Average 

##### Test one single instance of percentage removal

In [None]:
test_preds_st, oof_results_st, rmse = cv_pipeline(train_feats, test_feats, lgb_params_1)
data = {'id': test_ids, 'prediction': test_preds_st}
test_tmp_st = pd.DataFrame(data=data)

train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] < 2.5) | (train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
pred_full = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
pred_top = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))
print(f'Full pred {pred_full}. Top pred {pred_top}')

LGBM Average RMSE over 50 folds: 0.603452


0.6027577504973198

In [30]:
train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] < 2.5) | (train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

0.6031404980252844

In [24]:
train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

0.60293358997378

In [14]:
# COMPARE ALL FEATURES WITH BASELINE - ONE BY ONE

lgb_params_2 = {
    'boosting_type': 'gbdt', 
    'colsample_bytree': 1.0, 
    'importance_type': 'split', 
    'learning_rate': 0.17106535627270134, 
    'max_depth': 16, 
    'min_child_samples': 39, 
    'min_child_weight': 0.001, 
    'min_split_gain': 0.0, 
    'n_jobs': None, 
    'num_leaves': 15, 
    'reg_alpha': 0.8577521098353755, 
    'reg_lambda': 0.7679447672996995, 
    'subsample': 1.0, 
    'subsample_for_bin': 200000, 
    'subsample_freq': 0
    }

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')


_, oof_2, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_2, lgb_params_1['boosting_type'])

LGBM Average RMSE over 50 folds: 0.604314


In [None]:
params_df = pd.read_csv('params.csv')
params_df = params_df.head(2)
params_df['params'] = params_df['params'].apply(ast.literal_eval)

for i in range(2):
    params = params_df.loc[i]['params']
    params.pop('verbose')
    params.pop('random_state')
    params['n_estimators'] = 2000

lgb_params_2 = params_df.loc[0]['params']

from m3_model_params import lgb_params_1, xgb_params_2

n_repeats = 5
n_splits = 6

train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')

train_feats, test_feats = countvectorize_one_one(train_logs, test_logs, train_feats, test_feats)
_, oof_1, rmse, model1 = xgb_cv_pipeline(train_feats=train_feats, 
                                        test_feats=test_feats, 
                                        xgb_params=xgb_params_2, 
                                        seed=seed, 
                                        n_repeats=n_repeats, 
                                        n_splits=n_splits)
                                        
_, oof_2, rmse, model1 = cv_pipeline(train_feats, test_feats, lgb_params_2, lgb_params_1['boosting_type'])

blend = pd.concat([oof_1, oof_2], axis=0)
blend_scores = blend.groupby(['id','score'])['prediction'].mean().reset_index()
blend_rmse = mean_squared_error(blend_scores['score'], blend_scores['prediction'], squared=False)
print(f'Blend RMSE {blend_rmse:.6f}')

100%|██████████| 2471/2471 [00:04<00:00, 553.04it/s]
100%|██████████| 3/3 [00:00<00:00, 2004.93it/s]
Iterations: 100%|██████████| 5/5 [15:27<00:00, 185.60s/it]


XGB Average RMSE over 50 folds: 0.600903
LGBM Average RMSE over 50 folds: 0.601574
Blend RMSE 0.598880


In [None]:
 # base_train_2 = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
 # tr_ft1 = pd.read_pickle('feature_store/train/train_wpm_feats.pkl')
 # tr_ft2 = pd.read_pickle('feature_store/train/train_action_time_gap_by_acti.pkl')
 # tr_ft3 = pd.read_pickle('feature_store/train/train_IKI.pkl')
 # base_train_2 = base_train_2.merge(tr_ft1, on=['id'], how='left')
 # base_train_2 = base_train_2.merge(tr_ft2, on=['id'], how='left')
 # base_train_2 = base_train_2.merge(tr_ft3, on=['id'], how='left')
 # 
 # base_test_2 = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
 # ts_ft1 = pd.read_pickle('feature_store/test/test_wpm_feats.pkl')
 # ts_ft2 = pd.read_pickle('feature_store/test/test_action_time_gap_by_acti.pkl')
 # ts_ft3 = pd.read_pickle('feature_store/test/test_IKI.pkl')
 # base_test_2 = base_test_2.merge(ts_ft1, on=['id'], how='left')
 # base_test_2 = base_test_2.merge(ts_ft2, on=['id'], how='left')
 # base_test_2 = base_test_2.merge(ts_ft3, on=['id'], how='left')

# base_train_2.to_pickle('feature_store/base_feats/train_base_feats_2.pkl')
# base_test_2.to_pickle('feature_store/base_feats/test_base_feats_2.pkl')

#(wpm_feats, IKI, action_time_gap_by_acti)

In [None]:
base_train_2 = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
tr_ft1 = pd.read_pickle('feature_store/train/train_action_time_gap_by_acti.pkl')
tr_ft2 = pd.read_pickle('feature_store/train/train_action_time_gap.pkl')
base_train_2 = base_train_2.merge(tr_ft1, on=['id'], how='left')
base_train_2 = base_train_2.merge(tr_ft2, on=['id'], how='left')

base_test_2 = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
ts_ft1 = pd.read_pickle('feature_store/test/test_action_time_gap_by_acti.pkl')
ts_ft2 = pd.read_pickle('feature_store/train/train_action_time_gap.pkl')
base_test_2 = base_test_2.merge(ts_ft1, on=['id'], how='left')
base_test_2 = base_test_2.merge(ts_ft2, on=['id'], how='left')

# base_train_2.to_pickle('feature_store/base_feats/train_base_feats_2.pkl')
# base_test_2.to_pickle('feature_store/base_feats/test_base_feats_2.pkl')
print(base_train_2.shape, base_test_2.shape)

(2471, 495) (3, 385)


#### RESULTS

In [None]:
results_comb.sort_values(by='Metric').head(10)

Unnamed: 0,Feature Combination,Metric,Improvement
225,"(action_time_gap_by_acti, action_time_gap)",0.603452,0.00087
40,"(action_time_gap_by_acti, IKI, wpm_feats, rep_...",0.603481,0.000841
173,"(action_time_gap_by_acti, wpm_feats, action_ti...",0.603664,0.000658
11,"(action_time_gap_by_acti, IKI, wpm_feats, rep_...",0.603668,0.000654
131,"(IKI, wpm_feats, rep_cut, action_time_gap)",0.603718,0.000604
142,"(IKI, rep_cut, at_by_bucket, action_time_gap)",0.603842,0.00048
110,"(action_time_gap_by_acti, wpm_feats, rep_cut, ...",0.603907,0.000415
97,"(action_time_gap_by_acti, IKI, wpm_feats, acti...",0.603984,0.000338
219,"(action_time_gap_by_acti, IKI)",0.604015,0.000307
220,"(action_time_gap_by_acti, wpm_feats)",0.604043,0.000279


In [None]:
results_comb.sort_values('Improvement', ascending=False)

#('action_time_gap', 'count_vectorized', 'action_time_gap_by_acti', 'wc_chage')
#('action_time_gap', 'count_vectorized', 'action_time_gap_by_acti', 'rep_cut')
#('action_time_gap', 'count_vectorized')
#('action_time_gap', 'count_vectorized', 'action_time_gap_by_acti', 'IKI')
#('count_vectorized', 'action_time_gap_by_acti', 'wc_chage', 'IKI')

Unnamed: 0,Feature Combination,Metric,Improvement
13,"(action_time_gap, count_vectorized, action_tim...",0.600539,0.003775
17,"(action_time_gap, count_vectorized, action_tim...",0.600864,0.003450
330,"(action_time_gap, count_vectorized)",0.601028,0.003286
16,"(action_time_gap, count_vectorized, action_tim...",0.601100,0.003214
122,"(count_vectorized, action_time_gap_by_acti, wc...",0.601112,0.003202
...,...,...,...
209,"(adj_eff_time, wpm_feats, IKI, rep_cut)",0.606174,-0.001860
365,"(wc_chage, adj_eff_time)",0.606192,-0.001878
326,"(adj_eff_time, wpm_feats, IKI)",0.606394,-0.002080
155,"(at_by_bucket, action_time_gap_by_acti, wc_cha...",0.606476,-0.002162


In [13]:
results_comb.sort_values('Improvement', ascending=False).head(15)

Unnamed: 0,Feature Combination,Metric,Improvement
13,"(action_time_gap, count_vectorized, action_tim...",0.600539,0.003775
17,"(action_time_gap, count_vectorized, action_tim...",0.600864,0.00345
330,"(action_time_gap, count_vectorized)",0.601028,0.003286
16,"(action_time_gap, count_vectorized, action_tim...",0.6011,0.003214
122,"(count_vectorized, action_time_gap_by_acti, wc...",0.601112,0.003202
1,"(action_time_gap, count_vectorized, at_by_buck...",0.601142,0.003172
127,"(count_vectorized, action_time_gap_by_acti, wp...",0.601183,0.003131
14,"(action_time_gap, count_vectorized, action_tim...",0.601221,0.003093
263,"(count_vectorized, action_time_gap_by_acti, re...",0.601315,0.002999
126,"(count_vectorized, action_time_gap_by_acti, ad...",0.601395,0.002919
