In [7]:
import pandas as pd
import numpy as np

from m4_feats_functions import *
from m5_models import *
from m7_utils import *
from m3_model_params import lgb_params_1, lgb_params_2, xgb_params
from sklearn.metrics import mean_squared_error
import pandas as pd

In [3]:
# Paths to the train and test directories
INPUT_DIR = 'kaggle/input/linking-writing-processes-to-writing-quality'
FEATURE_STORE = 'feature_store'
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
base_train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
base_test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_dir = 'feature_store/train'
test_dir = 'feature_store/test'
# Usage
seed = 42
n_repeats = 5
n_splits = 10
target_col = 'score'

In [8]:
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')
print((f'The shape of train_feats is {train_feats.shape}'))

target_col = ['score']
drop_cols = ['id']
train_cols = [col for col in train_feats.columns if col not in target_col + drop_cols]
boosting_type1 = lgb_params_1['boosting_type']
boosting_type2 = lgb_params_2['boosting_type']

_, oof_1, rmse = cv_pipeline(train_feats, test_feats, lgb_params_1, boosting_type1)
_, oof_2, rmse = cv_pipeline(train_feats, test_feats, lgb_params_2, boosting_type2)

The shape of train_feats is (2471, 496)
LGBM Average RMSE over 50 folds: 0.603452
LGBM Average RMSE over 50 folds: 0.606038


In [3]:
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_2.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_2.pkl')
train_feats = train_feats.merge(train_scores, on='id', how='left')
print((f'The shape of train_feats is {train_feats.shape}'))

target_col = ['score']
drop_cols = ['id']
train_cols = [col for col in train_feats.columns if col not in target_col + drop_cols]

missing_cols = [col for col in train_cols if col not in test_feats.columns]
missing_cols_df = pd.DataFrame({col: np.nan for col in missing_cols}, index=test_feats.index)
test_feats = pd.concat([test_feats, missing_cols_df], axis=1)

train_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
test_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
lgb_params = lgb_params_1

train_bl_results = pd.DataFrame()
test_bl_results = pd.DataFrame()

test_ids = list(test_feats['id'].values.copy())
test_preds_st, oof_results_st, rmse = cv_pipeline(train_feats, test_feats, lgb_params_1)
data = {'id': test_ids, 'prediction': test_preds_st}
test_tmp_st = pd.DataFrame(data=data)

The shape of train_feats is (2471, 496)
LGBM Average RMSE over 50 folds: 0.603452


In [19]:
def create_specific_balanced_datasets(train_scores, scores_to_split=[3, 3.5, 4, 4.5], pct_to_remv=0.2, n_datasets=2, seed=42):

    balanced_scores = []
    shuffled_scores = train_scores.copy()
    for i in range(n_datasets):
        np.random.seed(seed)
        shuffled_scores = shuffled_scores.sample(frac=1, random_state=seed + i)
        scores_to_keep = [score for score in shuffled_scores.score.unique() if score not in scores_to_split]
        ix_to_keep = []
        ix_to_keep.append(shuffled_scores[shuffled_scores['score'].isin(scores_to_keep)].index.values)

        for score in [3, 3.5, 4, 4.5]:
            tpm_scores = shuffled_scores[shuffled_scores['score']==score]
            series_len = len(tpm_scores)
            rows_to_rmv = int(series_len * pct_to_remv)
            keep_ix = tpm_scores.index.values[rows_to_rmv:]
            ix_to_keep.append(keep_ix)

        ix_to_keep = [j for i in ix_to_keep for j in i]
        temp_scores = shuffled_scores.loc[ix_to_keep].copy()
        balanced_scores.append(temp_scores)
    
    return balanced_scores

def run_lgb_cv_for_balanced_set(train_feats, test_feats, train_cols, target_col, lgb_params, balanced_dataset_ids, seed=42, n_repeats=5, n_splits=10):

    oof_results = pd.DataFrame(columns = ['id', 'score', 'prediction'])
    binned_y = np.digitize(train_feats[target_col], bins=sorted(train_feats[target_col].value_counts()))

    X = train_feats[train_cols]
    y = train_feats[target_col]
    X_test = test_feats[train_cols]

    for i in range(n_repeats):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed + i)

        for train_idx, valid_idx in skf.split(train_feats, binned_y):
            
            filtered_train_idx = train_idx[train_feats.loc[train_idx, 'id'].isin(balanced_dataset_ids)]

            X_train, y_train = X.loc[filtered_train_idx], y.loc[filtered_train_idx]
            X_valid, y_valid = X.loc[valid_idx], y.loc[valid_idx]

            model_lgb = lgb.LGBMRegressor(**lgb_params, verbose=-1, random_state=seed)
            valid_preds_lgb, test_preds_lgb = run_lgb_model(model_lgb, 
                                               X_train, y_train, 
                                               X_valid, y_valid, 
                                               X_test)
        
            tmp_df = train_feats.loc[valid_idx][['id', 'score']]
            tmp_df['prediction'] = valid_preds_lgb
            oof_results = pd.concat([oof_results, tmp_df])

    avg_preds = oof_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    rmse = mean_squared_error(avg_preds['score'], avg_preds['prediction'], squared=False)
    print(f"LGBM Average RMSE over {n_repeats * n_splits} folds: {rmse:.6f}")
    return test_preds_lgb, avg_preds, rmse


In [21]:
for pct_to_remv in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: #[0, 0.15, 0.30, 0.45, 0.60, 0.75, 0.90] # 1.050000 BEST

    for i in range(3):

    # create_specific_balanced_datasets(train_scores, scores_to_split=[3, 3.5, 4, 4.5], pct_to_remv=0.2, seed=42):
        bal_scores = create_specific_balanced_datasets(train_scores, 
                                                    scores_to_split=[3, 3.5, 4, 4.5], 
                                                    pct_to_remv=pct_to_remv,
                                                    n_datasets=2,
                                                    seed=seed+i)
        
        for ds in bal_scores:
            ids = ds.id.unique()
            test_preds, oof_results, rmse = run_lgb_cv_for_balanced_set(train_feats=train_feats,
                                                                        test_feats=test_feats,
                                                                        train_cols=train_cols,
                                                                        target_col=target_col,
                                                                        lgb_params=lgb_params,
                                                                        balanced_dataset_ids=ids)
            
            train_bl_results = pd.concat([train_bl_results, oof_results], axis=0)
            data = {'id': test_ids, 'prediction': test_preds}
            test_tmp = pd.DataFrame(data=data)
            test_bl_results = pd.concat([test_bl_results, test_tmp], axis=0)

    train_avg_blc = train_bl_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    test_avg_blc = test_bl_results.groupby(['id'])['prediction'].mean().reset_index()

    train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] < 2.5) | (train_avg_blc['prediction'] > 4.5)]
    train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
    train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    pred_full = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

    #test_bal_preds = test_avg_blc[(test_avg_blc['prediction'] < 2.5) | (test_avg_blc['prediction'] > 4.5)]
    #test_concat_results = pd.concat([test_tmp_st, test_bal_preds], axis=0)
    #test_blend_preds = test_concat_results.groupby(['id'])['prediction'].mean().reset_index()

    train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] > 4.5)]
    train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
    train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    pred_top = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))
    print(f'Percentage revmoed {pct_to_remv} - Full pred {pred_full}. Top pred {pred_top:.6f}')

LGBM Average RMSE over 50 folds: 0.606538
LGBM Average RMSE over 50 folds: 0.605953
LGBM Average RMSE over 50 folds: 0.606989
LGBM Average RMSE over 50 folds: 0.607509
LGBM Average RMSE over 50 folds: 0.608550
LGBM Average RMSE over 50 folds: 0.607122
Percentage revmoed 0.2 - Full pred 0.6031590726143912. Top pred 0.603056
LGBM Average RMSE over 50 folds: 0.609532
LGBM Average RMSE over 50 folds: 0.608586
LGBM Average RMSE over 50 folds: 0.607393
LGBM Average RMSE over 50 folds: 0.610165
LGBM Average RMSE over 50 folds: 0.610738
LGBM Average RMSE over 50 folds: 0.608191
Percentage revmoed 0.3 - Full pred 0.6031378409262801. Top pred 0.603046
LGBM Average RMSE over 50 folds: 0.612552
LGBM Average RMSE over 50 folds: 0.611449
LGBM Average RMSE over 50 folds: 0.612109
LGBM Average RMSE over 50 folds: 0.613826
LGBM Average RMSE over 50 folds: 0.615674
LGBM Average RMSE over 50 folds: 0.610964
Percentage revmoed 0.4 - Full pred 0.6030762324860133. Top pred 0.603002
LGBM Average RMSE over 50

##### Test one single instance of percentage removal

In [None]:
test_preds_st, oof_results_st, rmse = cv_pipeline(train_feats, test_feats, lgb_params_1)
data = {'id': test_ids, 'prediction': test_preds_st}
test_tmp_st = pd.DataFrame(data=data)

train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] < 2.5) | (train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
pred_full = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
pred_top = np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))
print(f'Full pred {pred_full}. Top pred {pred_top}')

LGBM Average RMSE over 50 folds: 0.603452


0.6027577504973198

In [30]:
train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] < 2.5) | (train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

0.6031404980252844

In [24]:
train_bal_preds = train_avg_blc[(train_avg_blc['prediction'] > 4.5)]
train_concat_results = pd.concat([oof_results_st, train_bal_preds], axis=0)
train_blend_preds = train_concat_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
np.sqrt(mean_squared_error(train_blend_preds['score'], train_blend_preds['prediction']))

0.60293358997378

In [None]:
results = compare_with_baseline(base_dir=FEATURE_STORE, 
                                base_train_feats=base_train_feats,
                                base_test_feats=base_test_feats,
                                baseline_metrics=0.604499,
                                train_scores=train_scores)

['IKI', 'adj_eff_time', 'rep_cut', 'action_time_gap_by_acti', 'at_by_bucket', 'action_time_gap', 'wpm_feats', 'wc_chage']
IKI
LGBM Average RMSE over 50 folds: 0.603993
Features: IKI. RMSE: 0.603993, Improvement: 0.000506
adj_eff_time
LGBM Average RMSE over 50 folds: 0.605109
Features: adj_eff_time. RMSE: 0.605109, Improvement: -0.000610
rep_cut
LGBM Average RMSE over 50 folds: 0.606213
Features: rep_cut. RMSE: 0.606213, Improvement: -0.001714
action_time_gap_by_acti
LGBM Average RMSE over 50 folds: 0.604801
Features: action_time_gap_by_acti. RMSE: 0.604801, Improvement: -0.000302
at_by_bucket
LGBM Average RMSE over 50 folds: 0.605348
Features: at_by_bucket. RMSE: 0.605348, Improvement: -0.000849
action_time_gap
LGBM Average RMSE over 50 folds: 0.604974
Features: action_time_gap. RMSE: 0.604974, Improvement: -0.000475
wpm_feats
LGBM Average RMSE over 50 folds: 0.605128
Features: wpm_feats. RMSE: 0.605128, Improvement: -0.000629
wc_chage
LGBM Average RMSE over 50 folds: 0.604975
Features

In [None]:
INPUT_DIR = 'kaggle/input/linking-writing-processes-to-writing-quality'
FEATURE_STORE = 'feature_store'
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
train_feats = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
test_feats = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
train_feats = train_feats.merge(train_scores, on=['id'], how='left')
# Usage
max_combination_length = 8 # You can adjust this to test different combination lengths
results_comb = compare_feature_combinations(base_dir=FEATURE_STORE, 
                                       base_train_feats=train_feats,
                                       base_test_feats=test_feats,
                                       baseline_metrics=0.604322,
                                       max_combination_length=max_combination_length)

Number of combinations: 255
Feature set: ('action_time_gap_by_acti', 'IKI', 'wpm_feats', 'rep_cut', 'adj_eff_time', 'at_by_bucket', 'wc_chage', 'action_time_gap')
Base train size: (2471, 419)
Merging: action_time_gap_by_acti. Size: (2471, 67)
Size after the merge: (2471, 485)
Merging: IKI. Size: (2471, 5)
Size after the merge: (2471, 489)
Merging: wpm_feats. Size: (2471, 3)
Size after the merge: (2471, 492)
Merging: rep_cut. Size: (2470, 12)
Size after the merge: (2471, 503)
Merging: adj_eff_time. Size: (2471, 19)
Size after the merge: (2471, 521)
Merging: at_by_bucket. Size: (2471, 13)
Size after the merge: (2471, 533)
Merging: wc_chage. Size: (2471, 31)
Size after the merge: (2471, 563)
Merging: action_time_gap. Size: (2471, 12)
Size after the merge: (2471, 574)
(2471, 574) (3, 573)
LGBM Average RMSE over 50 folds: 0.605169
Features: ('action_time_gap_by_acti', 'IKI', 'wpm_feats', 'rep_cut', 'adj_eff_time', 'at_by_bucket', 'wc_chage', 'action_time_gap'). RMSE: 0.605169, Improvement: 

Percentage removed 0.9 - Full pred 0.602766235492493. Top pred 0.602456


2 datasets - adding 0.5 data to each split
0.6024924800771286

2 datasets - no addition
0.6027946051309577

2 datasets - adding 0.75 data to each split
0.6028916462061885

2 low datasets
RMSE for low sample: 0.7105393172817931
RMSE for high sample: 0.525438051205524

2 low ds with 0.5
RMSE for low sample: 0.7165067818714125
RMSE for high sample: 0.5216383354968095

4 low datasets
RMSE for low sample: 0.7300616750480474
RMSE for high sample: 0.5542844778033886

3 low datasets
RMSE for low sample: 0.7162610445486091
RMSE for high sample: 0.5441948383774858

standard
RMSE for low sample: 0.7362816876835477
RMSE for high sample: 0.47489943472536583

In [None]:
 # base_train_2 = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
 # tr_ft1 = pd.read_pickle('feature_store/train/train_wpm_feats.pkl')
 # tr_ft2 = pd.read_pickle('feature_store/train/train_action_time_gap_by_acti.pkl')
 # tr_ft3 = pd.read_pickle('feature_store/train/train_IKI.pkl')
 # base_train_2 = base_train_2.merge(tr_ft1, on=['id'], how='left')
 # base_train_2 = base_train_2.merge(tr_ft2, on=['id'], how='left')
 # base_train_2 = base_train_2.merge(tr_ft3, on=['id'], how='left')
 # 
 # base_test_2 = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
 # ts_ft1 = pd.read_pickle('feature_store/test/test_wpm_feats.pkl')
 # ts_ft2 = pd.read_pickle('feature_store/test/test_action_time_gap_by_acti.pkl')
 # ts_ft3 = pd.read_pickle('feature_store/test/test_IKI.pkl')
 # base_test_2 = base_test_2.merge(ts_ft1, on=['id'], how='left')
 # base_test_2 = base_test_2.merge(ts_ft2, on=['id'], how='left')
 # base_test_2 = base_test_2.merge(ts_ft3, on=['id'], how='left')

# base_train_2.to_pickle('feature_store/base_feats/train_base_feats_2.pkl')
# base_test_2.to_pickle('feature_store/base_feats/test_base_feats_2.pkl')

#(wpm_feats, IKI, action_time_gap_by_acti)

In [None]:
base_train_2 = pd.read_pickle('feature_store/base_feats/train_base_feats_1.pkl')
tr_ft1 = pd.read_pickle('feature_store/train/train_action_time_gap_by_acti.pkl')
tr_ft2 = pd.read_pickle('feature_store/train/train_action_time_gap.pkl')
base_train_2 = base_train_2.merge(tr_ft1, on=['id'], how='left')
base_train_2 = base_train_2.merge(tr_ft2, on=['id'], how='left')

base_test_2 = pd.read_pickle('feature_store/base_feats/test_base_feats_1.pkl')
ts_ft1 = pd.read_pickle('feature_store/test/test_action_time_gap_by_acti.pkl')
ts_ft2 = pd.read_pickle('feature_store/train/train_action_time_gap.pkl')
base_test_2 = base_test_2.merge(ts_ft1, on=['id'], how='left')
base_test_2 = base_test_2.merge(ts_ft2, on=['id'], how='left')

# base_train_2.to_pickle('feature_store/base_feats/train_base_feats_2.pkl')
# base_test_2.to_pickle('feature_store/base_feats/test_base_feats_2.pkl')
print(base_train_2.shape, base_test_2.shape)

(2471, 495) (3, 385)


#### RESULTS

In [None]:
results_comb.sort_values(by='Metric').head(10)

Unnamed: 0,Feature Combination,Metric,Improvement
225,"(action_time_gap_by_acti, action_time_gap)",0.603452,0.00087
40,"(action_time_gap_by_acti, IKI, wpm_feats, rep_...",0.603481,0.000841
173,"(action_time_gap_by_acti, wpm_feats, action_ti...",0.603664,0.000658
11,"(action_time_gap_by_acti, IKI, wpm_feats, rep_...",0.603668,0.000654
131,"(IKI, wpm_feats, rep_cut, action_time_gap)",0.603718,0.000604
142,"(IKI, rep_cut, at_by_bucket, action_time_gap)",0.603842,0.00048
110,"(action_time_gap_by_acti, wpm_feats, rep_cut, ...",0.603907,0.000415
97,"(action_time_gap_by_acti, IKI, wpm_feats, acti...",0.603984,0.000338
219,"(action_time_gap_by_acti, IKI)",0.604015,0.000307
220,"(action_time_gap_by_acti, wpm_feats)",0.604043,0.000279
