In [1]:
import lightgbm as lgb
import pandas as pd
from m5_sb_models import *

In [2]:
INPUT_DIR = 'kaggle/input/linking-writing-processes-to-writing-quality'
FEAT_STORE_DIR = 'feat_store_combined'
train_logs = pd.read_csv(f'{INPUT_DIR}/train_logs.csv')
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
test_logs = pd.read_csv(f'{INPUT_DIR}/test_logs.csv')
ss_df = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

# logs = pd.concat([train_logs, test_logs], axis=0)

In [3]:
import polars as pl
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

In [6]:
def create_integrated_iki(train_logs, test_logs):
    print("integrated IKI")    
    iki = []
    for data in [train_logs, test_logs]:
        logs = data.select(pl.col(['id','action_time'])).clone()
        logs = logs.with_columns(
            pl.col('action_time').diff()
            .over('id')
            .alias('iki')
            .fill_null(0)
        )

        logs = logs.with_columns(
            pl.col('action_time')
            .mean()
            .over('id')
            .alias('action_time_mean')
        )

        logs = logs.with_columns(
            (pl.col('iki') - pl.col('action_time'))
            .alias('mean_centering')
        )

        logs = logs.with_columns(
            pl.col('mean_centering')
            .cum_sum()
            .over('id')
            .alias('iki_integrated')
        )

        logs = logs.select(pl.col(['id','iki_integrated']))
        iki.append(logs)

    return iki[0], iki[1]


def integrated_iki(train_logs, test_logs):
    print("integrated IKI")    
    feats = []
    create_integrated_iki

    for data in [train_logs, test_logs]:
        logs = data.clone()
        logs = create_integrated_iki(logs)

        iki_stats = logs.group_by(['id']).agg(
                        iki_stats_count = pl.col('iki_integrated').count(),
                        iki_stats_mean = pl.col('iki_integrated').mean(),
                        iki_stats_sum = pl.col('iki_integrated').sum(),
                        iki_stats_std = pl.col('iki_integrated').std(),
                        iki_stats_max = pl.col('iki_integrated').max(),
                        iki_stats_min = pl.col('iki_integrated').min(),
                        iki_stats_median = pl.col('iki_integrated').median()
        )
        feats.append(iki_stats)
    return feats[0], feats[1]

tr_i_i, ts_i_i =  integrated_iki(train_logs, test_logs)

integrated IKI


In [10]:
from m3_model_params import lgb_params_1
train_feats = tr_i_i.collect().to_pandas().merge(train_scores.collect().to_pandas(), on='id', how='left')
test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, train_feats.drop(columns='score'), lgb_params_1)



Final RMSE over 50: 0.830242. Std 0.7551
RMSE by fold 0.830156. Std 0.0121


In [None]:
def calculate_fluctuations(iki_integrated, q, bin_sizes):
    Fq = np.zeros(len(bin_sizes))
    for i, s in enumerate(bin_sizes):
        segments = int(np.floor(len(iki_integrated) / s))
        rms = np.zeros(segments)
        for v in range(segments):
            segment = iki_integrated[v * s: (v + 1) * s]
            trend = np.polyfit(np.arange(s), segment, 1)  # linear fit (trend)
            detrended = segment - np.polyval(trend, np.arange(s))
            rms[v] = np.sqrt(np.mean(detrended ** 2))
        Fq[i] = (np.mean(rms ** q)) ** (1 / q) if q != 0 else np.exp(0.5 * np.mean(np.log(rms ** 2)))
    return Fq

def mfdfla_for_series(series, q_values, bin_sizes):
    results = []
    for q in q_values:
        Fq_values = calculate_fluctuations(series, q, bin_sizes)
        results.extend(Fq_values)
    return results

def process_group(series, q_values, bin_sizes):
    return mfdfla_for_series(series, q_values, bin_sizes)

def calculate_selected_fluctuations_parallel(df, q_values, bin_sizes, n_jobs=-1):
    grouped = df.groupby('id')['iki_integrated']
    results = Parallel(n_jobs=n_jobs)(delayed(process_group)(group, q_values, bin_sizes) for name, group in grouped)
    feats = pd.DataFrame(results, index=[name for name, group in grouped])
    feats.reset_index(inplace=True)
    columns = ['id'] + [f'Fq_q{q}_bin{s}' for q in q_values for s in bin_sizes]
    feats.columns = columns
    return feats

q_values = np.linspace(-3, 3, 2)
bin_sizes = [1500, 2500, 3500]
features_df = calculate_selected_fluctuations_parallel(iki_int_df, q_values, bin_sizes)

In [90]:
logs.collect()

id,iki_integrated
str,i64
"""001519c8""",-31
"""001519c8""",-62
"""001519c8""",-466
"""001519c8""",-466
"""001519c8""",-557
"""001519c8""",-684
"""001519c8""",-788
"""001519c8""",-915
"""001519c8""",-1022
"""001519c8""",-1131


In [4]:
import polars as pl

def integrated_iki(train_logs, test_logs):

    feats = []
    for data in [train_logs, test_logs]:
        logs = data[['id', 'action_time']].clone()
        logs = logs.group_by('id').pl.diff(pl.col('action_time')).alias('iki')


    logs = train_logs.copy()
    logs['iki'] = logs.groupby('id')['action_time'].diff().fillna(0)
    action_time_mean = logs.groupby('id')['action_time'].mean().reset_index()
    action_time_mean.columns = ['id', 'action_time_mean']
    logs = logs.merge(action_time_mean, on='id', how='left')
    logs['mean_centering'] = logs['iki'] - logs['action_time_mean']
    logs['iki_integrated'] = logs.groupby('id')['mean_centering'].cumsum()
    iki_int_df = logs[['id', 'iki_integrated']]

In [5]:
added_feats_list = ['train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_cursor_pos_acceleration.pkl',
 'train_count_of_activities.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_events_counts_acceleration.pkl',
 'train_essay_sentences.pkl',
 'train_word_counts_rate_of_change.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_cursor_pos_acceleration.pkl',
 'train_essay_sentences.pkl',
 'train_categorical_nunique.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_r_burst_feats.pkl']


In [6]:
counts = {}
for item in added_feats_list:
    counts[item] = counts.get(item, 0) + 1


In [32]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def calculate_fluctuations(iki_integrated, q, bin_sizes):
    Fq = np.zeros(len(bin_sizes))
    for i, s in enumerate(bin_sizes):
        segments = int(np.floor(len(iki_integrated) / s))
        rms = np.zeros(segments)
        for v in range(segments):
            segment = iki_integrated[v * s: (v + 1) * s]
            trend = np.polyfit(np.arange(s), segment, 1)  # linear fit (trend)
            detrended = segment - np.polyval(trend, np.arange(s))
            rms[v] = np.sqrt(np.mean(detrended ** 2))
        Fq[i] = (np.mean(rms ** q)) ** (1 / q) if q != 0 else np.exp(0.5 * np.mean(np.log(rms ** 2)))
    return Fq

def mfdfla_for_series(series, q_values, bin_sizes):
    results = []
    for q in q_values:
        Fq_values = calculate_fluctuations(series, q, bin_sizes)
        results.extend(Fq_values)
    return results

def process_group(series, q_values, bin_sizes):
    return mfdfla_for_series(series, q_values, bin_sizes)

def calculate_selected_fluctuations_parallel(df, q_values, bin_sizes, n_jobs=-1):
    grouped = df.groupby('id')['iki_integrated']
    results = Parallel(n_jobs=n_jobs)(delayed(process_group)(group, q_values, bin_sizes) for name, group in grouped)
    feats = pd.DataFrame(results, index=[name for name, group in grouped])
    feats.reset_index(inplace=True)
    columns = ['id'] + [f'Fq_q{q}_bin{s}' for q in q_values for s in bin_sizes]
    feats.columns = columns
    return feats

q_values = np.linspace(-3, 3, 2)
bin_sizes = [1500, 2500, 3500]
features_df = calculate_selected_fluctuations_parallel(iki_int_df, q_values, bin_sizes)

# Example usage
# Define your q_values and bin_sizes
q_values = np.linspace(-9, 9, 6)
bin_sizes = [1000, 1500, 2000]

# Assuming 'iki_int_df' is your DataFrame and it has columns 'id' and 'iki_integrated'
features_df = calculate_selected_fluctuations_parallel(iki_int_df, q_values, bin_sizes)

In [65]:
q_values = np.linspace(-15, 15, 3)
bin_sizes = [1500, 2500, 3500]
features_df = calculate_selected_fluctuations_parallel(iki_int_df, q_values, bin_sizes)

from m3_model_params import lgb_params_1
train_feats = features_df.merge(train_scores, on='id', how='left')
test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, train_feats.drop(columns='score'), lgb_params_1)



Final RMSE over 50: 0.836485. Std 0.7275
RMSE by fold 0.836355. Std 0.0150


In [41]:
Final RMSE over 50: 0.834386. Std 0.7259
RMSE by fold 0.834233. Std 0.0163 15

Unnamed: 0,id,Fq_q-3.0_bin1000,Fq_q-3.0_bin2000,Fq_q-3.0_bin3000,Fq_q-1.8_bin1000,Fq_q-1.8_bin2000,Fq_q-1.8_bin3000,Fq_q-0.6000000000000001_bin1000,Fq_q-0.6000000000000001_bin2000,Fq_q-0.6000000000000001_bin3000,Fq_q0.5999999999999996_bin1000,Fq_q0.5999999999999996_bin2000,Fq_q0.5999999999999996_bin3000,Fq_q1.7999999999999998_bin1000,Fq_q1.7999999999999998_bin2000,Fq_q1.7999999999999998_bin3000,Fq_q3.0_bin1000,Fq_q3.0_bin2000,Fq_q3.0_bin3000
0,001519c8,56.355006,78.499737,,60.021458,78.499737,,65.172197,78.499737,,71.394955,78.499737,,77.521710,78.499737,,82.565266,78.499737,
1,0022f953,39.595725,42.279434,,40.029338,42.279434,,40.485763,42.279434,,40.953782,42.279434,,41.420748,42.279434,,41.874347,42.279434,
2,0042269b,70.571049,80.042540,86.381326,72.710149,80.341087,86.381326,75.208774,80.644832,86.381326,77.961219,80.951113,86.381326,80.788767,81.257164,86.381326,83.500128,81.560242,86.381326
3,0059420b,115.246057,,,115.246057,,,115.246057,,,115.246057,,,115.246057,,,115.246057,,
4,0075873a,54.783656,58.847735,,55.590619,58.847735,,56.455582,58.847735,,57.350663,58.847735,,58.243012,58.847735,,59.100931,58.847735,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,39.814903,46.825043,37.037430,41.699878,49.640161,37.037430,45.234280,53.479750,37.037430,51.655668,58.032470,37.037430,60.951541,62.521191,37.037430,70.463091,66.279960,37.037430
2467,ffbef7e5,28.652337,28.708405,,28.658734,28.708405,,28.665139,28.708405,,28.671546,28.708405,,28.677953,28.708405,,28.684356,28.708405,
2468,ffccd6fd,44.422517,49.953910,61.228789,47.855130,49.953910,61.228789,52.018380,49.953910,61.228789,56.291833,49.953910,61.228789,60.011971,49.953910,61.228789,62.923722,49.953910,61.228789
2469,ffec5b38,50.571851,52.146483,55.118354,51.727472,52.146483,55.118354,52.879708,52.146483,55.118354,53.967910,52.146483,55.118354,54.946535,52.146483,55.118354,55.792777,52.146483,55.118354


In [None]:
train_feats = pd.read_pickle('feature_selection/test_feats.pkl')
test_feats = pd.read_pickle('feature_selection/test_feats.pkl')