# Libraries

In [1]:
import polars as pl
import pandas as pd
import numpy as np
import re
from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import skew, kurtosis
import warnings
warnings.filterwarnings("ignore")

In [2]:
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(data_path + 'train_logs.csv')
# train_feats   = dev_feats(train_logs)
# train_feats   = train_feats.collect().to_pandas()

# Polars FE & Helper Functions

In [3]:
num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']


def count_by_values(df, colname, values):
    fts = df.select(pl.col('id').unique(maintain_order=True))
    for i, value in enumerate(values):
        tmp_df = df.group_by('id').agg(pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt'))
        fts  = fts.join(tmp_df, on='id', how='left') 
    return fts


def dev_feats(df):
    
    print("< Count by values features >")
    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left') 
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left') 
    feats = feats.join(count_by_values(df, 'up_event', events), on='id', how='left') 

    print("< Input words stats features >")
    temp = df.filter((~pl.col('text_change').str.contains('=>')) & (pl.col('text_change') != 'NoChange'))
    temp = temp.group_by('id').agg(pl.col('text_change').str.concat('').str.extract_all(r'q+'))
    temp = temp.with_columns(input_word_count = pl.col('text_change').list.lengths(),
                             input_word_length_mean = pl.col('text_change').apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_max = pl.col('text_change').apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_std = pl.col('text_change').apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_median = pl.col('text_change').apply(lambda x: np.median([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_skew = pl.col('text_change').apply(lambda x: skew([len(i) for i in x] if len(x) > 0 else 0)))
    temp = temp.drop('text_change')
    feats = feats.join(temp, on='id', how='left') 

    print("< Numerical columns features >")
    temp = df.group_by("id").agg(pl.sum('action_time').suffix('_sum'), pl.mean(num_cols).suffix('_mean'), pl.std(num_cols).suffix('_std'),
                                 pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'), pl.max(num_cols).suffix('_max'),
                                 pl.quantile(num_cols, 0.5).suffix('_quantile'))
    feats = feats.join(temp, on='id', how='left') 

    print("< Categorical columns features >")
    temp  = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left') 

    
    print("< Idle time features >")
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.group_by("id").agg(inter_key_largest_lantency = pl.max('time_diff'),
                                   inter_key_median_lantency = pl.median('time_diff'),
                                   mean_pause_time = pl.mean('time_diff'),
                                   std_pause_time = pl.std('time_diff'),
                                   total_pause_time = pl.sum('time_diff'),
                                   pauses_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') < 1)).count(),
                                   pauses_1_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') < 1.5)).count(),
                                   pauses_1_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1.5) & (pl.col('time_diff') < 2)).count(),
                                   pauses_2_sec = pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') < 3)).count(),
                                   pauses_3_sec = pl.col('time_diff').filter(pl.col('time_diff') > 3).count(),)
    feats = feats.join(temp, on='id', how='left') 
    
    print("< P-bursts features >")
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns(pl.col('time_diff')<2)
    temp = temp.with_columns(pl.when(pl.col("time_diff") & pl.col("time_diff").is_last()).then(pl.count()).over(pl.col("time_diff").rle_id()).alias('P-bursts'))
    temp = temp.drop_nulls()
    temp = temp.group_by("id").agg(pl.mean('P-bursts').suffix('_mean'), pl.std('P-bursts').suffix('_std'), pl.count('P-bursts').suffix('_count'),
                                   pl.median('P-bursts').suffix('_median'), pl.max('P-bursts').suffix('_max'),
                                   pl.first('P-bursts').suffix('_first'), pl.last('P-bursts').suffix('_last'))
    feats = feats.join(temp, on='id', how='left') 


    print("< R-bursts features >")
    temp = df.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns(pl.col('activity').is_in(['Remove/Cut']))
    temp = temp.with_columns(pl.when(pl.col("activity") & pl.col("activity").is_last()).then(pl.count()).over(pl.col("activity").rle_id()).alias('R-bursts'))
    temp = temp.drop_nulls()
    temp = temp.group_by("id").agg(pl.mean('R-bursts').suffix('_mean'), pl.std('R-bursts').suffix('_std'), 
                                   pl.median('R-bursts').suffix('_median'), pl.max('R-bursts').suffix('_max'),
                                   pl.first('R-bursts').suffix('_first'), pl.last('R-bursts').suffix('_last'))
    feats = feats.join(temp, on='id', how='left')
    
    return feats

def train_valid_split(data_x, data_y, train_idx, valid_idx):
    x_train = data_x.iloc[train_idx]
    y_train = data_y[train_idx]
    x_valid = data_x.iloc[valid_idx]
    y_valid = data_y[valid_idx]
    return x_train, y_train, x_valid, y_valid


def evaluate(data_x, data_y, model, random_state=42, n_splits=5, test_x=None):
    skf    = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    test_y = np.zeros(len(data_x)) if (test_x is None) else np.zeros((len(test_x), n_splits))

    valid_preds = pd.DataFrame()

    for i, (train_index, valid_index) in enumerate(skf.split(data_x, data_y.astype(str))):
        train_x, train_y, valid_x, valid_y = train_valid_split(data_x, data_y, train_index, valid_index)
        model.fit(train_x, train_y)

        # valid
        valid_preds = pd.concat([valid_x[['score','id']]])
        valid_preds['preds'] = model.predict(valid_x)
    
        test_y[:, i] = model.predict(test_x)

    return valid_preds, test_y

def calculate_rmse(y, yhat):
    return mean_squared_error(y, yhat, squared=False)

# Pandas FE & Helper Functions

In [4]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)

AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

def reconstruct_essay(currTextInput):
    essayText = ""
    for Input in currTextInput.values:
        if Input[0] == 'Replace':
            replaceTxt = Input[2].split(' => ')
            essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
            continue
        if Input[0] == 'Paste':
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
            continue
        if Input[0] == 'Remove/Cut':
            essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
            continue
        if "M" in Input[0]:
            croppedTxt = Input[0][10:]
            splitTxt = croppedTxt.split(' To ')
            valueArr = [item.split(', ') for item in splitTxt]
            moveData = (int(valueArr[0][0][1:]), int(valueArr[0][1][:-1]), int(valueArr[1][0][1:]), int(valueArr[1][1][:-1]))
            if moveData[0] != moveData[2]:
                if moveData[0] < moveData[2]:
                    essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                else:
                    essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
            continue
        essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
    return essayText


def get_essay_df(df):
    df       = df[df.activity != 'Nonproduction']
    temp     = df.groupby('id').apply(lambda x: reconstruct_essay(x[['activity', 'cursor_position', 'text_change']]))
    essay_df = pd.DataFrame({'id': df['id'].unique().tolist()})
    essay_df = essay_df.merge(temp.rename('essay'), on='id')
    return essay_df


def word_feats(df):
    essay_df = df
    df['word'] = df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
    df = df.explode('word')
    df['word_len'] = df['word'].apply(lambda x: len(x))
    df = df[df['word_len'] != 0]
    word_agg_df = df[['id','word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df


def sent_feats(df):
    df['sent'] = df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
    df = df.explode('sent')
    df['sent'] = df['sent'].apply(lambda x: x.replace('\n','').strip())
    # Number of characters in sentences
    df['sent_len'] = df['sent'].apply(lambda x: len(x))
    # Number of words in sentences
    df['sent_word_count'] = df['sent'].apply(lambda x: len(x.split(' ')))
    df = df[df.sent_len!=0].reset_index(drop=True)

    sent_agg_df = pd.concat([df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), 
                             df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1)
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df


def parag_feats(df):
    df['paragraph'] = df['essay'].apply(lambda x: x.split('\n'))
    df = df.explode('paragraph')
    # Number of characters in paragraphs
    df['paragraph_len'] = df['paragraph'].apply(lambda x: len(x)) 
    # Number of words in paragraphs
    df['paragraph_word_count'] = df['paragraph'].apply(lambda x: len(x.split(' ')))
    df = df[df.paragraph_len!=0].reset_index(drop=True)
    
    paragraph_agg_df = pd.concat([df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), 
                                  df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

def product_to_keys(logs, essays):
    essays['product_len'] = essays.essay.str.len()
    tmp_df = logs[logs.activity.isin(['Input', 'Remove/Cut'])].groupby(['id']).agg({'activity': 'count'}).reset_index().rename(columns={'activity': 'keys_pressed'})
    essays = essays.merge(tmp_df, on='id', how='left')
    essays['product_to_keys'] = essays['product_len'] / essays['keys_pressed']
    return essays[['id', 'product_to_keys']]

def get_keys_pressed_per_second(logs):
    temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
    temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
    temp_df = temp_df.merge(temp_df_2, on='id', how='left')
    temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
    return temp_df[['id', 'keys_per_second']]


In [5]:
from m7_utils import preprocess_feats
from sklearn.preprocessing import StandardScaler

data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(data_path + 'train_logs.csv')
train_feats   = dev_feats(train_logs)
train_feats   = train_feats.collect().to_pandas()

print('< Essay Reconstruction >')
train_logs             = train_logs.collect().to_pandas()
train_essays           = get_essay_df(train_logs)
train_feats            = train_feats.merge(word_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(sent_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(parag_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(get_keys_pressed_per_second(train_logs), on='id', how='left')
train_feats            = train_feats.merge(product_to_keys(train_logs, train_essays), on='id', how='left')


print('< Testing Data >')
test_logs   = pl.scan_csv(data_path + 'test_logs.csv')
test_feats  = dev_feats(test_logs)
test_feats  = test_feats.collect().to_pandas()

test_logs             = test_logs.collect().to_pandas()
test_essays           = get_essay_df(test_logs)
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(get_keys_pressed_per_second(test_logs), on='id', how='left')
test_feats            = test_feats.merge(product_to_keys(test_logs, test_essays), on='id', how='left')

< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >
< Essay Reconstruction >
< Testing Data >
< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >


In [6]:
# FEATS_DIR = 'silver_bullet'
# 
# train_feats.to_pickle(f'{FEATS_DIR}/train_feats.pkl')
# test_feats.to_pickle(f'{FEATS_DIR}/test_feats.pkl')

In [7]:
FEATS_DIR = 'silver_bullet'
 
train_feats= pd.read_pickle(f'{FEATS_DIR}/train_feats.pkl')
test_feats = pd.read_pickle(f'{FEATS_DIR}/test_feats.pkl')

In [8]:
""" def word_change_stats(train_logs, test_logs):

    data = []
    for logs in [train_logs, test_logs]:

        logs = logs.with_columns(
            pl.col('word_count').diff().over('id').fill_nan(0).alias('wc_diff')
        )

        pref = 'wc_diff'

        x = logs.group_by('id').agg([
            pl.col('wc_diff').mean().alias(f'{pref}_mean'),
            pl.col('wc_diff').std().alias(f'{pref}_std'),
            pl.col('wc_diff').kurtosis().alias(f'{pref}_kurt'),
            pl.col('wc_diff').skew().alias(f'{pref}_skew')
        ])
        data.append(x)

    return data[0].collect().to_pandas(), data[1].collect().to_pandas()

train_logs   = pl.scan_csv(data_path + 'train_logs.csv')
test_logs   = pl.scan_csv(data_path + 'test_logs.csv')

tr_feats, ts_feats = word_change_stats(train_logs, test_logs)


train_feats = train_feats.merge(tr_feats, on='id', how='left')
test_feats = test_feats.merge(ts_feats, on='id', how='left') """

In [9]:
""" from sklearn.feature_extraction.text import CountVectorizer
from m4_feats_functions import getEssays

def countvectorize_one_one(train_logs, test_logs):

    data = []

    for logs in [train_logs, test_logs]:

        ids = logs.id.unique()
        essays = getEssays(logs)
        c_vect = CountVectorizer(ngram_range=(1, 1))
        toks = c_vect.fit_transform(essays['essay']).todense()
        toks = toks[:,:16]
        toks_df = pd.DataFrame(columns = [f'tok_{i}' for i in range(toks.shape[1])], data=toks)
        toks_df['id'] = ids
        toks_df.reset_index(drop=True, inplace=True)
        data.append(toks_df)

    return data[0], data[1]

test_logs   = pl.scan_csv(data_path + 'test_logs.csv').collect().to_pandas()
tr_feats, ts_feats = countvectorize_one_one(train_logs, test_logs)

train_feats = train_feats.merge(tr_feats, on='id', how='left')
test_feats = test_feats.merge(ts_feats, on='id', how='left')
missing_cols = set(tr_feats.columns) - set(ts_feats.columns)

for col in missing_cols:
    test_feats[col] = np.nan """

" from sklearn.feature_extraction.text import CountVectorizer\nfrom m4_feats_functions import getEssays\n\ndef countvectorize_one_one(train_logs, test_logs):\n\n    data = []\n\n    for logs in [train_logs, test_logs]:\n\n        ids = logs.id.unique()\n        essays = getEssays(logs)\n        c_vect = CountVectorizer(ngram_range=(1, 1))\n        toks = c_vect.fit_transform(essays['essay']).todense()\n        toks = toks[:,:16]\n        toks_df = pd.DataFrame(columns = [f'tok_{i}' for i in range(toks.shape[1])], data=toks)\n        toks_df['id'] = ids\n        toks_df.reset_index(drop=True, inplace=True)\n        data.append(toks_df)\n\n    return data[0], data[1]\n\ntest_logs   = pl.scan_csv(data_path + 'test_logs.csv').collect().to_pandas()\ntr_feats, ts_feats = countvectorize_one_one(train_logs, test_logs)\n\ntrain_feats = train_feats.merge(tr_feats, on='id', how='left')\ntest_feats = test_feats.merge(ts_feats, on='id', how='left')\nmissing_cols = set(tr_feats.columns) - set(ts_fea

In [10]:
""" # WORD STATS ON ESSAY
def diverse_stats(series, prefix):
    if isinstance(series, list):
        series = pd.Series([len(item) for item in series])

    stats = {
        # f'{prefix}_count': series.count(),
        f'{prefix}_mean': series.mean(),
        f'{prefix}_std': series.std(),
        f'{prefix}_max': series.max(),
        f'{prefix}_median': series.median(),
        # f'{prefix}_sum': series.sum(),
        # f'{prefix}_last': series.iloc[-1] if not series.empty else None,
        # f'{prefix}_q1': series.quantile(0.25),
        # f'{prefix}_q3': series.quantile(0.75),
        # f'{prefix}_iqr': series.quantile(0.75) - series.quantile(0.25),
        # f'{prefix}_min': series.min(),
        # f'{prefix}_first': series.iloc[0] if not series.empty else None,
        # f'{prefix}_sem': series.sem(),
        f'{prefix}_skew': series.skew(),
        f'{prefix}_kurt': series.kurtosis(),
        # f'{prefix}_range': series.max() - series.min(),
    }
    return pd.Series(stats)


def process_feats_action_time_gap(train_logs, test_logs):
    def calc_action_time_gap(comb_df):
        action_time_gap_df = comb_df.copy()
        action_time_gap_df['up_time_shift1'] = action_time_gap_df.groupby('id')['up_time'].shift(1)
        action_time_gap_df['action_time_gap'] = action_time_gap_df['down_time'] - action_time_gap_df['up_time_shift1']

        grouped = action_time_gap_df.groupby('id')['action_time_gap']
        return grouped.apply(lambda x: diverse_stats(x, 'action_time_gap')).reset_index()

    train_action_time_gap_feats = calc_action_time_gap(train_logs).pivot(index='id', columns='level_1', values='action_time_gap').reset_index()
    test_action_time_gap_feats = calc_action_time_gap(test_logs).pivot(index='id', columns='level_1', values='action_time_gap').reset_index()

    return train_action_time_gap_feats, test_action_time_gap_feats

test_logs   = pl.scan_csv(data_path + 'test_logs.csv').collect().to_pandas()
tr_feats, ts_feats = process_feats_action_time_gap(train_logs, test_logs)

train_feats = train_feats.merge(tr_feats, on='id', how='left')
test_feats = test_feats.merge(ts_feats, on='id', how='left')
missing_cols = set(tr_feats.columns) - set(ts_feats.columns)

for col in missing_cols:
    test_feats[col] = np.nan """

AttributeError: 'LazyFrame' object has no attribute 'copy'

In [12]:
from m5_sb_models import lgb_pipeline
import pandas as pd

FEATS_DIR = 'silver_bullet'
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'

train_feats= pd.read_pickle(f'{FEATS_DIR}/train_feats.pkl')
test_feats = pd.read_pickle(f'{FEATS_DIR}/test_feats.pkl')

print('< Mapping >')
train_scores   = pd.read_csv(data_path + 'train_scores.csv')
train = train_feats.merge(train_scores, on='id', how='left')
test = test_feats.copy()

test_preds, valid_preds, final_rmse, cv_rmse  = lgb_pipeline(train, test)

< Mapping >
Final RMSE over 50: 0.616633.
RMSE by fold 0.616477


In [34]:
def calculate_rmse(y, yhat):
    return mean_squared_error([y], [yhat], squared=False)

In [36]:
valid_preds['rmse'] = valid_preds.apply(lambda row : calculate_rmse(row['score'], row['preds']), axis=1)

In [66]:
valid_preds[valid_preds['score']==0.5].sort_values('rmse')

Unnamed: 0,id,score,preds,iteration,rmse
469,315bdafd,0.5,1.627277,7,1.127277
469,315bdafd,0.5,1.739659,7,1.239659
1848,c3663a2d,0.5,1.74985,8,1.24985
469,315bdafd,0.5,1.776693,10,1.276693
1848,c3663a2d,0.5,1.798456,8,1.298456
302,1ebb9b74,0.5,1.841998,9,1.341998
606,40b28508,0.5,1.84491,9,1.34491
606,40b28508,0.5,1.849344,1,1.349344
606,40b28508,0.5,1.881667,8,1.381667
469,315bdafd,0.5,1.884797,1,1.384797


In [49]:
x = valid_preds.groupby(['id', 'score'])['preds'].mean().reset_index(drop=False)

In [None]:
valid_preds.apply(lambda row : calculate_rmse(row['score'], row['preds']), axis=1)

In [53]:
x['rmse'] = x.apply(lambda row: calculate_rmse(row['score'], row['preds']), axis=1)

In [55]:
x.sort_values(by='rmse', ascending=False).head(35)

Unnamed: 0,id,score,preds,rmse
1005,69916fc0,1.0,3.671835,2.671835
2232,e86a132d,1.5,4.066565,2.566565
1614,aac5ac07,1.0,3.566223,2.566223
998,68df1430,1.0,3.55508,2.55508
1513,a04a32c3,1.5,3.992352,2.492352
497,3402f8b4,2.0,4.331402,2.331402
1747,b73648cf,6.0,3.77139,2.22861
306,1fbedb17,2.5,4.728096,2.228096
378,2717fdef,1.0,3.122466,2.122466
453,2f935a5c,2.5,4.621413,2.121413


In [64]:
train_feats[train_feats['id']=='69916fc0']

Unnamed: 0,id,activity_0_cnt,activity_1_cnt,activity_2_cnt,activity_3_cnt,activity_4_cnt,text_change_0_cnt,text_change_1_cnt,text_change_2_cnt,text_change_3_cnt,...,paragraph_word_count_min,paragraph_word_count_max,paragraph_word_count_first,paragraph_word_count_last,paragraph_word_count_q1,paragraph_word_count_median,paragraph_word_count_q3,paragraph_word_count_sum,keys_per_second,product_to_keys
1005,69916fc0,2063,301,503,0,0,1923,389,14,18,...,59,98,71,98,68.0,74.0,82.25,305,1.560324,0.745347


In [68]:
train_logs.filter(pl.col('id')=='b73648cf').collect().to_pandas().to_csv('b73648cf.csv')

In [45]:
valid_preds.sort_values(by='rmse', ascending=False).head(35)

Unnamed: 0,id,score,preds,iteration,rmse
1005,69916fc0,1.0,3.748508,4,2.748508
998,68df1430,1.0,3.696622,7,2.696622
1005,69916fc0,1.0,3.696166,4,2.696166
1005,69916fc0,1.0,3.672941,3,2.672941
1005,69916fc0,1.0,3.662965,10,2.662965
2232,e86a132d,1.5,4.14822,2,2.64822
1614,aac5ac07,1.0,3.64169,5,2.64169
2232,e86a132d,1.5,4.136043,7,2.636043
306,1fbedb17,2.5,5.118027,7,2.618027
2232,e86a132d,1.5,4.105837,7,2.605837


In [None]:
# tr_ids = data.id
# ts_ids = test_feats.id
# 
# tr_ts_feats = pd.concat([data, test_feats])
# tr_ts_feats.iloc[:,1:-1] = preprocess_feats(tr_ts_feats.iloc[:,1:-1], StandardScaler())
# 
# data = tr_ts_feats[tr_ts_feats['id'].isin(tr_ids)]
# test_feats = tr_ts_feats[tr_ts_feats['id'].isin(ts_ids)]

print('< Mapping >')
train_scores   = pd.read_csv(data_path + 'train_scores.csv')
data           = train_feats.merge(train_scores, on='id', how='left')
test_ids        = test_feats['id'].values
testin_x        = test_feats.drop(['id'], axis=1)
x               = data.drop(['id', 'score'], axis=1)
y               = data['score'].values
print(f'Number of features: {len(x.columns)}')

< Mapping >
Number of features: 165


In [14]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
print('< Learning and Evaluation >')
param = {'n_estimators': 1024,
         'learning_rate': 0.005,
         'metric': 'rmse',
         'force_col_wise': True,
         'verbosity': 0,}


data_x = x.copy()
data_y = y.copy()
test_x=testin_x.copy()
n_splits = 5
iterations = 10
valid_preds = pd.DataFrame(columns = ['id', 'score', 'preds', 'iteration'])
test_preds = []

for iter in range(iterations):
    skf    = StratifiedKFold(n_splits=n_splits, random_state=42+iter, shuffle=True)
    model = LGBMRegressor(**param, random_state=42+iter)
    for i, (train_index, valid_index) in enumerate(skf.split(data_x, data_y.astype(str))):
        train_x, train_y, valid_x, valid_y = train_valid_split(data_x, data_y, train_index, valid_index)
        
        model.fit(train_x, train_y)
        valid_predictions = model.predict(valid_x)
        test_predictions = model.predict(test_x)
        test_preds.append(test_predictions)

        tmp_df = data.loc[valid_index][['id','score']]
        tmp_df['preds'] = valid_predictions
        tmp_df['iteration'] = i + 1
        valid_preds = pd.concat([valid_preds, tmp_df])

    final_rmse = mean_squared_error(valid_preds['score'], valid_preds['preds'], squared=False)
    cv_rmse = valid_preds.groupby(['iteration']).apply(lambda g: calculate_rmse(g['score'], g['preds']))

print(f'Final RMSE over {n_splits * iterations}: {final_rmse:.6f}.')
print(f'RMSE by fold {np.mean(cv_rmse):.6f}')

< Learning and Evaluation >
Final RMSE over 50: 0.620546.
RMSE by fold 0.620478


adding wc_change (proper)


< Learning and Evaluation > adding vectoriz
Final RMSE 0.615059.
RMSE by fold 0.614998

5 iterations baseline
Final RMSE 0.619785.
RMSE by fold 0.619458

< Learning and Evaluation > adding at_by_activity
Final RMSE 0.621999.
RMSE by fold 0.621681

In [101]:
data_x = x.copy()
data_y = y.copy()
test_x=testin_x.copy()
n_splits = 10
model = LGBMRegressor(**param)

skf    = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
test_y = np.zeros(len(data_x)) if (test_x is None) else np.zeros((len(test_x), n_splits))
test_preds = []

valid_preds = pd.DataFrame(columns = ['id', 'score', 'preds', 'iteration'])

for i, (train_index, valid_index) in enumerate(skf.split(data_x, data_y.astype(str))):
    train_x, train_y, valid_x, valid_y = train_valid_split(data_x, data_y, train_index, valid_index)
    
    model.fit(train_x, train_y)
    valid_predictions = model.predict(valid_x)
    test_predictions = model.predict(test_x)
    test_preds.append(test_predictions)

    tmp_df = data.loc[valid_index][['id','score']]
    tmp_df['preds'] = valid_predictions
    tmp_df['iteration'] = i + 1
    valid_preds = pd.concat([valid_preds, tmp_df])

final_rmse = mean_squared_error(valid_preds['score'], valid_preds['preds'], squared=False)
cv_rmse = valid_preds.groupby(['iteration']).apply(lambda g: calculate_rmse(g['score'], g['preds']))
print(f'Final RMSE {final_rmse:.6f}.')
print(f'RMSE by fold {np.mean(cv_rmse):.6f}')

Final RMSE 0.612482.
RMSE by fold 0.611805


In [100]:
cv_rmse

iteration
1     0.595718
2     0.601724
3     0.597549
4     0.607840
5     0.575182
6     0.576292
7     0.612651
8     0.636930
9     0.641946
10    0.672214
dtype: float64

In [98]:
valid_preds.iteration.unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=object)

In [97]:
def calculate_rmse(y, yhat):
    return mean_squared_error(y, yhat, squared=False)

valid_preds.groupby(['iteration']).apply(lambda g: calculate_rmse(g['score'], g['preds']))

iteration
1     0.595718
2     0.601724
3     0.597549
4     0.607840
5     0.575182
6     0.576292
7     0.612651
8     0.636930
9     0.641946
10    0.672214
dtype: float64

In [90]:
mean_squared_error(valid_preds['score'], valid_preds['preds'], squared=False)

0.6124817170301459

In [76]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(valid_preds['score'], valid_preds['preds'], squared=False))

0.7826121114767812

# Solution

In [4]:
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(data_path + 'train_logs.csv')
train_feats   = dev_feats(train_logs)
train_feats   = train_feats.collect().to_pandas()

print('< Essay Reconstruction >')
train_logs             = train_logs.collect().to_pandas()
train_essays           = get_essay_df(train_logs)
train_feats            = train_feats.merge(word_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(sent_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(parag_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(get_keys_pressed_per_second(train_logs), on='id', how='left')
train_feats            = train_feats.merge(product_to_keys(train_logs, train_essays), on='id', how='left')


print('< Mapping >')
train_scores   = pd.read_csv(data_path + 'train_scores.csv')
data           = train_feats.merge(train_scores, on='id', how='left')
x              = data.drop(['id', 'score'], axis=1)
y              = data['score'].values
print(f'Number of features: {len(x.columns)}')


print('< Testing Data >')
test_logs   = pl.scan_csv(data_path + 'test_logs.csv')
test_feats  = dev_feats(test_logs)
test_feats  = test_feats.collect().to_pandas()

test_logs             = test_logs.collect().to_pandas()
test_essays           = get_essay_df(test_logs)
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(get_keys_pressed_per_second(test_logs), on='id', how='left')
test_feats            = test_feats.merge(product_to_keys(test_logs, test_essays), on='id', how='left')


test_ids = test_feats['id'].values
testin_x = test_feats.drop(['id'], axis=1)

print('< Learning and Evaluation >')
param = {'n_estimators': 1024,
         'learning_rate': 0.005,
         'metric': 'rmse',
         'random_state': 42,
         'force_col_wise': True,
         'verbosity': 0,}
solution = LGBMRegressor(**param)
y_pred   = evaluate(x.copy(), y.copy(), solution, test_x=testin_x.copy()) 

sub = pd.DataFrame({'id': test_ids, 'score': y_pred})
sub.to_csv('submission.csv', index=False)

< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >
< Essay Reconstruction >
< Mapping >
Number of features: 165
< Testing Data >
< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >
< Learning and Evaluation >
