Agenda
1. preparation
2. Model

In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import gc
import re
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)
from tqdm.auto import tqdm
tqdm.pandas()

from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# 1-1. load data

In [2]:
train_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
display(train_logs)
train_scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
display(train_scores)
test_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
display(test_logs)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240


Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0
...,...,...
2466,ffb8c745,3.5
2467,ffbef7e5,4.0
2468,ffccd6fd,1.5
2469,ffec5b38,5.0


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0
5,4444cccc,2,184996,185052,56,Input,q,q,q,1,1


## 1-2. helper functions

### 1-2-1. const definition

In [3]:
# variables category
exp_key_column = "id"
obj_column = "score"

In [4]:
# count_elements
target_activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
target_events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
          'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
target_text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
target_punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']

In [5]:
def element_counts(df: pd.DataFrame, key_colname: str, count_colname: str, count_elements: list, match: str):
    """
    This function counts each elements
    Parameters
    ----------
    df: DataFrame
    key_colname: aggregator
    count_colname: count columns
    count_elements: count value in this list
    match: exact or partital
        
    Returns
    -------
    d : pd.DataFrame
    """
    print(f"======================element_counts start({count_colname})!!======================")
    tmp_df = df.groupby(key_colname).agg({count_colname: list}).reset_index()
    display(tmp_df)
    ret = list()
    if match == "exact":
        for li in tqdm(tmp_df[count_colname].values):
            items = list(Counter(li).items())
            di = dict()
            for k in count_elements:
                di[k] = 0
            for item in items:
                k,v = item[0], item[1]
                if k in di: # this part counts disgnated part
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        display(ret)
        cols = [f"{count_colname}_{i}_count" for i in range(len(ret.columns))]
        ret.columns = cols
    elif match == "partitial":
        for li in tqdm(tmp_df['down_event'].values):
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in count_elements:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        
    return ret
    
def get_input_words(df: pd.DataFrame):
    print("======================get_input_words start!!======================")
    tmp_df = df[(~df["text_change"].str.contains("=>"))&(df["text_change"] != "Nochange")].reset_index(drop= True)
    tmp_df = tmp_df.groupby(exp_key_column).agg({"text_change": list}).reset_index()
    # concat part
    tmp_df["text_change"] = tmp_df["text_change"].apply(lambda x: "".join(x))
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
    
    tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
    display(tmp_df[["id","text_change"]])
    
    # calc part
    tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df.drop(['text_change'], axis=1, inplace=True)
    return tmp_df

## 1-3. make features

In [6]:
def q1(x):
    return x.quantile(0.25)
def q2(x):
    return x.quantile(0.5)
def q3(x):
    return x.quantile(0.75)

In [7]:
def make_feats(df: pd.DataFrame):
    print("======================make_feats start!!======================")
    feats = pd.DataFrame({exp_key_column: df[exp_key_column].unique().tolist()})
    # time shift
    target_col = "up_time"
    df[f'{target_col}_shift1'] = df.groupby(exp_key_column)[target_col].shift(1) #(going under ↓)
    df['action_time_gap'] = df['down_time'] - df[f'{target_col}_shift1']
    df.drop(f'{target_col}_shift1', axis=1, inplace=True)
    
    # cursor position shift
    target_col = "cursor_position"
    df[f'{target_col}_shift1'] = df.groupby(exp_key_column)[target_col].shift(1)
    df[f'{target_col}_change'] = np.abs(df[target_col] - df[f'{target_col}_shift1'])
    df.drop(f'{target_col}_shift1', axis=1, inplace=True)
    
    # word count shift
    target_col = "word_count"
    df[f'{target_col}_shift1'] = df.groupby(exp_key_column)[target_col].shift(1)
    df[f'{target_col}_change'] = np.abs(df[target_col] - df[f'{target_col}_shift1'])
    df.drop(f'{target_col}_shift1', axis=1, inplace=True)
    
    for item in tqdm([
        ('event_id', ['max']),
        ('up_time', ['max']),
        ('action_time', ['sum', 'max', 'mean', 'std']),
        ('activity', ['nunique']),
        ('down_event', ['nunique']),
        ('up_event', ['nunique']),
        ('text_change', ['nunique']),
        ('cursor_position', ['nunique', 'max', 'mean']),
        ('word_count', ['nunique', 'max', 'mean', q1, q2, q3]),
        ('action_time_gap', ['max', 'min', 'mean', 'std', 'sum']),
        ('cursor_position_change', ['max', 'mean', 'std', 'sum']),
        ('word_count_change', ['max', 'mean', 'std', 'sum'])
    ]):
        colname, methods = item[0], item[1]
        for method in methods:
            print(f"calc: {method} of {colname}")
            if type(method) == str: 
                tmp_df = df.groupby([exp_key_column]).agg({colname: method}).reset_index().rename(columns = {colname: f"{colname}_{method}"})
                feats = feats.merge(tmp_df, on = exp_key_column, how= "left")
            else:
                tmp_df = df.groupby([exp_key_column]).agg({colname: method}).reset_index().rename(columns = {colname: f"{colname}_{method.__name__}"})
                feats = feats.merge(tmp_df, on = exp_key_column, how= "left")
    
    # counts
#     element_counts(df: pd.DataFrame, key_colname = exp_key_column: str, count_colname: str, count_elements: list, match: str):
    # tmp_df = activity_counts(df)
    tmp_df = element_counts(df, exp_key_column, "activity", target_activities, "exact")
    feats = pd.concat([feats, tmp_df], axis=1)
    
#     tmp_df = event_counts(df, 'down_event')
    tmp_df = element_counts(df, exp_key_column, "down_event", target_events, "exact")
    feats = pd.concat([feats, tmp_df], axis=1)
    
#     tmp_df = event_counts(df, 'up_event')
    tmp_df = element_counts(df, exp_key_column, "up_event", target_events, "exact")
    feats = pd.concat([feats, tmp_df], axis=1)
    
#     tmp_df = text_change_counts(df)
    tmp_df = element_counts(df, exp_key_column, "text_change", target_text_changes, "exact")
    feats = pd.concat([feats, tmp_df], axis=1)
    
#     tmp_df = match_punctuations(df)
    tmp_df = element_counts(df, exp_key_column, "down_event", target_punctuations, "partitial")
    feats = pd.concat([feats, tmp_df], axis=1)
    
    
    # input words
    tmp_df = get_input_words(df)
    feats = pd.merge(feats, tmp_df, on='id', how='left')
    
    # compare feats
    feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
    feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
    feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
    feats['idle_time_ratio'] = feats['action_time_gap_sum'] / feats['up_time_max']
    
    return feats
    

In [8]:
train_feats = make_feats(train_logs)



  0%|          | 0/12 [00:00<?, ?it/s]

calc: max of event_id
calc: max of up_time
calc: sum of action_time
calc: max of action_time
calc: mean of action_time
calc: std of action_time
calc: nunique of activity
calc: nunique of down_event
calc: nunique of up_event
calc: nunique of text_change
calc: nunique of cursor_position
calc: max of cursor_position
calc: mean of cursor_position
calc: nunique of word_count
calc: max of word_count
calc: mean of word_count
calc: <function q1 at 0x7d11a2f3d240> of word_count
calc: <function q2 at 0x7d11a2f3d2d0> of word_count
calc: <function q3 at 0x7d11a2f3d360> of word_count
calc: max of action_time_gap
calc: min of action_time_gap
calc: mean of action_time_gap
calc: std of action_time_gap
calc: sum of action_time_gap
calc: max of cursor_position_change
calc: mean of cursor_position_change
calc: std of cursor_position_change
calc: sum of cursor_position_change
calc: max of word_count_change
calc: mean of word_count_change
calc: std of word_count_change
calc: sum of word_count_change


Unnamed: 0,id,activity
0,001519c8,"[Nonproduction, Nonproduction, Nonproduction, ..."
1,0022f953,"[Nonproduction, Nonproduction, Input, Input, I..."
2,0042269b,"[Nonproduction, Nonproduction, Input, Input, I..."
3,0059420b,"[Nonproduction, Nonproduction, Nonproduction, ..."
4,0075873a,"[Nonproduction, Nonproduction, Input, Input, I..."
...,...,...
2466,ffb8c745,"[Nonproduction, Nonproduction, Nonproduction, ..."
2467,ffbef7e5,"[Nonproduction, Nonproduction, Nonproduction, ..."
2468,ffccd6fd,"[Nonproduction, Nonproduction, Input, Input, I..."
2469,ffec5b38,"[Nonproduction, Nonproduction, Input, Input, I..."


  0%|          | 0/2471 [00:00<?, ?it/s]

Unnamed: 0,Input,Remove/Cut,Nonproduction,Replace,Paste
0,2010,417,120,7,0
1,1938,260,254,1,1
2,3515,439,175,7,0
3,1304,151,99,1,1
4,1942,517,72,0,0
...,...,...,...,...,...
2466,3588,960,189,2,0
2467,2395,60,148,1,0
2468,2849,88,126,0,0
2469,2895,276,71,0,0




Unnamed: 0,id,down_event
0,001519c8,"[Leftclick, Leftclick, Shift, q, q, q, q, q, q..."
1,0022f953,"[Leftclick, Shift, q, q, q, q, Space, q, q, Sp..."
2,0042269b,"[Leftclick, Shift, q, q, q, q, q, q, q, Space,..."
3,0059420b,"[Leftclick, Leftclick, Shift, Shift, Shift, Sh..."
4,0075873a,"[Leftclick, Shift, q, q, q, q, q, q, q, q, q, ..."
...,...,...
2466,ffb8c745,"[Leftclick, Tab, Leftclick, Space, Space, Spac..."
2467,ffbef7e5,"[Leftclick, Leftclick, Shift, q, q, q, q, Spac..."
2468,ffccd6fd,"[Leftclick, Leftclick, q, q, q, q, q, q, Space..."
2469,ffec5b38,"[Leftclick, Shift, q, q, q, q, q, q, q, q, Bac..."


  0%|          | 0/2471 [00:00<?, ?it/s]

Unnamed: 0,q,Space,Backspace,Shift,ArrowRight,Leftclick,ArrowLeft,.,",",ArrowDown,ArrowUp,Enter,CapsLock,',Delete,Unidentified
0,1619,357,417,27,2,92,2,21,12,0,0,4,0,3,0,0
1,1490,391,260,97,46,56,49,15,21,3,2,6,0,3,0,0
2,2904,552,439,39,6,129,0,21,23,0,0,17,0,0,0,0
3,1038,243,152,68,0,18,0,13,3,0,0,3,2,2,0,0
4,1541,324,517,39,0,33,0,23,24,0,0,10,0,17,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,2844,651,960,164,0,24,0,43,32,0,0,7,0,7,0,0
2467,1874,447,60,106,0,36,0,31,24,0,0,12,0,8,0,0
2468,969,1861,88,0,29,9,53,5,2,29,0,12,6,0,0,0
2469,2361,457,276,52,0,14,0,31,27,0,0,6,2,4,0,0




Unnamed: 0,id,up_event
0,001519c8,"[Leftclick, Leftclick, Shift, q, q, q, q, q, q..."
1,0022f953,"[Leftclick, Shift, q, q, q, q, Space, q, q, Sp..."
2,0042269b,"[Leftclick, Shift, q, q, q, q, q, q, q, Space,..."
3,0059420b,"[Leftclick, Leftclick, Shift, Shift, Shift, Sh..."
4,0075873a,"[Leftclick, Shift, q, q, q, q, q, q, q, q, q, ..."
...,...,...
2466,ffb8c745,"[Leftclick, Tab, Leftclick, Space, Space, Spac..."
2467,ffbef7e5,"[Leftclick, Leftclick, Shift, q, q, q, q, Spac..."
2468,ffccd6fd,"[Leftclick, Leftclick, q, q, q, q, q, q, Space..."
2469,ffec5b38,"[Leftclick, Shift, q, q, q, q, q, q, q, q, Bac..."


  0%|          | 0/2471 [00:00<?, ?it/s]

Unnamed: 0,q,Space,Backspace,Shift,ArrowRight,Leftclick,ArrowLeft,.,",",ArrowDown,ArrowUp,Enter,CapsLock,',Delete,Unidentified
0,1619,357,417,27,2,92,2,21,12,0,0,4,0,3,0,0
1,1490,391,260,97,46,56,49,15,21,3,2,6,0,3,0,0
2,2899,552,439,39,6,129,0,21,23,0,0,17,0,0,0,0
3,1038,243,152,68,0,18,0,13,3,0,0,3,2,2,0,0
4,1541,324,517,39,0,33,0,23,24,0,0,10,0,17,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,2844,651,960,164,0,24,0,43,32,0,0,7,0,7,0,0
2467,1874,447,60,106,0,36,0,31,24,0,0,12,0,8,0,0
2468,969,1861,88,0,29,9,53,5,2,29,0,12,6,0,0,0
2469,2361,457,276,52,0,14,0,31,27,0,0,6,2,4,0,0




Unnamed: 0,id,text_change
0,001519c8,"[NoChange, NoChange, NoChange, q, q, q, q, q, ..."
1,0022f953,"[NoChange, NoChange, q, q, q, q, , q, q, , N..."
2,0042269b,"[NoChange, NoChange, q, q, q, q, q, q, q, , q..."
3,0059420b,"[NoChange, NoChange, NoChange, NoChange, NoCha..."
4,0075873a,"[NoChange, NoChange, q, q, q, q, q, q, q, q, q..."
...,...,...
2466,ffb8c745,"[NoChange, NoChange, NoChange, , , , , , ..."
2467,ffbef7e5,"[NoChange, NoChange, NoChange, q, q, q, q, , ..."
2468,ffccd6fd,"[NoChange, NoChange, q, q, q, q, q, q, , q, q..."
2469,ffec5b38,"[NoChange, NoChange, q, q, q, q, q, q, q, q, q..."


  0%|          | 0/2471 [00:00<?, ?it/s]

Unnamed: 0,q,Unnamed: 2,NoChange,.,",",\n,',"""",-,?,;,=,/,\,:
0,1940,436,120,28,14,4,5,0,0,0,1,0,0,0,0
1,1698,432,254,18,24,7,4,6,6,3,0,0,0,0,0
2,3257,615,175,23,26,23,0,2,1,0,0,4,0,0,0
3,1146,281,99,13,3,4,3,0,0,0,0,5,0,0,0
4,1964,397,72,32,25,12,25,0,0,2,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,3605,813,189,59,42,11,8,6,1,0,0,0,0,0,2
2467,1920,457,148,33,24,12,9,0,0,0,0,0,0,0,0
2468,1031,1879,126,6,3,18,0,0,0,0,0,0,0,0,0
2469,2593,490,71,34,29,8,4,6,1,0,2,0,0,0,0




Unnamed: 0,id,down_event
0,001519c8,"[Leftclick, Leftclick, Shift, q, q, q, q, q, q..."
1,0022f953,"[Leftclick, Shift, q, q, q, q, Space, q, q, Sp..."
2,0042269b,"[Leftclick, Shift, q, q, q, q, q, q, q, Space,..."
3,0059420b,"[Leftclick, Leftclick, Shift, Shift, Shift, Sh..."
4,0075873a,"[Leftclick, Shift, q, q, q, q, q, q, q, q, q, ..."
...,...,...
2466,ffb8c745,"[Leftclick, Tab, Leftclick, Space, Space, Spac..."
2467,ffbef7e5,"[Leftclick, Leftclick, Shift, q, q, q, q, Spac..."
2468,ffccd6fd,"[Leftclick, Leftclick, q, q, q, q, q, q, Space..."
2469,ffec5b38,"[Leftclick, Shift, q, q, q, q, q, q, q, q, Bac..."


  0%|          | 0/2471 [00:00<?, ?it/s]



Unnamed: 0,id,text_change
0,001519c8,"[qqqqqq, qqq, qqqqqqq, qqqqqq, qq, qqqq, qqqqq..."
1,0022f953,"[qqqq, qq, qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq, ..."
2,0042269b,"[qqqqqqq, qqq, qqqq, qqqq, qqqq, qq, qqqqqqqq,..."
3,0059420b,"[qqqq, qq, qqqqqqq, qqqqqq, q, q, qqq, qqqqqqq..."
4,0075873a,"[qqqqqqqqqqq, qq, q, qq, qqqqq, qq, qqqqqqqqqq..."
...,...,...
2466,ffb8c745,"[qq, qqqqq, q, qqqqqqq, qqq, q, qqqqq, q, qqqq..."
2467,ffbef7e5,"[qqqq, qqqqqq, qqqqq, qq, qqqqq, qqqqq, qq, qq..."
2468,ffccd6fd,"[qqqqqq, qqqq, q, qqqqqqq, qqqqqqqq, q, qq, qq..."
2469,ffec5b38,"[qqqqqqqqqqqq, qqqqqqq, qqqqqq, qqqq, qqqqq, q..."


In [9]:
test_feats = make_feats(test_logs)



  0%|          | 0/12 [00:00<?, ?it/s]

calc: max of event_id
calc: max of up_time
calc: sum of action_time
calc: max of action_time
calc: mean of action_time
calc: std of action_time
calc: nunique of activity
calc: nunique of down_event
calc: nunique of up_event
calc: nunique of text_change
calc: nunique of cursor_position
calc: max of cursor_position
calc: mean of cursor_position
calc: nunique of word_count
calc: max of word_count
calc: mean of word_count
calc: <function q1 at 0x7d11a2f3d240> of word_count
calc: <function q2 at 0x7d11a2f3d2d0> of word_count
calc: <function q3 at 0x7d11a2f3d360> of word_count
calc: max of action_time_gap
calc: min of action_time_gap
calc: mean of action_time_gap
calc: std of action_time_gap
calc: sum of action_time_gap
calc: max of cursor_position_change
calc: mean of cursor_position_change
calc: std of cursor_position_change
calc: sum of cursor_position_change
calc: max of word_count_change
calc: mean of word_count_change
calc: std of word_count_change
calc: sum of word_count_change


Unnamed: 0,id,activity
0,0000aaaa,"[Input, Input]"
1,2222bbbb,"[Input, Input]"
2,4444cccc,"[Input, Input]"


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Input,Remove/Cut,Nonproduction,Replace,Paste
0,2,0,0,0,0
1,2,0,0,0,0
2,2,0,0,0,0




Unnamed: 0,id,down_event
0,0000aaaa,"[Space, Space]"
1,2222bbbb,"[q, q]"
2,4444cccc,"[Space, q]"


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,q,Space,Backspace,Shift,ArrowRight,Leftclick,ArrowLeft,.,",",ArrowDown,ArrowUp,Enter,CapsLock,',Delete,Unidentified
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0




Unnamed: 0,id,up_event
0,0000aaaa,"[Space, Space]"
1,2222bbbb,"[q, q]"
2,4444cccc,"[Space, q]"


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,q,Space,Backspace,Shift,ArrowRight,Leftclick,ArrowLeft,.,",",ArrowDown,ArrowUp,Enter,CapsLock,',Delete,Unidentified
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0




Unnamed: 0,id,text_change
0,0000aaaa,"[ , ]"
1,2222bbbb,"[q, q]"
2,4444cccc,"[ , q]"


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,q,Unnamed: 2,NoChange,.,",",\n,',"""",-,?,;,=,/,\,:
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0




Unnamed: 0,id,down_event
0,0000aaaa,"[Space, Space]"
1,2222bbbb,"[q, q]"
2,4444cccc,"[Space, q]"


  0%|          | 0/3 [00:00<?, ?it/s]



Unnamed: 0,id,text_change
0,0000aaaa,[]
1,2222bbbb,[qq]
2,4444cccc,[q]


In [10]:
train_feats = train_feats.merge(train_scores, on='id', how='left') # join object variance

In [11]:
display(train_feats)
display(test_feats)

Unnamed: 0,id,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,cursor_position_nunique,cursor_position_max,cursor_position_mean,word_count_nunique,word_count_max,word_count_mean,word_count_q1,word_count_q2,word_count_q3,action_time_gap_max,action_time_gap_min,action_time_gap_mean,action_time_gap_std,action_time_gap_sum,cursor_position_change_max,cursor_position_change_mean,cursor_position_change_std,cursor_position_change_sum,word_count_change_max,word_count_change_mean,word_count_change_std,word_count_change_sum,activity_0_count,activity_1_count,activity_2_count,activity_3_count,activity_4_count,down_event_0_count,down_event_1_count,down_event_2_count,down_event_3_count,down_event_4_count,down_event_5_count,down_event_6_count,down_event_7_count,down_event_8_count,down_event_9_count,down_event_10_count,down_event_11_count,down_event_12_count,down_event_13_count,down_event_14_count,down_event_15_count,up_event_0_count,up_event_1_count,up_event_2_count,up_event_3_count,up_event_4_count,up_event_5_count,up_event_6_count,up_event_7_count,up_event_8_count,up_event_9_count,up_event_10_count,up_event_11_count,up_event_12_count,up_event_13_count,up_event_14_count,up_event_15_count,text_change_0_count,text_change_1_count,text_change_2_count,text_change_3_count,text_change_4_count,text_change_5_count,text_change_6_count,text_change_7_count,text_change_8_count,text_change_9_count,text_change_10_count,text_change_11_count,text_change_12_count,text_change_13_count,text_change_14_count,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio,score
0,001519c8,2557,1801969,297243,2259,116.246774,91.797374,7,12,12,17,1469,1539,711.163473,257,256,128.116152,54.0,132.0,188.00,154136.0,-142.0,586.932707,4294.022274,1500200.0,1350.0,4.159624,43.180116,10632.0,2.0,0.172535,0.381013,441.0,2010,417,120,7,0,1619,357,417,27,2,92,2,21,12,0,0,4,0,3,0,0,1619,357,417,27,2,92,2,21,12,0,0,4,0,3,0,0,1940,436,120,28,14,4,5,0,0,0,1,0,0,0,0,37,377,5.169761,20,3.346931,0.000142,0.100117,0.001419,0.832534,3.5
1,0022f953,2454,1788969,275391,1758,112.221271,55.431189,5,17,17,12,1416,1676,776.205786,324,323,182.714751,104.0,186.0,268.00,145899.0,-166.0,604.547493,4897.303641,1482955.0,1581.0,9.819405,84.785626,24087.0,1.0,0.170404,0.376064,418.0,1938,260,254,1,1,1490,391,260,97,46,56,49,15,21,3,2,6,0,3,0,0,1490,391,260,97,46,56,49,15,21,3,2,6,0,3,0,0,1698,432,254,18,24,7,4,6,6,3,0,0,0,0,0,53,401,4.234414,33,3.062917,0.000181,0.131622,0.001372,0.828944,3.5
2,0042269b,4136,1771669,421201,3005,101.837766,82.383766,4,13,18,19,1649,2291,731.611702,405,404,194.772727,114.0,193.0,277.00,153886.0,-250.0,325.520435,3937.359025,1346027.0,1862.0,6.531318,71.786451,27007.0,28.0,0.167836,0.644564,694.0,3515,439,175,7,0,2904,552,439,39,6,129,0,21,23,0,0,17,0,0,0,0,2899,552,439,39,6,129,0,21,23,0,0,17,0,0,0,0,3257,615,175,23,26,23,0,2,1,0,0,4,0,0,0,47,639,5.344288,25,3.372135,0.000228,0.097679,0.002335,0.759751,6.0
3,0059420b,1556,1404469,189596,806,121.848329,113.768226,5,15,15,10,1048,1047,542.537275,207,206,103.618895,49.0,108.5,155.00,101690.0,-516.0,754.648232,4242.152639,1173478.0,357.0,1.457878,9.920533,2267.0,1.0,0.181350,0.385432,282.0,1304,151,99,1,1,1038,243,152,68,0,18,0,13,3,0,0,3,2,2,0,0,1038,243,152,68,0,18,0,13,3,0,0,3,2,2,0,0,1146,281,99,13,3,4,3,0,0,0,0,5,0,0,0,18,255,4.537255,15,2.867940,0.000147,0.132391,0.001108,0.835531,2.0
4,0075873a,2531,1662472,313702,701,123.943896,62.082013,3,11,11,9,1197,1402,600.050968,253,252,125.082971,55.0,113.0,212.00,110688.0,-158.0,502.094862,3896.209237,1270300.0,643.0,2.803953,24.251326,7094.0,1.0,0.168379,0.374277,426.0,1942,517,72,0,0,1541,324,517,39,0,33,0,23,24,0,0,10,0,17,0,0,1541,324,517,39,0,33,0,23,24,0,0,10,0,17,0,0,1964,397,72,32,25,12,25,0,0,2,0,2,0,0,0,66,431,4.556845,14,2.783927,0.000152,0.099565,0.001522,0.764103,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,1791649,499670,3323,105.437856,63.622575,4,13,13,13,1484,1634,774.000633,462,461,256.353661,163.0,297.0,339.00,128570.0,-117.0,267.942592,3458.823757,1269512.0,1262.0,1.990713,27.213406,9432.0,170.0,0.204095,2.495495,967.0,3588,960,189,2,0,2844,651,960,164,0,24,0,43,32,0,0,7,0,7,0,0,2844,651,960,164,0,24,0,43,32,0,0,7,0,7,0,0,3605,813,189,59,42,11,8,6,1,0,0,0,0,0,2,88,741,4.869096,20,2.977718,0.000257,0.097278,0.002645,0.708572,3.5
2467,ffbef7e5,2604,1799174,214221,1144,82.266129,36.178818,4,11,11,8,1808,1877,1022.078725,439,438,223.013057,114.0,227.5,336.00,267869.0,-64.0,600.545909,5630.628933,1563221.0,1124.0,2.820207,35.614696,7341.0,1.0,0.177488,0.382154,462.0,2395,60,148,1,0,1874,447,60,106,0,36,0,31,24,0,0,12,0,8,0,0,1874,447,60,106,0,36,0,31,24,0,0,12,0,8,0,0,1920,457,148,33,24,12,9,0,0,0,0,0,0,0,0,63,473,4.059197,13,2.221528,0.000243,0.168203,0.001447,0.868855,4.0
2468,ffccd6fd,3063,1959363,231580,564,75.605615,63.494975,3,11,11,6,2759,2761,1309.085537,202,201,157.589292,114.0,201.0,201.00,229804.0,-87.0,556.597322,5398.118769,1704301.0,427.0,1.344546,8.509608,4117.0,1.0,0.073481,0.260968,225.0,2849,88,126,0,0,969,1861,88,0,29,9,53,5,2,29,0,12,6,0,0,0,969,1861,88,0,29,9,53,5,2,29,0,12,6,0,0,0,1031,1879,126,6,3,18,0,0,0,0,0,0,0,0,0,7,232,4.443966,15,2.693600,0.000103,0.065622,0.001563,0.869824,1.5
2469,ffec5b38,3242,1508504,289439,1388,89.277915,54.515788,3,15,15,13,2106,2133,1192.640962,414,413,205.917027,106.0,205.0,307.75,127733.0,-132.0,370.003085,3462.066161,1199180.0,563.0,1.814563,16.147617,5881.0,9.0,0.163838,0.401387,531.0,2895,276,71,0,0,2361,457,276,52,0,14,0,31,27,0,0,6,2,4,0,0,2361,457,276,52,0,14,0,31,27,0,0,6,2,4,0,0,2593,490,71,34,29,8,4,6,1,0,2,0,0,0,0,70,512,5.169922,24,3.419903,0.000274,0.127390,0.002149,0.794947,5.0


Unnamed: 0,id,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,cursor_position_nunique,cursor_position_max,cursor_position_mean,word_count_nunique,word_count_max,word_count_mean,word_count_q1,word_count_q2,word_count_q3,action_time_gap_max,action_time_gap_min,action_time_gap_mean,action_time_gap_std,action_time_gap_sum,cursor_position_change_max,cursor_position_change_mean,cursor_position_change_std,cursor_position_change_sum,word_count_change_max,word_count_change_mean,word_count_change_std,word_count_change_sum,activity_0_count,activity_1_count,activity_2_count,activity_3_count,activity_4_count,down_event_0_count,down_event_1_count,down_event_2_count,down_event_3_count,down_event_4_count,down_event_5_count,down_event_6_count,down_event_7_count,down_event_8_count,down_event_9_count,down_event_10_count,down_event_11_count,down_event_12_count,down_event_13_count,down_event_14_count,down_event_15_count,up_event_0_count,up_event_1_count,up_event_2_count,up_event_3_count,up_event_4_count,up_event_5_count,up_event_6_count,up_event_7_count,up_event_8_count,up_event_9_count,up_event_10_count,up_event_11_count,up_event_12_count,up_event_13_count,up_event_14_count,up_event_15_count,text_change_0_count,text_change_1_count,text_change_2_count,text_change_3_count,text_change_4_count,text_change_5_count,text_change_6_count,text_change_7_count,text_change_8_count,text_change_9_count,text_change_10_count,text_change_11_count,text_change_12_count,text_change_13_count,text_change_14_count,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,0000aaaa,2,760160,172,87,86.0,1.414214,1,1,1,1,2,1,0.5,1,0,0.0,0.0,0.0,0.0,421555.0,421555.0,421555.0,,421555.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,3e-06,0.554561
1,2222bbbb,2,712023,113,67,56.5,14.849242,1,1,1,1,2,1,0.5,1,1,1.0,1.0,1.0,1.0,-421521.0,-421521.0,-421521.0,,-421521.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005
2,4444cccc,2,635641,150,94,75.0,26.870058,1,2,2,2,2,1,0.5,2,1,0.5,0.25,0.5,0.75,-450645.0,-450645.0,-450645.0,,-450645.0,1.0,1.0,,1.0,1.0,1.0,,1.0,2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962


# 2. Model

## 2-1. prepartation

### 2-1-1. package import

In [12]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

### 2-1-2 . data split into X and Y

In [13]:
Y = train_feats[obj_column]
X = train_feats.drop([exp_key_column , obj_column] , axis=1)

### 2-1-3. metric function& predict function

In [14]:
metric = "rsme"

In [15]:
def classify_predict(predict):
    res = 0
    if predict <= 0.25:
        res = 0
    elif 0.25 < predict and predict <= 0.75:
        res = 0.5
    elif 0.75 < predict and predict <= 1.25:
        res = 1.0
    elif 1.25 < predict and predict <= 1.75:
        res = 1.5
    elif 1.75 < predict and predict <= 2.25:
        res = 2.0
    elif 2.25 < predict and predict <= 2.75:
        res = 2.5
    elif 2.75 < predict and predict <= 3.25:
        res = 3.0
    elif 3.25 < predict and predict <= 3.75:
        res = 3.5
    elif 3.75 < predict and predict <= 4.25:
        res = 4.0
    elif 4.25 < predict and predict <= 4.75:
        res = 4.5
    elif 4.75 < predict and predict <= 5.25:
        res = 5.0
    elif 5.25 < predict and predict <= 5.75:
        res = 5.5
    else:
        res = 6.0
    return res
        

In [16]:
classify_predict(4.8)

5.0

In [17]:
def classify_predict_vectorize(lis):
    res = np.vectorize(classify_predict)(lis)
    return res

## 2-2. AutoML

### 2-2-1. TPOT

In [18]:
# !pip install TPOT

In [19]:
# from tpot import TPOTRegressor

In [20]:
# tpot = TPOTRegressor(scoring='neg_mean_absolute_error',
#                      max_time_mins = 100,
#                      generations=10,
#                      population_size=2,
#                      random_state=42,
#                      verbosity=2,
#                      n_jobs=-1,
#                      memory = "auto"
#                     )
# tpot.fit(X,Y)
# tpot.fitted_pipeline_

In [21]:
# tpot

In [22]:
# def calculate_scores(true, pred):
#     """全ての評価指標を計算する

#     Parameters
#     ----------
#     true (np.array)       : 実測値
#     pred (np.array)       : 予測値

#     Returns
#     -------
#     scores (pd.DataFrame) : 各評価指標を纏めた結果

#     """
#     scores = {}
#     scores = np.sqrt(mean_squared_error(true, pred))
#     return scores

# scores = calculate_scores(Y, tpot.predict(X))
# print(scores)

In [23]:
# tpot_dict = {}
# tpot_dict["model"] = tpot.fitted_pipeline_

### 2-2-2. pycaret

In [24]:
# !pip install pycaret==2.0
# # !pip install pycaret--no-deps

In [25]:
# from pycaret.regression import *

In [26]:
# data_pycaret = X.join(Y)
# exp = setup(data = data_pycaret, target = obj_column,train_size=0.7,data_split_shuffle=True,session_id=2)
# compare_models()

## 2-2. study by Optuna
find best parameters of each models

In [27]:
models = {}
n_trials = 10
n_splits = 10

### 2-2-1. lgb

In [28]:
def lgb_objective(trial,data=X,target=Y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    param = {
        'metric': 'rmse', 
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 1000,30000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_int('max_depth', 1 , 100),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = lgb.LGBMRegressor(**param)  
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    preds = classify_predict_vectorize(preds)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [29]:
lgb_dict = {}
lgb_study = optuna.create_study(direction='minimize')
lgb_study.optimize(lgb_objective, n_trials=n_trials)
print('Number of finished trials:', len(lgb_study.trials))
print('Best trial:', lgb_study.best_trial.params)
lgb_param = lgb_study.best_params 
lgb_dict["model"] = lgb.LGBMRegressor(**lgb_param)  

[I 2023-10-11 09:46:04,882] A new study created in memory with name: no-name-a19185d2-fe8d-47be-a6b2-2968da47bc1a
[I 2023-10-11 09:46:10,029] Trial 0 finished with value: 0.6170964922992844 and parameters: {'n_estimators': 15801, 'reg_alpha': 1.786597292598502, 'reg_lambda': 0.005559939106367747, 'colsample_bytree': 0.9, 'subsample': 0.5, 'learning_rate': 0.006, 'max_depth': 82, 'num_leaves': 853, 'min_child_samples': 137, 'min_data_per_groups': 32}. Best is trial 0 with value: 0.6170964922992844.
[I 2023-10-11 09:46:12,074] Trial 1 finished with value: 0.6021845752144958 and parameters: {'n_estimators': 6035, 'reg_alpha': 0.5291269212259468, 'reg_lambda': 0.5302551415722379, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 89, 'num_leaves': 535, 'min_child_samples': 150, 'min_data_per_groups': 83}. Best is trial 1 with value: 0.6021845752144958.
[I 2023-10-11 09:46:17,692] Trial 2 finished with value: 0.6088571841631424 and parameters: {'n_estimators': 13

Number of finished trials: 10
Best trial: {'n_estimators': 21288, 'reg_alpha': 0.0012425702065401837, 'reg_lambda': 1.4617793623963724, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 63, 'num_leaves': 925, 'min_child_samples': 79, 'min_data_per_groups': 36}


### 2-2-2. xgb

In [30]:
def xgb_objective(trial,data=X,target=Y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    preds = classify_predict_vectorize(preds)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [31]:
xgb_dict = {}
xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(xgb_objective, n_trials=n_trials)
print('Number of finished trials:', len(xgb_study.trials))
print('Best trial:', xgb_study.best_trial.params)
xgb_params=xgb_study.best_params 
xgb_dict["model"] = XGBRegressor(**xgb_params)

[I 2023-10-11 09:46:45,425] A new study created in memory with name: no-name-62bb5f42-dac3-45dc-a150-58a53b62a47f
[I 2023-10-11 09:46:47,180] Trial 0 finished with value: 0.5988203892536214 and parameters: {'max_depth': 8, 'learning_rate': 0.015063863993035631, 'n_estimators': 406, 'min_child_weight': 5, 'gamma': 0.34762844034110707, 'subsample': 0.06816664091147229, 'colsample_bytree': 0.30082008054288906, 'reg_alpha': 0.45712384714818055, 'reg_lambda': 0.19194192743450403, 'random_state': 214}. Best is trial 0 with value: 0.5988203892536214.
[I 2023-10-11 09:46:47,474] Trial 1 finished with value: 0.7564537145273478 and parameters: {'max_depth': 5, 'learning_rate': 0.825486350640641, 'n_estimators': 867, 'min_child_weight': 8, 'gamma': 0.9570041760132174, 'subsample': 0.5886789939631742, 'colsample_bytree': 0.03135074753605189, 'reg_alpha': 0.49026655448554035, 'reg_lambda': 0.05081204505180769, 'random_state': 103}. Best is trial 0 with value: 0.5988203892536214.
[I 2023-10-11 09:46

Number of finished trials: 10
Best trial: {'max_depth': 2, 'learning_rate': 0.04334968253755717, 'n_estimators': 777, 'min_child_weight': 7, 'gamma': 0.555328963209988, 'subsample': 0.29477808952526724, 'colsample_bytree': 0.32306543091026313, 'reg_alpha': 0.9413049279874941, 'reg_lambda': 0.8642031453351593, 'random_state': 194}


### 2-2-3. randomforest

In [32]:
# def rf_objective(trial,data=X,target=Y):
    
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
#     param = {
#         "max_depth": trial.suggest_int('max_depth', 2, 1000),
#         "max_leaf_nodes": trial.suggest_int('max_leaf_nodes', 2,1000),
#     }
#     model = HistGradientBoostingRegressor(**param)  
    
#     model.fit(train_x,train_y)
    
#     preds = model.predict(test_x)
#     preds = classify_predict_vectorize(preds)
    
#     rmse = mean_squared_error(test_y, preds,squared=False)
    
#     return rmse

In [33]:
# rf_dict = {}
# rf_study = optuna.create_study(direction='minimize')
# rf_study.optimize(rf_objective, n_trials=n_trials)
# print('Number of finished trials:', len(rf_study.trials))
# print('Best trial:', rf_study.best_trial.params)
# rf_params=rf_study.best_params 
# rf_dict["model"] = HistGradientBoostingRegressor(**rf_params)

In [34]:
models["lgb"] = lgb_dict
models["xgb"] = xgb_dict
# models["tpot"] = tpot_dict
# models["rf"] = rf_dict

## 2-3. train lGBM

In [35]:
feature_names = list(filter(lambda x: x not in [obj_column, exp_key_column], train_feats.columns))

In [36]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for key, value in models.items():
        print(f"***********{key}***************")
        best_model = value["model"]
        oof = []
        prediction = test_feats[[exp_key_column]]
        prediction[obj_column] = 0
        df_importance_list = []
        for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_feats[feature_names])):
            X_train = train_feats.iloc[trn_idx][feature_names]
            Y_train = train_feats.iloc[trn_idx][obj_column]

            X_val = train_feats.iloc[val_idx][feature_names]
            Y_val = train_feats.iloc[val_idx][obj_column]

            print('\nFold_{} Training ================================\n'.format(fold_id+1))
            best_model = value["model"]

            ## we have to change because each models have different params
            if key == "xgb":
                model = best_model.fit(X_train,
                                      Y_train,
                                      # eval_names=['train', 'valid'],
                                      eval_set=[(X_train, Y_train), (X_val, Y_val)],
                                      verbose=20,
                                      # eval_metric= [metric],
                                      early_stopping_rounds=50)
                pred_val = model.predict(X_val)
                pred_test = model.predict(test_feats[feature_names])
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                df_oof[f"pred_{fold_id+1}"] = pred_val
                oof.append(df_oof)
                df_oof["pred_range"] = classify_predict_vectorize(pred_val)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))
                    
            elif key == "lgb":
                model = best_model.fit(X_train,Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=20,
                              eval_metric= metric,
                              early_stopping_rounds=50)
                pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
                pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                oof.append(df_oof)
                df_oof["pred_range"] = classify_predict_vectorize(pred_val)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))

            elif key == "rf":
                model = best_model.fit(X_train,Y_train)
                pred_val = model.predict(X_val)
                pred_test = model.predict(test_feats[feature_names])
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                oof.append(df_oof)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))
            elif key == "tpot":
                model = best_model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=20,
                          eval_metric= metric,
                          early_stopping_rounds=50)
                pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
                pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                oof.append(df_oof)
                df_oof["pred_range"] = classify_predict_vectorize(pred_val)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))
                
            models[key]["prediction"] = prediction
            if key in ["lgb","xgb"]:
                df_importance = pd.DataFrame({
                    'column': feature_names,
                    'importance': model.feature_importances_,
                })
                df_importance_list.append(df_importance)
            else:
                df_importance = pd.DataFrame([])
            del model, pred_val, pred_test
            gc.collect()
            df_oof = pd.concat(oof)
            rmse = mean_squared_error(df_oof[obj_column], np.clip(df_oof['pred'], a_min=0.5, a_max=6.0), squared=False)
            print('rmse:', rmse)
            models[key]["score"] = rmse

***********lgb***************


[20]	train's l2: 0.708534	valid's l2: 0.719253
[40]	train's l2: 0.538557	valid's l2: 0.566603
[60]	train's l2: 0.44885	valid's l2: 0.488881
[80]	train's l2: 0.395154	valid's l2: 0.44394
[100]	train's l2: 0.360956	valid's l2: 0.417866
[120]	train's l2: 0.336176	valid's l2: 0.403903
[140]	train's l2: 0.317166	valid's l2: 0.394236
[160]	train's l2: 0.301281	valid's l2: 0.386344
[180]	train's l2: 0.287652	valid's l2: 0.383419
[200]	train's l2: 0.275225	valid's l2: 0.379572
[220]	train's l2: 0.264116	valid's l2: 0.37873
[240]	train's l2: 0.253394	valid's l2: 0.377155
[260]	train's l2: 0.243794	valid's l2: 0.378549
[280]	train's l2: 0.234274	valid's l2: 0.378951
rmse: 0.6141291516911603


[20]	train's l2: 0.722161	valid's l2: 0.594251
[40]	train's l2: 0.550761	valid's l2: 0.442582
[60]	train's l2: 0.460133	valid's l2: 0.369163
[80]	train's l2: 0.404609	valid's l2: 0.331605
[100]	train's l2: 0.368889	valid's l2: 0.314353
[120]	train's l2: 0.34317	valid's l2: 0.

Unnamed: 0,id,score,pred,pred_range
1,0022f953,3.5,3.598767,3.5
21,019737b6,3.5,4.60266,4.5
34,031c0c58,4.0,3.877673,4.0
64,0666fb4e,4.5,5.011947,5.0
95,09a67581,4.5,4.803537,5.0
98,09eb3ce5,3.0,2.648533,2.5
103,0a4e3aec,3.5,3.437089,3.5
122,0ca05fe3,1.5,3.021105,3.0
130,0d71be2a,4.5,4.189695,4.0
146,0f2b0127,3.5,3.221151,3.0


Unnamed: 0,id,score
0,0000aaaa,1.520182
1,2222bbbb,1.500047
2,4444cccc,1.491797


rmse: 0.6413286802762734
***********xgb***************


[0]	validation_0-rmse:3.22935	validation_1-rmse:3.26715
[20]	validation_0-rmse:1.48390	validation_1-rmse:1.51260
[40]	validation_0-rmse:0.86237	validation_1-rmse:0.87946
[60]	validation_0-rmse:0.69026	validation_1-rmse:0.70142
[80]	validation_0-rmse:0.64468	validation_1-rmse:0.65573
[100]	validation_0-rmse:0.62739	validation_1-rmse:0.64048
[120]	validation_0-rmse:0.61736	validation_1-rmse:0.63182
[140]	validation_0-rmse:0.60936	validation_1-rmse:0.62621
[160]	validation_0-rmse:0.60359	validation_1-rmse:0.62119
[180]	validation_0-rmse:0.59819	validation_1-rmse:0.61945
[200]	validation_0-rmse:0.59219	validation_1-rmse:0.61692
[220]	validation_0-rmse:0.58810	validation_1-rmse:0.61491
[240]	validation_0-rmse:0.58336	validation_1-rmse:0.61446
[260]	validation_0-rmse:0.57939	validation_1-rmse:0.61411
[280]	validation_0-rmse:0.57561	validation_1-rmse:0.61296
[300]	validation_0-rmse:0.57235	validation_1-rmse:0.61207
[320]	validation_0-rm

Unnamed: 0,id,score,pred,pred_10,pred_range
1,0022f953,3.5,3.737928,3.737928,3.5
21,019737b6,3.5,4.673585,4.673585,4.5
34,031c0c58,4.0,4.027115,4.027115,4.0
64,0666fb4e,4.5,4.985784,4.985784,5.0
95,09a67581,4.5,4.96018,4.96018,5.0
98,09eb3ce5,3.0,2.702969,2.702969,2.5
103,0a4e3aec,3.5,3.338404,3.338404,3.5
122,0ca05fe3,1.5,2.975853,2.975853,3.0
130,0d71be2a,4.5,4.288417,4.288417,4.5
146,0f2b0127,3.5,3.314513,3.314513,3.5


Unnamed: 0,id,score
0,0000aaaa,1.085214
1,2222bbbb,1.017503
2,4444cccc,0.97673


rmse: 0.6349788763397844


In [37]:
# oof = []
# prediction = test_feats[[exp_key_column]]
# prediction[obj_column] = 0
# df_importance_list = []

# kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
# for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_feats[feature_names])):
#     X_train = train_feats.iloc[trn_idx][feature_names]
#     Y_train = train_feats.iloc[trn_idx][obj_column]

#     X_val = train_feats.iloc[val_idx][feature_names]
#     Y_val = train_feats.iloc[val_idx][obj_column]

#     print('\nFold_{} Training ================================\n'.format(fold_id+1))
#     for key, value in models.items():
#         print(f"***********{key}***************")
#         best_model = value["model"]
        
#         ## we have to change because each models have different params
#         if key == "xgb":
#             model = best_model.fit(X_train,
#                                   Y_train,
#                                   # eval_names=['train', 'valid'],
#                                   eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                                   verbose=20,
#                                   # eval_metric= [metric],
#                                   early_stopping_rounds=50)
#             pred_val = model.predict(X_val)
#             pred_test = model.predict(test_feats[feature_names])
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             df_oof["pred_range"] = classify_predict_vectorize(pred_val)
#             display(df_oof.head(10))
#         elif key == "lgb":
#             model = best_model.fit(X_train,
#                       Y_train,
#                       eval_names=['train', 'valid'],
#                       eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                       verbose=20,
#                       eval_metric= metric,
#                       early_stopping_rounds=50)
#             pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
#             pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             df_oof["pred_range"] = classify_predict_vectorize(pred_val)
#             display(df_oof.head(10))
        
#         elif key == "rf":
#             model = best_model.fit(X_train,Y_train)
#             pred_val = model.predict(X_val)
#             pred_test = model.predict(test_feats[feature_names])
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             display(df_oof.head(10))
#         elif key == "tpot":
#             model = best_model.fit(X_train,
#                       Y_train,
#                       eval_names=['train', 'valid'],
#                       eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                       verbose=20,
#                       eval_metric= metric,
#                       early_stopping_rounds=50)
#             pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
#             pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             df_oof["pred_range"] = classify_predict_vectorize(pred_val)
#             display(df_oof.head(10))
            
#         models[key]["prediction"] = prediction
#         if key in ["lgb","xgb"]:
#             df_importance = pd.DataFrame({
#                 'column': feature_names,
#                 'importance': model.feature_importances_,
#             })
#             df_importance_list.append(df_importance)
#         else:
#             df_importance = pd.DataFrame([])
#         del model, pred_val, pred_test
#         gc.collect()
#         df_oof = pd.concat(oof)
#         rmse = mean_squared_error(df_oof[obj_column], np.clip(df_oof['pred'], a_min=0.5, a_max=6.0), squared=False)
#         print('rmse:', rmse)
#         models[key]["score"] = rmse

In [38]:
for key, value in models.items():
    print(key)
#     print(value.values())
#     print(value["model"])
    print(value["score"])

lgb
0.6413286802762734
xgb
0.6349788763397844


# 3. submission

In [39]:
prediction

Unnamed: 0,id,score
0,0000aaaa,1.085214
1,2222bbbb,1.017503
2,4444cccc,0.97673


In [40]:
# prediction['score'] = np.clip(prediction['score'], a_min=0.5, a_max=6.0)
display(models["lgb"]["prediction"])
prediction['score'] = classify_predict_vectorize(models["lgb"]["prediction"]["score"])
display(prediction)
prediction.to_csv('submission.csv', index=False)

Unnamed: 0,id,score
0,0000aaaa,1.520182
1,2222bbbb,1.500047
2,4444cccc,1.491797


Unnamed: 0,id,score
0,0000aaaa,1.5
1,2222bbbb,1.5
2,4444cccc,1.5
