Agenda
1. preparation
2. Model

In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import gc
import re
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)
from tqdm.auto import tqdm
tqdm.pandas()

from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# 1-1. load data

In [2]:
train_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
display(train_logs)
train_scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
display(train_scores)
test_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
display(test_logs)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240


Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0
...,...,...
2466,ffb8c745,3.5
2467,ffbef7e5,4.0
2468,ffccd6fd,1.5
2469,ffec5b38,5.0


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0
5,4444cccc,2,184996,185052,56,Input,q,q,q,1,1


In [3]:
result_tmp = pd.DataFrame([dict(Counter(train_logs["activity"]))])

## 1-2. helper functions

### 1-2-1. const definition

In [4]:
# variables category
exp_key_column = "id"
obj_column = "score"

In [5]:
# count_elements
target_activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
target_events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
          'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
target_text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
target_punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']

In [6]:
# def element_counts_in_list(id_col,target_col,count_element):
#     print("============element_counts_in_list================")
#     print(target_col)
#     result_tmp = pd.DataFrame([dict(Counter(target_col))])
#     result = result_tmp.copy()
#     result["tmp"] = 0
#     final_columns = [id_col, f"{target_col}_others_cnt"]
#     display(result_tmp)
#     for element_col in result_tmp.columns:
#         try:
#             if element_col in count_element:
#                 result[f"{target_col}_{element_col}_cnt"] = result_tmp[element_col]
#                 final_columns.append(f"{target_col}_{element_col}_cnt")
#             else:
#                 result["tmp"] += result_tmp[element_col]
#                 result[f"{target_col}_others_cnt"] = result["tmp"]
#         except Exception as e:
#             print(f"error:{e}")
#     return result[final_columns]
# def element_counts_in_list_vectorize(df, id_col, target_col, count_element):
#     res = np.vectorize(element_counts_in_list)(df[id_col], df[target_col], count_element)
#     return res

In [7]:
def element_counts_in_list(id_colname,id,target_colname,target, target_name):
    #print("============element_counts_in_list================")
    result_tmp = pd.DataFrame([dict(Counter(target))])
    #display(result_tmp)
    result = result_tmp.copy()
    result["tmp"] = 0
    result[id_colname] = id
    final_columns = [id_colname]
    count_element = []
    if target_name == "target_activities":
        count_element = target_activities
    elif target_name == "target_events":
        count_element = target_events
    elif target_name == "target_text_changes":
        count_element = target_text_changes
    #print(count_element)
    for element_col in result_tmp.columns:
        if element_col in count_element:
            result[f"{target_colname}_{element_col}_cnt"] = result_tmp[element_col]
            final_columns.append(f"{target_colname}_{element_col}_cnt")
        else:
            result["tmp"] += result_tmp[element_col]
            result[f"{target_colname}_others_cnt"] = result["tmp"]
            if f"{target_colname}_others_cnt" not in final_columns:
                final_columns.append(f"{target_colname}_others_cnt")
    #display(result[final_columns])
    #final_columns = list(final_columns)
    return result[final_columns]
def element_counts_in_list_vectorize(df, id, target, target_name):
    res = np.vectorize(element_counts_in_list)(id, df[id], target, df[target], target_name)
    return res

In [8]:
def element_counts(df: pd.DataFrame, key_colname: str, count_colname: str, count_elements_name: str, count_elements: list, match: str):
    """
    This function counts each elements
    Parameters
    ----------
    df: DataFrame
    key_colname: aggregator
    count_colname: count columns
    count_elements: count value in this list
    match: exact or partital
        
    Returns
    -------
    d : pd.DataFrame
    """
    print(f"======================element_counts start({count_colname})!!======================")
    tmp_df = df.groupby(key_colname).agg({count_colname: list}).reset_index()
    #display(tmp_df)
    ret = list()
    if match == "exact":
        # def element_counts_in_list(id_colname,id,target_colname,target,count_element):
#         ret = tmp_df.apply(element_counts_in_list,args = (key_colname, tmp_df[key_colname], count_colname, tmp_df[count_colname], count_elements) )
        ret = element_counts_in_list_vectorize(tmp_df, key_colname, count_colname, count_elements_name)
#         for li in tqdm(tmp_df[count_colname].values):
#             items = list(Counter(li).items())
#             di = dict()
#             # make all keys
#             for k in count_elements:
#                 di[k] = 0
#             for item in items:
#                 k,v = item[0], item[1]
#                 if k in di: # this part counts disgnated part
#                     di[k] = v
#             ret.append(di)
        #display(ret)
        ret = pd.DataFrame(ret)
        # display(ret)
        cols = [f"{count_colname}_{i}_count" for i in range(len(ret.columns))]
        ret.columns = cols
        #result = pd.DataFrame()
        #for number in range(len(ret)):
            
    elif match == "partitial":
        for li in tqdm(tmp_df['down_event'].values):
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in count_elements:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
    return ret
    
def get_input_words(df: pd.DataFrame):
    print("======================get_input_words start!!======================")
    tmp_df = df[(~df["text_change"].str.contains("=>"))&(df["text_change"] != "Nochange")].reset_index(drop= True)
    tmp_df = tmp_df.groupby(exp_key_column).agg({"text_change": list}).reset_index()
    # concat part
    tmp_df["text_change"] = tmp_df["text_change"].apply(lambda x: "".join(x))
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
    
    tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
    display(tmp_df[["id","text_change"]])
    
    # calc part
    tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df.drop(['text_change'], axis=1, inplace=True)
    return tmp_df

## 1-3. make features

In [9]:
def q1(x):
    return x.quantile(0.1)
def q2(x):
    return x.quantile(0.2)
def q3(x):
    return x.quantile(0.3)
def q4(x):
    return x.quantile(0.4)
def q5(x):
    return x.quantile(0.5)
def q6(x):
    return x.quantile(0.6)
def q7(x):
    return x.quantile(0.7)
def q8(x):
    return x.quantile(0.8)
def q9(x):
    return x.quantile(0.9)

In [10]:
def make_feats(df: pd.DataFrame):
    print("======================make_feats start!!======================")
    feats = pd.DataFrame({exp_key_column: df[exp_key_column].unique().tolist()})
    # time shift
    target_col = "up_time"
    df[f'{target_col}_shift1'] = df.groupby(exp_key_column)[target_col].shift(1) #(going under ↓)
    df['action_time_gap'] = df['down_time'] - df[f'{target_col}_shift1']
    df.drop(f'{target_col}_shift1', axis=1, inplace=True)
    
    # cursor position shift
    target_col = "cursor_position"
    df[f'{target_col}_shift1'] = df.groupby(exp_key_column)[target_col].shift(1)
    df[f'{target_col}_change'] = np.abs(df[target_col] - df[f'{target_col}_shift1'])
    df.drop(f'{target_col}_shift1', axis=1, inplace=True)
    
    # word count shift
    target_col = "word_count"
    df[f'{target_col}_shift1'] = df.groupby(exp_key_column)[target_col].shift(1)
    df[f'{target_col}_change'] = np.abs(df[target_col] - df[f'{target_col}_shift1'])
    df.drop(f'{target_col}_shift1', axis=1, inplace=True)
    
    for item in tqdm([
        ('event_id', ['max']),
        ('up_time', ['max']),
        ('action_time', ['sum', 'max', 'mean', 'std']),
        ('activity', ['nunique']),
        ('down_event', ['nunique']),
        ('up_event', ['nunique']),
        ('text_change', ['nunique']),
        ('cursor_position', ['nunique', 'max', 'mean']),
        ('word_count', ['nunique', 'max', 'mean', "sum", q1, q2, q3, q4, q5, q6, q7, q8, q9]),
        ('action_time_gap', ['max', 'min', 'mean', 'std', 'sum', q1, q2, q3, q4, q5, q6, q7, q8, q9]),
        ('cursor_position_change', ['max', 'mean', 'std', 'sum']),
        ('word_count_change', ['max', 'mean', 'std', 'sum'])
    ]):
        colname, methods = item[0], item[1]
        for method in methods:
            print(f"calc: {method} of {colname}")
            if type(method) == str: 
                tmp_df = df.groupby([exp_key_column]).agg({colname: method}).reset_index().rename(columns = {colname: f"{colname}_{method}"})
                feats = feats.merge(tmp_df, on = exp_key_column, how= "left")
            else:
                tmp_df = df.groupby([exp_key_column]).agg({colname: method}).reset_index().rename(columns = {colname: f"{colname}_{method.__name__}"})
                feats = feats.merge(tmp_df, on = exp_key_column, how= "left")
    
    # counts
#     element_counts(df: pd.DataFrame, key_colname = exp_key_column: str, count_colname: str, count_elements: list, match: str):
    # tmp_df = activity_counts(df)
    tmp_df = pd.DataFrame(element_counts(df, exp_key_column, "activity", "target_activities", target_activities, "exact"))
    #print("==========tmp_df===============")
    elements_counts_df = pd.DataFrame()
    for i in range(len(tmp_df)):
        tmp = pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]
        #display(tmp)
        elements_counts_df = pd.concat([elements_counts_df,tmp.fillna(0)])
        #element_counts_df = pd.concat([elements_counts_df,pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]])
    display(elements_counts_df)
    feats = feats.merge(elements_counts_df,on="id",how="left")
    #display(element_tmp_df)
    #feats = pd.concat([feats, tmp_df], axis=1)
    
    
    tmp_df = element_counts(df, exp_key_column, "down_event", "target_events", target_events, "exact")
    elements_counts_df = pd.DataFrame()
    for i in range(len(tmp_df)):
        tmp = pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]
        #display(tmp)
        elements_counts_df = pd.concat([elements_counts_df,tmp.fillna(0)])
    #display(elements_counts_df)
    feats = feats.merge(elements_counts_df,on="id",how="left")
#     print("==========tmp_df=============")
#     display(tmp_df)
#     feats = pd.concat([feats, tmp_df], axis=1)
    
#     tmp_df = event_counts(df, 'up_event')
    tmp_df = element_counts(df, exp_key_column, "up_event", "target_events", target_events, "exact")
    elements_counts_df = pd.DataFrame()
    for i in range(len(tmp_df)):
        #element_counts_df = pd.concat([elements_counts_df,pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]])
        tmp = pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]
        #display(tmp)
        elements_counts_df = pd.concat([elements_counts_df,tmp.fillna(0)])
    #display(elements_counts_df)
    feats = feats.merge(elements_counts_df,on="id",how="left")
#     print("==========tmp_df=============")
#     display(tmp_df)
#     feats = pd.concat([feats, tmp_df], axis=1)
    
#     tmp_df = text_change_counts(df)
    tmp_df = element_counts(df, exp_key_column, "text_change", "target_text_changes", target_text_changes, "exact")
    elements_counts_df = pd.DataFrame()
    for i in range(len(tmp_df)):
        #element_counts_df = pd.concat([elements_counts_df,pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]])
        tmp = pd.DataFrame(tmp_df.iloc[i]).iloc[0,0]
        #display(tmp)
        #elements_counts_df = pd.concat([elements_counts_df,tmp])
        elements_counts_df = pd.concat([elements_counts_df,tmp.fillna(0)])
    #display(elements_counts_df)
    feats = feats.merge(elements_counts_df,on="id",how="left")
    
    tmp_df = element_counts(df, exp_key_column, "down_event", "target_punctuations", target_punctuations, "partitial")
    feats = pd.concat([feats, tmp_df], axis=1)
    
    
    # input words
    tmp_df = get_input_words(df)
    print("==========tmp_df=============")
    display(tmp_df)
    feats = pd.merge(feats, tmp_df, on='id', how='left')
    
    # compare feats
    feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
    feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
    feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
    feats['idle_time_ratio'] = feats['action_time_gap_sum'] / feats['up_time_max']
    
    return feats
    

In [11]:
train_feats = make_feats(train_logs)



  0%|          | 0/12 [00:00<?, ?it/s]

calc: max of event_id
calc: max of up_time
calc: sum of action_time
calc: max of action_time
calc: mean of action_time
calc: std of action_time
calc: nunique of activity
calc: nunique of down_event
calc: nunique of up_event
calc: nunique of text_change
calc: nunique of cursor_position
calc: max of cursor_position
calc: mean of cursor_position
calc: nunique of word_count
calc: max of word_count
calc: mean of word_count
calc: sum of word_count
calc: <function q1 at 0x7ba51db95750> of word_count
calc: <function q2 at 0x7ba51db957e0> of word_count
calc: <function q3 at 0x7ba51db95870> of word_count
calc: <function q4 at 0x7ba51db95900> of word_count
calc: <function q5 at 0x7ba51db95990> of word_count
calc: <function q6 at 0x7ba51db95a20> of word_count
calc: <function q7 at 0x7ba51db95ab0> of word_count
calc: <function q8 at 0x7ba51db95b40> of word_count
calc: <function q9 at 0x7ba51db95bd0> of word_count
calc: max of action_time_gap
calc: min of action_time_gap
calc: mean of action_time_ga

Unnamed: 0,id,activity_Nonproduction_cnt,activity_Input_cnt,activity_Remove/Cut_cnt,activity_Replace_cnt,activity_others_cnt,activity_Paste_cnt
0,001519c8,120,2010,417.0,7.0,3.0,
0,0022f953,254,1938,260.0,1.0,,1.0
0,0042269b,175,3515,439.0,7.0,,
0,0059420b,99,1304,151.0,1.0,,1.0
0,0075873a,72,1942,517.0,,,
...,...,...,...,...,...,...,...
0,ffb8c745,189,3588,960.0,2.0,,
0,ffbef7e5,148,2395,60.0,1.0,,
0,ffccd6fd,126,2849,88.0,,,
0,ffec5b38,71,2895,276.0,,,




  0%|          | 0/2471 [00:00<?, ?it/s]



Unnamed: 0,id,text_change
0,001519c8,"[qqqqqq, qqq, qqqqqqq, qqqqqq, qq, qqqq, qqqqq..."
1,0022f953,"[qqqq, qq, qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq, ..."
2,0042269b,"[qqqqqqq, qqq, qqqq, qqqq, qqqq, qq, qqqqqqqq,..."
3,0059420b,"[qqqq, qq, qqqqqqq, qqqqqq, q, q, qqq, qqqqqqq..."
4,0075873a,"[qqqqqqqqqqq, qq, q, qq, qqqqq, qq, qqqqqqqqqq..."
...,...,...
2466,ffb8c745,"[qq, qqqqq, q, qqqqqqq, qqq, q, qqqqq, q, qqqq..."
2467,ffbef7e5,"[qqqq, qqqqqq, qqqqq, qq, qqqqq, qqqqq, qq, qq..."
2468,ffccd6fd,"[qqqqqq, qqqq, q, qqqqqqq, qqqqqqqq, q, qq, qq..."
2469,ffec5b38,"[qqqqqqqqqqqq, qqqqqqq, qqqqqq, qqqq, qqqqq, q..."




Unnamed: 0,id,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std
0,001519c8,377,5.169761,20,3.346931
1,0022f953,401,4.234414,33,3.062917
2,0042269b,639,5.344288,25,3.372135
3,0059420b,255,4.537255,15,2.867940
4,0075873a,431,4.556845,14,2.783927
...,...,...,...,...,...
2466,ffb8c745,741,4.869096,20,2.977718
2467,ffbef7e5,473,4.059197,13,2.221528
2468,ffccd6fd,232,4.443966,15,2.693600
2469,ffec5b38,512,5.169922,24,3.419903


In [12]:
test_feats = make_feats(test_logs)



  0%|          | 0/12 [00:00<?, ?it/s]

calc: max of event_id
calc: max of up_time
calc: sum of action_time
calc: max of action_time
calc: mean of action_time
calc: std of action_time
calc: nunique of activity
calc: nunique of down_event
calc: nunique of up_event
calc: nunique of text_change
calc: nunique of cursor_position
calc: max of cursor_position
calc: mean of cursor_position
calc: nunique of word_count
calc: max of word_count
calc: mean of word_count
calc: sum of word_count
calc: <function q1 at 0x7ba51db95750> of word_count
calc: <function q2 at 0x7ba51db957e0> of word_count
calc: <function q3 at 0x7ba51db95870> of word_count
calc: <function q4 at 0x7ba51db95900> of word_count
calc: <function q5 at 0x7ba51db95990> of word_count
calc: <function q6 at 0x7ba51db95a20> of word_count
calc: <function q7 at 0x7ba51db95ab0> of word_count
calc: <function q8 at 0x7ba51db95b40> of word_count
calc: <function q9 at 0x7ba51db95bd0> of word_count
calc: max of action_time_gap
calc: min of action_time_gap
calc: mean of action_time_ga

Unnamed: 0,id,activity_Input_cnt
0,0000aaaa,2
0,2222bbbb,2
0,4444cccc,2




  0%|          | 0/3 [00:00<?, ?it/s]



Unnamed: 0,id,text_change
0,0000aaaa,[]
1,2222bbbb,[qq]
2,4444cccc,[q]




Unnamed: 0,id,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std
0,0000aaaa,0,0.0,0,0.0
1,2222bbbb,1,2.0,2,0.0
2,4444cccc,1,1.0,1,0.0


In [13]:
for col in train_feats.columns:
    if col in test_feats.columns:
        test_feats[col] = test_feats[col]
    else:
        test_feats[col] = 0

In [14]:
test_feats.shape

(3, 114)

In [15]:
train_feats = train_feats.merge(train_scores, on='id', how='left') # join object variance

In [16]:
display(train_feats)
print(train_feats.shape)
for col in train_feats.columns:
    print(col)
display(test_feats)
print(test_feats.shape)
print(test_feats.columns)
for col in test_feats.columns:
    print(col)

Unnamed: 0,id,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,cursor_position_nunique,cursor_position_max,cursor_position_mean,word_count_nunique,word_count_max,word_count_mean,word_count_sum,word_count_q1,word_count_q2,word_count_q3,word_count_q4,word_count_q5,word_count_q6,word_count_q7,word_count_q8,word_count_q9,action_time_gap_max,action_time_gap_min,action_time_gap_mean,action_time_gap_std,action_time_gap_sum,action_time_gap_q1,action_time_gap_q2,action_time_gap_q3,action_time_gap_q4,action_time_gap_q5,action_time_gap_q6,action_time_gap_q7,action_time_gap_q8,action_time_gap_q9,cursor_position_change_max,cursor_position_change_mean,cursor_position_change_std,cursor_position_change_sum,word_count_change_max,word_count_change_mean,word_count_change_std,word_count_change_sum,activity_Nonproduction_cnt,activity_Input_cnt,activity_Remove/Cut_cnt,activity_Replace_cnt,activity_others_cnt,activity_Paste_cnt,down_event_Leftclick_cnt,down_event_Shift_cnt,down_event_q_cnt,down_event_Space_cnt,down_event_Backspace_cnt,down_event_._cnt,"down_event_,_cnt",down_event_Enter_cnt,down_event_ArrowLeft_cnt,down_event_'_cnt,down_event_others_cnt,down_event_ArrowRight_cnt,down_event_ArrowUp_cnt,down_event_ArrowDown_cnt,down_event_CapsLock_cnt,down_event_Delete_cnt,down_event_Unidentified_cnt,up_event_Leftclick_cnt,up_event_Shift_cnt,up_event_q_cnt,up_event_Space_cnt,up_event_Backspace_cnt,up_event_._cnt,"up_event_,_cnt",up_event_Enter_cnt,up_event_ArrowLeft_cnt,up_event_'_cnt,up_event_others_cnt,up_event_ArrowRight_cnt,up_event_ArrowUp_cnt,up_event_ArrowDown_cnt,up_event_CapsLock_cnt,up_event_Delete_cnt,up_event_Unidentified_cnt,text_change_NoChange_cnt,text_change_q_cnt,text_change_ _cnt,text_change_._cnt,"text_change_,_cnt",text_change_others_cnt,text_change_\n_cnt,text_change_'_cnt,text_change_;_cnt,text_change_-_cnt,text_change_?_cnt,"text_change_""_cnt",text_change_=_cnt,text_change_/_cnt,text_change_:_cnt,text_change_\_cnt,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio,score
0,001519c8,2557,1801969,297243,2259,116.246774,91.797374,7,12,12,17,1469,1539,711.163473,257,256,128.116152,327593,21.0,43.0,71.0,106.0,132.0,160.0,170.0,208.0,237.0,154136.0,-142.0,586.932707,4294.022274,1500200.0,-52.0,-22.0,-3.0,22.0,51.0,70.0,102.0,233.0,707.0,1350.0,4.159624,43.180116,10632.0,2.0,0.172535,0.381013,441.0,120,2010,417.0,7.0,3.0,,92.0,27.0,1619,357,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,92.0,27.0,1619,357,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,120,1940,436,28,14.0,9.0,4.0,5.0,1.0,,,,,,,,37,377,5.169761,20,3.346931,0.000142,0.100117,0.001419,0.832534,3.5
1,0022f953,2454,1788969,275391,1758,112.221271,55.431189,5,17,17,12,1416,1676,776.205786,324,323,182.714751,448382,34.3,81.0,124.0,162.0,186.0,221.0,252.0,284.0,315.0,145899.0,-166.0,604.547493,4897.303641,1482955.0,-34.8,-13.0,21.0,32.0,52.0,74.0,103.0,185.0,547.8,1581.0,9.819405,84.785626,24087.0,1.0,0.170404,0.376064,418.0,254,1938,260.0,1.0,,1.0,56.0,97.0,1490,391,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,56.0,97.0,1490,391,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,254,1698,432,18,24.0,2.0,7.0,4.0,,6.0,3.0,6.0,,,,,53,401,4.234414,33,3.062917,0.000181,0.131622,0.001372,0.828944,3.5
2,0042269b,4136,1771669,421201,3005,101.837766,82.383766,4,13,18,19,1649,2291,731.611702,405,404,194.772727,805580,34.5,89.0,133.0,176.0,193.0,214.0,262.0,295.0,350.0,153886.0,-250.0,325.520435,3937.359025,1346027.0,-57.0,-40.0,-25.0,-7.0,0.0,13.0,39.0,86.0,196.0,1862.0,6.531318,71.786451,27007.0,28.0,0.167836,0.644564,694.0,175,3515,439.0,7.0,,,129.0,39.0,2904,552,439.0,21.0,23.0,17.0,,,6.0,6.0,,,,,,129.0,39.0,2899,552,439.0,21.0,23.0,17.0,,,11.0,6.0,,,,,,175,3257,615,23,26.0,10.0,23.0,,,1.0,,2.0,4.0,,,,47,639,5.344288,25,3.372135,0.000228,0.097679,0.002335,0.759751,6.0
3,0059420b,1556,1404469,189596,806,121.848329,113.768226,5,15,15,10,1048,1047,542.537275,207,206,103.618895,161231,16.0,39.0,61.0,82.0,108.5,130.0,150.0,165.0,187.0,101690.0,-516.0,754.648232,4242.152639,1173478.0,-8.8,3.0,46.2,99.0,131.0,196.0,353.2,594.4,1236.6,357.0,1.457878,9.920533,2267.0,1.0,0.181350,0.385432,282.0,99,1304,151.0,1.0,,1.0,18.0,68.0,1038,243,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,18.0,68.0,1038,243,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,99,1146,281,13,3.0,2.0,4.0,3.0,,,,,5.0,,,,18,255,4.537255,15,2.867940,0.000147,0.132391,0.001108,0.835531,2.0
4,0075873a,2531,1662472,313702,701,123.943896,62.082013,3,11,11,9,1197,1402,600.050968,253,252,125.082971,316585,32.0,48.0,61.0,86.0,113.0,144.0,189.0,219.0,228.0,110688.0,-158.0,502.094862,3896.209237,1270300.0,-46.0,-20.0,0.0,0.0,36.0,73.0,105.0,160.0,362.1,643.0,2.803953,24.251326,7094.0,1.0,0.168379,0.374277,426.0,72,1942,517.0,,,,33.0,39.0,1541,324,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,33.0,39.0,1541,324,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,72,1964,397,32,25.0,,12.0,25.0,,,2.0,,2.0,,,,66,431,4.556845,14,2.783927,0.000152,0.099565,0.001522,0.764103,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,1791649,499670,3323,105.437856,63.622575,4,13,13,13,1484,1634,774.000633,462,461,256.353661,1214860,70.0,133.0,190.0,253.0,297.0,315.0,328.0,350.4,394.0,128570.0,-117.0,267.942592,3458.823757,1269512.0,-41.0,-23.0,-8.0,0.0,0.0,23.0,52.0,83.0,187.0,1262.0,1.990713,27.213406,9432.0,170.0,0.204095,2.495495,967.0,189,3588,960.0,2.0,,,24.0,164.0,2844,651,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,24.0,164.0,2844,651,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,189,3605,813,59,42.0,3.0,11.0,8.0,,1.0,,6.0,,,2.0,,88,741,4.869096,20,2.977718,0.000257,0.097278,0.002645,0.708572,3.5
2467,ffbef7e5,2604,1799174,214221,1144,82.266129,36.178818,4,11,11,8,1808,1877,1022.078725,439,438,223.013057,580726,41.0,90.6,135.0,180.0,227.5,275.8,316.0,354.0,392.0,267869.0,-64.0,600.545909,5630.628933,1563221.0,59.0,91.0,117.0,141.0,172.0,214.2,264.0,339.0,539.8,1124.0,2.820207,35.614696,7341.0,1.0,0.177488,0.382154,462.0,148,2395,60.0,1.0,,,36.0,106.0,1874,447,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,36.0,106.0,1874,447,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,148,1920,457,33,24.0,1.0,12.0,9.0,,,,,,,,,63,473,4.059197,13,2.221528,0.000243,0.168203,0.001447,0.868855,4.0
2468,ffccd6fd,3063,1959363,231580,564,75.605615,63.494975,3,11,11,6,2759,2761,1309.085537,202,201,157.589292,482696,53.0,88.0,141.0,181.0,201.0,201.0,201.0,201.0,201.0,229804.0,-87.0,556.597322,5398.118769,1704301.0,0.0,0.0,0.0,0.0,116.0,153.0,195.0,311.0,613.4,427.0,1.344546,8.509608,4117.0,1.0,0.073481,0.260968,225.0,126,2849,88.0,,,,9.0,,969,1861,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,9.0,,969,1861,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,126,1031,1879,6,3.0,,18.0,,,,,,,,,,7,232,4.443966,15,2.693600,0.000103,0.065622,0.001563,0.869824,1.5
2469,ffec5b38,3242,1508504,289439,1388,89.277915,54.515788,3,15,15,13,2106,2133,1192.640962,414,413,205.917027,667583,42.0,80.0,124.0,163.4,205.0,252.0,283.0,326.0,370.0,127733.0,-132.0,370.003085,3462.066161,1199180.0,-20.0,0.0,32.0,57.0,89.0,118.0,151.0,213.0,396.0,563.0,1.814563,16.147617,5881.0,9.0,0.163838,0.401387,531.0,71,2895,276.0,,,,14.0,52.0,2361,457,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,14.0,52.0,2361,457,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,71,2593,490,34,29.0,4.0,8.0,4.0,2.0,1.0,,6.0,,,,,70,512,5.169922,24,3.419903,0.000274,0.127390,0.002149,0.794947,5.0


(2471, 115)
id
event_id_max
up_time_max
action_time_sum
action_time_max
action_time_mean
action_time_std
activity_nunique
down_event_nunique
up_event_nunique
text_change_nunique
cursor_position_nunique
cursor_position_max
cursor_position_mean
word_count_nunique
word_count_max
word_count_mean
word_count_sum
word_count_q1
word_count_q2
word_count_q3
word_count_q4
word_count_q5
word_count_q6
word_count_q7
word_count_q8
word_count_q9
action_time_gap_max
action_time_gap_min
action_time_gap_mean
action_time_gap_std
action_time_gap_sum
action_time_gap_q1
action_time_gap_q2
action_time_gap_q3
action_time_gap_q4
action_time_gap_q5
action_time_gap_q6
action_time_gap_q7
action_time_gap_q8
action_time_gap_q9
cursor_position_change_max
cursor_position_change_mean
cursor_position_change_std
cursor_position_change_sum
word_count_change_max
word_count_change_mean
word_count_change_std
word_count_change_sum
activity_Nonproduction_cnt
activity_Input_cnt
activity_Remove/Cut_cnt
activity_Replace_cnt
activ

Unnamed: 0,id,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,cursor_position_nunique,cursor_position_max,cursor_position_mean,word_count_nunique,word_count_max,word_count_mean,word_count_sum,word_count_q1,word_count_q2,word_count_q3,word_count_q4,word_count_q5,word_count_q6,word_count_q7,word_count_q8,word_count_q9,action_time_gap_max,action_time_gap_min,action_time_gap_mean,action_time_gap_std,action_time_gap_sum,action_time_gap_q1,action_time_gap_q2,action_time_gap_q3,action_time_gap_q4,action_time_gap_q5,action_time_gap_q6,action_time_gap_q7,action_time_gap_q8,action_time_gap_q9,cursor_position_change_max,cursor_position_change_mean,cursor_position_change_std,cursor_position_change_sum,word_count_change_max,word_count_change_mean,word_count_change_std,word_count_change_sum,activity_Input_cnt,down_event_Space_cnt,down_event_q_cnt,up_event_Space_cnt,up_event_q_cnt,text_change_ _cnt,text_change_q_cnt,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio,activity_Nonproduction_cnt,activity_Remove/Cut_cnt,activity_Replace_cnt,activity_others_cnt,activity_Paste_cnt,down_event_Leftclick_cnt,down_event_Shift_cnt,down_event_Backspace_cnt,down_event_._cnt,"down_event_,_cnt",down_event_Enter_cnt,down_event_ArrowLeft_cnt,down_event_'_cnt,down_event_others_cnt,down_event_ArrowRight_cnt,down_event_ArrowUp_cnt,down_event_ArrowDown_cnt,down_event_CapsLock_cnt,down_event_Delete_cnt,down_event_Unidentified_cnt,up_event_Leftclick_cnt,up_event_Shift_cnt,up_event_Backspace_cnt,up_event_._cnt,"up_event_,_cnt",up_event_Enter_cnt,up_event_ArrowLeft_cnt,up_event_'_cnt,up_event_others_cnt,up_event_ArrowRight_cnt,up_event_ArrowUp_cnt,up_event_ArrowDown_cnt,up_event_CapsLock_cnt,up_event_Delete_cnt,up_event_Unidentified_cnt,text_change_NoChange_cnt,text_change_._cnt,"text_change_,_cnt",text_change_others_cnt,text_change_\n_cnt,text_change_'_cnt,text_change_;_cnt,text_change_-_cnt,text_change_?_cnt,"text_change_""_cnt",text_change_=_cnt,text_change_/_cnt,text_change_:_cnt,text_change_\_cnt
0,0000aaaa,2,760160,172,87,86.0,1.414214,1,1,1,1,2,1,0.5,1,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,421555.0,421555.0,421555.0,,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,2.0,,2.0,,2.0,,0,0,0.0,0,0.0,0.0,0.0,3e-06,0.554561,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2222bbbb,2,712023,113,67,56.5,14.849242,1,1,1,1,2,1,0.5,1,1,1.0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-421521.0,-421521.0,-421521.0,,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,,2.0,,2.0,,2.0,0,1,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4444cccc,2,635641,150,94,75.0,26.870058,1,2,2,2,2,1,0.5,2,1,0.5,1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,-450645.0,-450645.0,-450645.0,,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,1.0,1.0,,1.0,1.0,1.0,,1.0,2,1.0,1.0,1.0,1.0,1.0,1.0,0,1,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


(3, 114)
Index(['id', 'event_id_max', 'up_time_max', 'action_time_sum',
       'action_time_max', 'action_time_mean', 'action_time_std',
       'activity_nunique', 'down_event_nunique', 'up_event_nunique',
       ...
       'text_change_\n_cnt', 'text_change_'_cnt', 'text_change_;_cnt',
       'text_change_-_cnt', 'text_change_?_cnt', 'text_change_"_cnt',
       'text_change_=_cnt', 'text_change_/_cnt', 'text_change_:_cnt',
       'text_change_\_cnt'],
      dtype='object', length=114)
id
event_id_max
up_time_max
action_time_sum
action_time_max
action_time_mean
action_time_std
activity_nunique
down_event_nunique
up_event_nunique
text_change_nunique
cursor_position_nunique
cursor_position_max
cursor_position_mean
word_count_nunique
word_count_max
word_count_mean
word_count_sum
word_count_q1
word_count_q2
word_count_q3
word_count_q4
word_count_q5
word_count_q6
word_count_q7
word_count_q8
word_count_q9
action_time_gap_max
action_time_gap_min
action_time_gap_mean
action_time_gap_std
action

In [17]:
# in order to use data in model , change colname
col_map_org_to_new = {}
col_map_new_to_org = {}
for i,col in enumerate(train_feats.columns):
    if col not in [exp_key_column , obj_column]:
        col_map_org_to_new[col] = f"col_{i}"
        col_map_new_to_org[f"col_{i}"] = col

In [18]:
train_feats = train_feats.rename(columns= col_map_org_to_new)
test_feats = test_feats.rename(columns= col_map_org_to_new)

In [19]:
train_feats

Unnamed: 0,id,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,col_40,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_49,col_50,col_51,col_52,col_53,col_54,col_55,col_56,col_57,col_58,col_59,col_60,col_61,col_62,col_63,col_64,col_65,col_66,col_67,col_68,col_69,col_70,col_71,col_72,col_73,col_74,col_75,col_76,col_77,col_78,col_79,col_80,col_81,col_82,col_83,col_84,col_85,col_86,col_87,col_88,col_89,col_90,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,col_100,col_101,col_102,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,col_112,col_113,score
0,001519c8,2557,1801969,297243,2259,116.246774,91.797374,7,12,12,17,1469,1539,711.163473,257,256,128.116152,327593,21.0,43.0,71.0,106.0,132.0,160.0,170.0,208.0,237.0,154136.0,-142.0,586.932707,4294.022274,1500200.0,-52.0,-22.0,-3.0,22.0,51.0,70.0,102.0,233.0,707.0,1350.0,4.159624,43.180116,10632.0,2.0,0.172535,0.381013,441.0,120,2010,417.0,7.0,3.0,,92.0,27.0,1619,357,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,92.0,27.0,1619,357,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,120,1940,436,28,14.0,9.0,4.0,5.0,1.0,,,,,,,,37,377,5.169761,20,3.346931,0.000142,0.100117,0.001419,0.832534,3.5
1,0022f953,2454,1788969,275391,1758,112.221271,55.431189,5,17,17,12,1416,1676,776.205786,324,323,182.714751,448382,34.3,81.0,124.0,162.0,186.0,221.0,252.0,284.0,315.0,145899.0,-166.0,604.547493,4897.303641,1482955.0,-34.8,-13.0,21.0,32.0,52.0,74.0,103.0,185.0,547.8,1581.0,9.819405,84.785626,24087.0,1.0,0.170404,0.376064,418.0,254,1938,260.0,1.0,,1.0,56.0,97.0,1490,391,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,56.0,97.0,1490,391,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,254,1698,432,18,24.0,2.0,7.0,4.0,,6.0,3.0,6.0,,,,,53,401,4.234414,33,3.062917,0.000181,0.131622,0.001372,0.828944,3.5
2,0042269b,4136,1771669,421201,3005,101.837766,82.383766,4,13,18,19,1649,2291,731.611702,405,404,194.772727,805580,34.5,89.0,133.0,176.0,193.0,214.0,262.0,295.0,350.0,153886.0,-250.0,325.520435,3937.359025,1346027.0,-57.0,-40.0,-25.0,-7.0,0.0,13.0,39.0,86.0,196.0,1862.0,6.531318,71.786451,27007.0,28.0,0.167836,0.644564,694.0,175,3515,439.0,7.0,,,129.0,39.0,2904,552,439.0,21.0,23.0,17.0,,,6.0,6.0,,,,,,129.0,39.0,2899,552,439.0,21.0,23.0,17.0,,,11.0,6.0,,,,,,175,3257,615,23,26.0,10.0,23.0,,,1.0,,2.0,4.0,,,,47,639,5.344288,25,3.372135,0.000228,0.097679,0.002335,0.759751,6.0
3,0059420b,1556,1404469,189596,806,121.848329,113.768226,5,15,15,10,1048,1047,542.537275,207,206,103.618895,161231,16.0,39.0,61.0,82.0,108.5,130.0,150.0,165.0,187.0,101690.0,-516.0,754.648232,4242.152639,1173478.0,-8.8,3.0,46.2,99.0,131.0,196.0,353.2,594.4,1236.6,357.0,1.457878,9.920533,2267.0,1.0,0.181350,0.385432,282.0,99,1304,151.0,1.0,,1.0,18.0,68.0,1038,243,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,18.0,68.0,1038,243,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,99,1146,281,13,3.0,2.0,4.0,3.0,,,,,5.0,,,,18,255,4.537255,15,2.867940,0.000147,0.132391,0.001108,0.835531,2.0
4,0075873a,2531,1662472,313702,701,123.943896,62.082013,3,11,11,9,1197,1402,600.050968,253,252,125.082971,316585,32.0,48.0,61.0,86.0,113.0,144.0,189.0,219.0,228.0,110688.0,-158.0,502.094862,3896.209237,1270300.0,-46.0,-20.0,0.0,0.0,36.0,73.0,105.0,160.0,362.1,643.0,2.803953,24.251326,7094.0,1.0,0.168379,0.374277,426.0,72,1942,517.0,,,,33.0,39.0,1541,324,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,33.0,39.0,1541,324,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,72,1964,397,32,25.0,,12.0,25.0,,,2.0,,2.0,,,,66,431,4.556845,14,2.783927,0.000152,0.099565,0.001522,0.764103,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,1791649,499670,3323,105.437856,63.622575,4,13,13,13,1484,1634,774.000633,462,461,256.353661,1214860,70.0,133.0,190.0,253.0,297.0,315.0,328.0,350.4,394.0,128570.0,-117.0,267.942592,3458.823757,1269512.0,-41.0,-23.0,-8.0,0.0,0.0,23.0,52.0,83.0,187.0,1262.0,1.990713,27.213406,9432.0,170.0,0.204095,2.495495,967.0,189,3588,960.0,2.0,,,24.0,164.0,2844,651,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,24.0,164.0,2844,651,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,189,3605,813,59,42.0,3.0,11.0,8.0,,1.0,,6.0,,,2.0,,88,741,4.869096,20,2.977718,0.000257,0.097278,0.002645,0.708572,3.5
2467,ffbef7e5,2604,1799174,214221,1144,82.266129,36.178818,4,11,11,8,1808,1877,1022.078725,439,438,223.013057,580726,41.0,90.6,135.0,180.0,227.5,275.8,316.0,354.0,392.0,267869.0,-64.0,600.545909,5630.628933,1563221.0,59.0,91.0,117.0,141.0,172.0,214.2,264.0,339.0,539.8,1124.0,2.820207,35.614696,7341.0,1.0,0.177488,0.382154,462.0,148,2395,60.0,1.0,,,36.0,106.0,1874,447,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,36.0,106.0,1874,447,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,148,1920,457,33,24.0,1.0,12.0,9.0,,,,,,,,,63,473,4.059197,13,2.221528,0.000243,0.168203,0.001447,0.868855,4.0
2468,ffccd6fd,3063,1959363,231580,564,75.605615,63.494975,3,11,11,6,2759,2761,1309.085537,202,201,157.589292,482696,53.0,88.0,141.0,181.0,201.0,201.0,201.0,201.0,201.0,229804.0,-87.0,556.597322,5398.118769,1704301.0,0.0,0.0,0.0,0.0,116.0,153.0,195.0,311.0,613.4,427.0,1.344546,8.509608,4117.0,1.0,0.073481,0.260968,225.0,126,2849,88.0,,,,9.0,,969,1861,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,9.0,,969,1861,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,126,1031,1879,6,3.0,,18.0,,,,,,,,,,7,232,4.443966,15,2.693600,0.000103,0.065622,0.001563,0.869824,1.5
2469,ffec5b38,3242,1508504,289439,1388,89.277915,54.515788,3,15,15,13,2106,2133,1192.640962,414,413,205.917027,667583,42.0,80.0,124.0,163.4,205.0,252.0,283.0,326.0,370.0,127733.0,-132.0,370.003085,3462.066161,1199180.0,-20.0,0.0,32.0,57.0,89.0,118.0,151.0,213.0,396.0,563.0,1.814563,16.147617,5881.0,9.0,0.163838,0.401387,531.0,71,2895,276.0,,,,14.0,52.0,2361,457,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,14.0,52.0,2361,457,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,71,2593,490,34,29.0,4.0,8.0,4.0,2.0,1.0,,6.0,,,,,70,512,5.169922,24,3.419903,0.000274,0.127390,0.002149,0.794947,5.0


In [20]:
test_feats

Unnamed: 0,id,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,col_40,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_50,col_58,col_57,col_75,col_74,col_91,col_90,col_105,col_106,col_107,col_108,col_109,col_110,col_111,col_112,col_113,col_49,col_51,col_52,col_53,col_54,col_55,col_56,col_59,col_60,col_61,col_62,col_63,col_64,col_65,col_66,col_67,col_68,col_69,col_70,col_71,col_72,col_73,col_76,col_77,col_78,col_79,col_80,col_81,col_82,col_83,col_84,col_85,col_86,col_87,col_88,col_89,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,col_100,col_101,col_102,col_103,col_104
0,0000aaaa,2,760160,172,87,86.0,1.414214,1,1,1,1,2,1,0.5,1,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,421555.0,421555.0,421555.0,,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,2.0,,2.0,,2.0,,0,0,0.0,0,0.0,0.0,0.0,3e-06,0.554561,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2222bbbb,2,712023,113,67,56.5,14.849242,1,1,1,1,2,1,0.5,1,1,1.0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-421521.0,-421521.0,-421521.0,,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,,2.0,,2.0,,2.0,0,1,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4444cccc,2,635641,150,94,75.0,26.870058,1,2,2,2,2,1,0.5,2,1,0.5,1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,-450645.0,-450645.0,-450645.0,,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,1.0,1.0,,1.0,1.0,1.0,,1.0,2,1.0,1.0,1.0,1.0,1.0,1.0,0,1,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# 2. Model

## 2-1. prepartation

### 2-1-1. package import

In [21]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

### 2-1-2 . data split into X and Y

In [22]:
Y = train_feats[obj_column]
X = train_feats.drop([exp_key_column , obj_column] , axis=1)

### 2-1-3. metric function& predict function

In [23]:
metric = "rsme"

In [24]:
def classify_predict(predict):
    res = 0
    if predict <= 0.25:
        res = 0
    elif 0.25 < predict and predict <= 0.75:
        res = 0.5
    elif 0.75 < predict and predict <= 1.25:
        res = 1.0
    elif 1.25 < predict and predict <= 1.75:
        res = 1.5
    elif 1.75 < predict and predict <= 2.25:
        res = 2.0
    elif 2.25 < predict and predict <= 2.75:
        res = 2.5
    elif 2.75 < predict and predict <= 3.25:
        res = 3.0
    elif 3.25 < predict and predict <= 3.75:
        res = 3.5
    elif 3.75 < predict and predict <= 4.25:
        res = 4.0
    elif 4.25 < predict and predict <= 4.75:
        res = 4.5
    elif 4.75 < predict and predict <= 5.25:
        res = 5.0
    elif 5.25 < predict and predict <= 5.75:
        res = 5.5
    else:
        res = 6.0
    return res
        

In [25]:
classify_predict(4.8)

5.0

In [26]:
def classify_predict_vectorize(lis):
    res = np.vectorize(classify_predict)(lis)
    return res

## 2-2. AutoML

### 2-2-1. TPOT

In [27]:
# !pip install TPOT

In [28]:
# from tpot import TPOTRegressor

In [29]:
# tpot = TPOTRegressor(scoring='neg_mean_absolute_error',
#                      max_time_mins = 100,
#                      generations=10,
#                      population_size=2,
#                      random_state=42,
#                      verbosity=2,
#                      n_jobs=-1,
#                      memory = "auto"
#                     )
# tpot.fit(X,Y)
# tpot.fitted_pipeline_

In [30]:
# tpot

In [31]:
# def calculate_scores(true, pred):
#     """全ての評価指標を計算する

#     Parameters
#     ----------
#     true (np.array)       : 実測値
#     pred (np.array)       : 予測値

#     Returns
#     -------
#     scores (pd.DataFrame) : 各評価指標を纏めた結果

#     """
#     scores = {}
#     scores = np.sqrt(mean_squared_error(true, pred))
#     return scores

# scores = calculate_scores(Y, tpot.predict(X))
# print(scores)

In [32]:
# tpot_dict = {}
# tpot_dict["model"] = tpot.fitted_pipeline_

### 2-2-2. pycaret

In [33]:
# !pip install pycaret==2.0
# # !pip install pycaret--no-deps

In [34]:
# from pycaret.regression import *

In [35]:
# data_pycaret = X.join(Y)
# exp = setup(data = data_pycaret, target = obj_column,train_size=0.7,data_split_shuffle=True,session_id=2)
# compare_models()

## 2-2. study by Optuna
find best parameters of each models

In [36]:
models = {}
n_trials = 10
n_splits = 10

### 2-2-1. lgb

In [37]:
def lgb_objective(trial,data=X,target=Y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    param = {
        'metric': 'rmse', 
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 1000,30000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_int('max_depth', 1 , 100),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = lgb.LGBMRegressor(**param)  
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    preds = classify_predict_vectorize(preds)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [38]:
lgb_dict = {}
lgb_study = optuna.create_study(direction='minimize')
lgb_study.optimize(lgb_objective, n_trials=n_trials)
print('Number of finished trials:', len(lgb_study.trials))
print('Best trial:', lgb_study.best_trial.params)
lgb_param = lgb_study.best_params 
lgb_dict["model"] = lgb.LGBMRegressor(**lgb_param)  

[I 2023-10-15 12:01:20,610] A new study created in memory with name: no-name-563344ab-b64e-4a57-9bea-883713b94c6e
[I 2023-10-15 12:01:24,952] Trial 0 finished with value: 0.5958611457802103 and parameters: {'n_estimators': 14846, 'reg_alpha': 2.159009812627497, 'reg_lambda': 0.06267290133873017, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 60, 'num_leaves': 136, 'min_child_samples': 75, 'min_data_per_groups': 4}. Best is trial 0 with value: 0.5958611457802103.
[I 2023-10-15 12:01:47,183] Trial 1 finished with value: 0.6063635606439157 and parameters: {'n_estimators': 1109, 'reg_alpha': 0.0016806174790213136, 'reg_lambda': 0.4176570454211782, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.017, 'max_depth': 50, 'num_leaves': 529, 'min_child_samples': 9, 'min_data_per_groups': 88}. Best is trial 0 with value: 0.5958611457802103.
[I 2023-10-15 12:01:49,387] Trial 2 finished with value: 0.5958611457802103 and parameters: {'n_estimators': 2646

Number of finished trials: 10
Best trial: {'n_estimators': 26445, 'reg_alpha': 0.08281031175927213, 'reg_lambda': 0.004897235767472211, 'colsample_bytree': 0.7, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 74, 'num_leaves': 295, 'min_child_samples': 108, 'min_data_per_groups': 43}


### 2-2-2. xgb

In [39]:
def xgb_objective(trial,data=X,target=Y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    preds = classify_predict_vectorize(preds)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [40]:
xgb_dict = {}
xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(xgb_objective, n_trials=n_trials)
print('Number of finished trials:', len(xgb_study.trials))
print('Best trial:', xgb_study.best_trial.params)
xgb_params=xgb_study.best_params 
xgb_dict["model"] = XGBRegressor(**xgb_params)

[I 2023-10-15 12:02:31,465] A new study created in memory with name: no-name-bf00d52a-54ec-4fe0-9b73-609a37640ec4
[I 2023-10-15 12:02:33,815] Trial 0 finished with value: 0.6617241025372945 and parameters: {'max_depth': 9, 'learning_rate': 0.38895736449771984, 'n_estimators': 772, 'min_child_weight': 1, 'gamma': 0.9205509738431664, 'subsample': 0.8743591558792854, 'colsample_bytree': 0.31008802029514376, 'reg_alpha': 0.8905703909278885, 'reg_lambda': 0.3128264087816382, 'random_state': 672}. Best is trial 0 with value: 0.6617241025372945.
[I 2023-10-15 12:02:34,827] Trial 1 finished with value: 0.6134024993938534 and parameters: {'max_depth': 7, 'learning_rate': 0.10684836086877675, 'n_estimators': 878, 'min_child_weight': 1, 'gamma': 0.8310268611487704, 'subsample': 0.330899664590287, 'colsample_bytree': 0.12501810095639546, 'reg_alpha': 0.49011906137512795, 'reg_lambda': 0.46270067977546686, 'random_state': 279}. Best is trial 1 with value: 0.6134024993938534.
[I 2023-10-15 12:02:38,

Number of finished trials: 10
Best trial: {'max_depth': 1, 'learning_rate': 0.11901826660601969, 'n_estimators': 865, 'min_child_weight': 1, 'gamma': 0.09415631864283998, 'subsample': 0.6677796817397793, 'colsample_bytree': 0.667828841165899, 'reg_alpha': 0.8202487758136123, 'reg_lambda': 0.862356894171387, 'random_state': 212}


### 2-2-3. randomforest

In [41]:
# def rf_objective(trial,data=X,target=Y):
    
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
#     param = {
#         "max_depth": trial.suggest_int('max_depth', 2, 1000),
#         "max_leaf_nodes": trial.suggest_int('max_leaf_nodes', 2,1000),
#     }
#     model = HistGradientBoostingRegressor(**param)  
    
#     model.fit(train_x,train_y)
    
#     preds = model.predict(test_x)
#     preds = classify_predict_vectorize(preds)
    
#     rmse = mean_squared_error(test_y, preds,squared=False)
    
#     return rmse

In [42]:
# rf_dict = {}
# rf_study = optuna.create_study(direction='minimize')
# rf_study.optimize(rf_objective, n_trials=n_trials)
# print('Number of finished trials:', len(rf_study.trials))
# print('Best trial:', rf_study.best_trial.params)
# rf_params=rf_study.best_params 
# rf_dict["model"] = HistGradientBoostingRegressor(**rf_params)

In [43]:
models["lgb"] = lgb_dict
models["xgb"] = xgb_dict
# models["tpot"] = tpot_dict
# models["rf"] = rf_dict

## 2-3. train lGBM

In [44]:
feature_names = list(filter(lambda x: x not in [obj_column, exp_key_column], train_feats.columns))

In [45]:
train_feats

Unnamed: 0,id,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,col_40,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_49,col_50,col_51,col_52,col_53,col_54,col_55,col_56,col_57,col_58,col_59,col_60,col_61,col_62,col_63,col_64,col_65,col_66,col_67,col_68,col_69,col_70,col_71,col_72,col_73,col_74,col_75,col_76,col_77,col_78,col_79,col_80,col_81,col_82,col_83,col_84,col_85,col_86,col_87,col_88,col_89,col_90,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,col_100,col_101,col_102,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,col_112,col_113,score
0,001519c8,2557,1801969,297243,2259,116.246774,91.797374,7,12,12,17,1469,1539,711.163473,257,256,128.116152,327593,21.0,43.0,71.0,106.0,132.0,160.0,170.0,208.0,237.0,154136.0,-142.0,586.932707,4294.022274,1500200.0,-52.0,-22.0,-3.0,22.0,51.0,70.0,102.0,233.0,707.0,1350.0,4.159624,43.180116,10632.0,2.0,0.172535,0.381013,441.0,120,2010,417.0,7.0,3.0,,92.0,27.0,1619,357,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,92.0,27.0,1619,357,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,120,1940,436,28,14.0,9.0,4.0,5.0,1.0,,,,,,,,37,377,5.169761,20,3.346931,0.000142,0.100117,0.001419,0.832534,3.5
1,0022f953,2454,1788969,275391,1758,112.221271,55.431189,5,17,17,12,1416,1676,776.205786,324,323,182.714751,448382,34.3,81.0,124.0,162.0,186.0,221.0,252.0,284.0,315.0,145899.0,-166.0,604.547493,4897.303641,1482955.0,-34.8,-13.0,21.0,32.0,52.0,74.0,103.0,185.0,547.8,1581.0,9.819405,84.785626,24087.0,1.0,0.170404,0.376064,418.0,254,1938,260.0,1.0,,1.0,56.0,97.0,1490,391,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,56.0,97.0,1490,391,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,254,1698,432,18,24.0,2.0,7.0,4.0,,6.0,3.0,6.0,,,,,53,401,4.234414,33,3.062917,0.000181,0.131622,0.001372,0.828944,3.5
2,0042269b,4136,1771669,421201,3005,101.837766,82.383766,4,13,18,19,1649,2291,731.611702,405,404,194.772727,805580,34.5,89.0,133.0,176.0,193.0,214.0,262.0,295.0,350.0,153886.0,-250.0,325.520435,3937.359025,1346027.0,-57.0,-40.0,-25.0,-7.0,0.0,13.0,39.0,86.0,196.0,1862.0,6.531318,71.786451,27007.0,28.0,0.167836,0.644564,694.0,175,3515,439.0,7.0,,,129.0,39.0,2904,552,439.0,21.0,23.0,17.0,,,6.0,6.0,,,,,,129.0,39.0,2899,552,439.0,21.0,23.0,17.0,,,11.0,6.0,,,,,,175,3257,615,23,26.0,10.0,23.0,,,1.0,,2.0,4.0,,,,47,639,5.344288,25,3.372135,0.000228,0.097679,0.002335,0.759751,6.0
3,0059420b,1556,1404469,189596,806,121.848329,113.768226,5,15,15,10,1048,1047,542.537275,207,206,103.618895,161231,16.0,39.0,61.0,82.0,108.5,130.0,150.0,165.0,187.0,101690.0,-516.0,754.648232,4242.152639,1173478.0,-8.8,3.0,46.2,99.0,131.0,196.0,353.2,594.4,1236.6,357.0,1.457878,9.920533,2267.0,1.0,0.181350,0.385432,282.0,99,1304,151.0,1.0,,1.0,18.0,68.0,1038,243,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,18.0,68.0,1038,243,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,99,1146,281,13,3.0,2.0,4.0,3.0,,,,,5.0,,,,18,255,4.537255,15,2.867940,0.000147,0.132391,0.001108,0.835531,2.0
4,0075873a,2531,1662472,313702,701,123.943896,62.082013,3,11,11,9,1197,1402,600.050968,253,252,125.082971,316585,32.0,48.0,61.0,86.0,113.0,144.0,189.0,219.0,228.0,110688.0,-158.0,502.094862,3896.209237,1270300.0,-46.0,-20.0,0.0,0.0,36.0,73.0,105.0,160.0,362.1,643.0,2.803953,24.251326,7094.0,1.0,0.168379,0.374277,426.0,72,1942,517.0,,,,33.0,39.0,1541,324,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,33.0,39.0,1541,324,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,72,1964,397,32,25.0,,12.0,25.0,,,2.0,,2.0,,,,66,431,4.556845,14,2.783927,0.000152,0.099565,0.001522,0.764103,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,1791649,499670,3323,105.437856,63.622575,4,13,13,13,1484,1634,774.000633,462,461,256.353661,1214860,70.0,133.0,190.0,253.0,297.0,315.0,328.0,350.4,394.0,128570.0,-117.0,267.942592,3458.823757,1269512.0,-41.0,-23.0,-8.0,0.0,0.0,23.0,52.0,83.0,187.0,1262.0,1.990713,27.213406,9432.0,170.0,0.204095,2.495495,967.0,189,3588,960.0,2.0,,,24.0,164.0,2844,651,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,24.0,164.0,2844,651,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,189,3605,813,59,42.0,3.0,11.0,8.0,,1.0,,6.0,,,2.0,,88,741,4.869096,20,2.977718,0.000257,0.097278,0.002645,0.708572,3.5
2467,ffbef7e5,2604,1799174,214221,1144,82.266129,36.178818,4,11,11,8,1808,1877,1022.078725,439,438,223.013057,580726,41.0,90.6,135.0,180.0,227.5,275.8,316.0,354.0,392.0,267869.0,-64.0,600.545909,5630.628933,1563221.0,59.0,91.0,117.0,141.0,172.0,214.2,264.0,339.0,539.8,1124.0,2.820207,35.614696,7341.0,1.0,0.177488,0.382154,462.0,148,2395,60.0,1.0,,,36.0,106.0,1874,447,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,36.0,106.0,1874,447,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,148,1920,457,33,24.0,1.0,12.0,9.0,,,,,,,,,63,473,4.059197,13,2.221528,0.000243,0.168203,0.001447,0.868855,4.0
2468,ffccd6fd,3063,1959363,231580,564,75.605615,63.494975,3,11,11,6,2759,2761,1309.085537,202,201,157.589292,482696,53.0,88.0,141.0,181.0,201.0,201.0,201.0,201.0,201.0,229804.0,-87.0,556.597322,5398.118769,1704301.0,0.0,0.0,0.0,0.0,116.0,153.0,195.0,311.0,613.4,427.0,1.344546,8.509608,4117.0,1.0,0.073481,0.260968,225.0,126,2849,88.0,,,,9.0,,969,1861,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,9.0,,969,1861,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,126,1031,1879,6,3.0,,18.0,,,,,,,,,,7,232,4.443966,15,2.693600,0.000103,0.065622,0.001563,0.869824,1.5
2469,ffec5b38,3242,1508504,289439,1388,89.277915,54.515788,3,15,15,13,2106,2133,1192.640962,414,413,205.917027,667583,42.0,80.0,124.0,163.4,205.0,252.0,283.0,326.0,370.0,127733.0,-132.0,370.003085,3462.066161,1199180.0,-20.0,0.0,32.0,57.0,89.0,118.0,151.0,213.0,396.0,563.0,1.814563,16.147617,5881.0,9.0,0.163838,0.401387,531.0,71,2895,276.0,,,,14.0,52.0,2361,457,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,14.0,52.0,2361,457,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,71,2593,490,34,29.0,4.0,8.0,4.0,2.0,1.0,,6.0,,,,,70,512,5.169922,24,3.419903,0.000274,0.127390,0.002149,0.794947,5.0


In [46]:
test_feats

Unnamed: 0,id,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,col_39,col_40,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_50,col_58,col_57,col_75,col_74,col_91,col_90,col_105,col_106,col_107,col_108,col_109,col_110,col_111,col_112,col_113,col_49,col_51,col_52,col_53,col_54,col_55,col_56,col_59,col_60,col_61,col_62,col_63,col_64,col_65,col_66,col_67,col_68,col_69,col_70,col_71,col_72,col_73,col_76,col_77,col_78,col_79,col_80,col_81,col_82,col_83,col_84,col_85,col_86,col_87,col_88,col_89,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,col_100,col_101,col_102,col_103,col_104
0,0000aaaa,2,760160,172,87,86.0,1.414214,1,1,1,1,2,1,0.5,1,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,421555.0,421555.0,421555.0,,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,2.0,,2.0,,2.0,,0,0,0.0,0,0.0,0.0,0.0,3e-06,0.554561,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2222bbbb,2,712023,113,67,56.5,14.849242,1,1,1,1,2,1,0.5,1,1,1.0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-421521.0,-421521.0,-421521.0,,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,1.0,1.0,,1.0,0.0,0.0,,0.0,2,,2.0,,2.0,,2.0,0,1,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4444cccc,2,635641,150,94,75.0,26.870058,1,2,2,2,2,1,0.5,2,1,0.5,1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,-450645.0,-450645.0,-450645.0,,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,1.0,1.0,,1.0,1.0,1.0,,1.0,2,1.0,1.0,1.0,1.0,1.0,1.0,0,1,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
feature_names

['col_1',
 'col_2',
 'col_3',
 'col_4',
 'col_5',
 'col_6',
 'col_7',
 'col_8',
 'col_9',
 'col_10',
 'col_11',
 'col_12',
 'col_13',
 'col_14',
 'col_15',
 'col_16',
 'col_17',
 'col_18',
 'col_19',
 'col_20',
 'col_21',
 'col_22',
 'col_23',
 'col_24',
 'col_25',
 'col_26',
 'col_27',
 'col_28',
 'col_29',
 'col_30',
 'col_31',
 'col_32',
 'col_33',
 'col_34',
 'col_35',
 'col_36',
 'col_37',
 'col_38',
 'col_39',
 'col_40',
 'col_41',
 'col_42',
 'col_43',
 'col_44',
 'col_45',
 'col_46',
 'col_47',
 'col_48',
 'col_49',
 'col_50',
 'col_51',
 'col_52',
 'col_53',
 'col_54',
 'col_55',
 'col_56',
 'col_57',
 'col_58',
 'col_59',
 'col_60',
 'col_61',
 'col_62',
 'col_63',
 'col_64',
 'col_65',
 'col_66',
 'col_67',
 'col_68',
 'col_69',
 'col_70',
 'col_71',
 'col_72',
 'col_73',
 'col_74',
 'col_75',
 'col_76',
 'col_77',
 'col_78',
 'col_79',
 'col_80',
 'col_81',
 'col_82',
 'col_83',
 'col_84',
 'col_85',
 'col_86',
 'col_87',
 'col_88',
 'col_89',
 'col_90',
 'col_91',
 'col_92

In [48]:
train_feats[['col_49', 'col_51', 'col_52', 'col_53', 'col_54', 'col_55', 'col_56', 'col_59', 'col_60', 'col_61', 'col_62', 'col_63', 'col_64', 'col_65', 'col_66', 'col_67', 'col_68', 'col_69', 'col_70', 'col_71', 'col_72', 'col_73', 'col_76', 'col_77', 'col_78', 'col_79', 'col_80', 'col_81', 'col_82', 'col_83', 'col_84', 'col_85', 'col_86', 'col_87', 'col_88', 'col_89', 'col_92', 'col_93', 'col_94', 'col_95', 'col_96', 'col_97', 'col_98', 'col_99', 'col_100', 'col_101', 'col_102', 'col_103', 'col_104']]

Unnamed: 0,col_49,col_51,col_52,col_53,col_54,col_55,col_56,col_59,col_60,col_61,col_62,col_63,col_64,col_65,col_66,col_67,col_68,col_69,col_70,col_71,col_72,col_73,col_76,col_77,col_78,col_79,col_80,col_81,col_82,col_83,col_84,col_85,col_86,col_87,col_88,col_89,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,col_100,col_101,col_102,col_103,col_104
0,120,417.0,7.0,3.0,,92.0,27.0,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,92.0,27.0,417.0,21.0,12.0,4.0,2.0,3.0,1.0,2.0,,,,,,120,28,14.0,9.0,4.0,5.0,1.0,,,,,,,
1,254,260.0,1.0,,1.0,56.0,97.0,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,56.0,97.0,260.0,15.0,21.0,6.0,49.0,3.0,15.0,46.0,2.0,3.0,,,,254,18,24.0,2.0,7.0,4.0,,6.0,3.0,6.0,,,,
2,175,439.0,7.0,,,129.0,39.0,439.0,21.0,23.0,17.0,,,6.0,6.0,,,,,,129.0,39.0,439.0,21.0,23.0,17.0,,,11.0,6.0,,,,,,175,23,26.0,10.0,23.0,,,1.0,,2.0,4.0,,,
3,99,151.0,1.0,,1.0,18.0,68.0,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,18.0,68.0,152.0,13.0,3.0,3.0,,2.0,14.0,,,,2.0,,,99,13,3.0,2.0,4.0,3.0,,,,,5.0,,,
4,72,517.0,,,,33.0,39.0,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,33.0,39.0,517.0,23.0,24.0,10.0,,17.0,3.0,,,,,,,72,32,25.0,,12.0,25.0,,,2.0,,2.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,189,960.0,2.0,,,24.0,164.0,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,24.0,164.0,960.0,43.0,32.0,7.0,,7.0,7.0,,,,,,,189,59,42.0,3.0,11.0,8.0,,1.0,,6.0,,,2.0,
2467,148,60.0,1.0,,,36.0,106.0,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,36.0,106.0,60.0,31.0,24.0,12.0,,8.0,6.0,,,,,,,148,33,24.0,1.0,12.0,9.0,,,,,,,,
2468,126,88.0,,,,9.0,,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,9.0,,88.0,5.0,2.0,12.0,53.0,,,29.0,,29.0,6.0,,,126,6,3.0,,18.0,,,,,,,,,
2469,71,276.0,,,,14.0,52.0,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,14.0,52.0,276.0,31.0,27.0,6.0,,4.0,12.0,,,,2.0,,,71,34,29.0,4.0,8.0,4.0,2.0,1.0,,6.0,,,,


In [49]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for key, value in models.items():
        print(f"***********{key}***************")
        best_model = value["model"]
        oof = []
        prediction = test_feats[[exp_key_column]]
        prediction[obj_column] = 0
        df_importance_list = []
        for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_feats[feature_names])):
            X_train = train_feats.iloc[trn_idx][feature_names]
            Y_train = train_feats.iloc[trn_idx][obj_column]

            X_val = train_feats.iloc[val_idx][feature_names]
            Y_val = train_feats.iloc[val_idx][obj_column]

            print('\nFold_{} Training ================================\n'.format(fold_id+1))
            best_model = value["model"]

            ## we have to change because each models have different params
            if key == "xgb":
                model = best_model.fit(X_train,
                                      Y_train,
                                      # eval_names=['train', 'valid'],
                                      eval_set=[(X_train, Y_train), (X_val, Y_val)],
                                      verbose=20,
                                      # eval_metric= [metric],
                                      early_stopping_rounds=50)
                pred_val = model.predict(X_val)
                pred_test = model.predict(test_feats[feature_names])
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                df_oof[f"pred_{fold_id+1}"] = pred_val
                oof.append(df_oof)
                df_oof["pred_range"] = classify_predict_vectorize(pred_val)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))
                    
            elif key == "lgb":
                model = best_model.fit(X_train,Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=20,
                              eval_metric= metric,
                              early_stopping_rounds=50)
                pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
                pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                oof.append(df_oof)
                df_oof["pred_range"] = classify_predict_vectorize(pred_val)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))

            elif key == "rf":
                model = best_model.fit(X_train,Y_train)
                pred_val = model.predict(X_val)
                pred_test = model.predict(test_feats[feature_names])
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                oof.append(df_oof)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))
            elif key == "tpot":
                model = best_model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=20,
                          eval_metric= metric,
                          early_stopping_rounds=50)
                pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
                pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
                prediction[obj_column] += pred_test / kfold.n_splits
                df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
                df_oof['pred'] = pred_val
                oof.append(df_oof)
                df_oof["pred_range"] = classify_predict_vectorize(pred_val)
                if fold_id + 1 == n_splits:
                    display(df_oof.head(10))
                    display(prediction.head(10))
                
            models[key]["prediction"] = prediction
            if key in ["lgb","xgb"]:
                df_importance = pd.DataFrame({
                    'column': feature_names,
                    'importance': model.feature_importances_,
                })
                df_importance_list.append(df_importance)
            else:
                df_importance = pd.DataFrame([])
            del model, pred_val, pred_test
            gc.collect()
            df_oof = pd.concat(oof)
            rmse = mean_squared_error(df_oof[obj_column], np.clip(df_oof['pred'], a_min=0.5, a_max=6.0), squared=False)
            print('rmse:', rmse)
            models[key]["score"] = rmse

***********lgb***************


[20]	train's l2: 0.884231	valid's l2: 0.880368
[40]	train's l2: 0.760387	valid's l2: 0.761134
[60]	train's l2: 0.667454	valid's l2: 0.673699
[80]	train's l2: 0.597344	valid's l2: 0.608725
[100]	train's l2: 0.543657	valid's l2: 0.559528
[120]	train's l2: 0.502237	valid's l2: 0.522997
[140]	train's l2: 0.470099	valid's l2: 0.493397
[160]	train's l2: 0.443959	valid's l2: 0.469455
[180]	train's l2: 0.422315	valid's l2: 0.450174
[200]	train's l2: 0.405097	valid's l2: 0.435051
[220]	train's l2: 0.390871	valid's l2: 0.422682
[240]	train's l2: 0.378637	valid's l2: 0.413054
[260]	train's l2: 0.367946	valid's l2: 0.406527
[280]	train's l2: 0.357675	valid's l2: 0.399738
[300]	train's l2: 0.348423	valid's l2: 0.394854
[320]	train's l2: 0.34044	valid's l2: 0.391128
[340]	train's l2: 0.333508	valid's l2: 0.388417
[360]	train's l2: 0.326909	valid's l2: 0.385475
[380]	train's l2: 0.320818	valid's l2: 0.382759
[400]	train's l2: 0.315295	valid's l2: 0.381175
[420]	train's

Unnamed: 0,id,score,pred,pred_range
1,0022f953,3.5,3.680069,3.5
21,019737b6,3.5,4.659227,4.5
34,031c0c58,4.0,3.831261,4.0
64,0666fb4e,4.5,4.98589,5.0
95,09a67581,4.5,4.713637,4.5
98,09eb3ce5,3.0,2.769073,3.0
103,0a4e3aec,3.5,3.338173,3.5
122,0ca05fe3,1.5,2.958319,3.0
130,0d71be2a,4.5,4.228828,4.0
146,0f2b0127,3.5,2.972157,3.0


Unnamed: 0,id,score
0,0000aaaa,1.760326
1,2222bbbb,1.865967
2,4444cccc,1.860676


rmse: 0.6353058038226609
***********xgb***************


[0]	validation_0-rmse:2.99063	validation_1-rmse:3.03037
[20]	validation_0-rmse:0.73504	validation_1-rmse:0.73828
[40]	validation_0-rmse:0.65300	validation_1-rmse:0.64691
[60]	validation_0-rmse:0.63547	validation_1-rmse:0.62991
[80]	validation_0-rmse:0.62470	validation_1-rmse:0.62185
[100]	validation_0-rmse:0.61653	validation_1-rmse:0.61928
[120]	validation_0-rmse:0.61003	validation_1-rmse:0.61318
[140]	validation_0-rmse:0.60435	validation_1-rmse:0.61049
[160]	validation_0-rmse:0.59937	validation_1-rmse:0.60711
[180]	validation_0-rmse:0.59472	validation_1-rmse:0.60638
[200]	validation_0-rmse:0.59050	validation_1-rmse:0.60767
[220]	validation_0-rmse:0.58685	validation_1-rmse:0.60868
[227]	validation_0-rmse:0.58569	validation_1-rmse:0.60813
rmse: 0.6050524365234516


[0]	validation_0-rmse:3.00106	validation_1-rmse:2.95623
[20]	validation_0-rmse:0.74339	validation_1-rmse:0.64731
[40]	validation_0-rmse:0.65867	validation_1-rmse:0.5658

Unnamed: 0,id,score,pred,pred_10,pred_range
1,0022f953,3.5,3.925856,3.925856,4.0
21,019737b6,3.5,4.667128,4.667128,4.5
34,031c0c58,4.0,4.265974,4.265974,4.5
64,0666fb4e,4.5,4.882967,4.882967,5.0
95,09a67581,4.5,5.167342,5.167342,5.0
98,09eb3ce5,3.0,2.80211,2.80211,3.0
103,0a4e3aec,3.5,3.249371,3.249371,3.0
122,0ca05fe3,1.5,2.575143,2.575143,2.5
130,0d71be2a,4.5,4.288205,4.288205,4.5
146,0f2b0127,3.5,3.111348,3.111348,3.0


Unnamed: 0,id,score
0,0000aaaa,0.701999
1,2222bbbb,1.108444
2,4444cccc,1.016786


rmse: 0.6308719208910265


In [50]:
# oof = []
# prediction = test_feats[[exp_key_column]]
# prediction[obj_column] = 0
# df_importance_list = []

# kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
# for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_feats[feature_names])):
#     X_train = train_feats.iloc[trn_idx][feature_names]
#     Y_train = train_feats.iloc[trn_idx][obj_column]

#     X_val = train_feats.iloc[val_idx][feature_names]
#     Y_val = train_feats.iloc[val_idx][obj_column]

#     print('\nFold_{} Training ================================\n'.format(fold_id+1))
#     for key, value in models.items():
#         print(f"***********{key}***************")
#         best_model = value["model"]
        
#         ## we have to change because each models have different params
#         if key == "xgb":
#             model = best_model.fit(X_train,
#                                   Y_train,
#                                   # eval_names=['train', 'valid'],
#                                   eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                                   verbose=20,
#                                   # eval_metric= [metric],
#                                   early_stopping_rounds=50)
#             pred_val = model.predict(X_val)
#             pred_test = model.predict(test_feats[feature_names])
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             df_oof["pred_range"] = classify_predict_vectorize(pred_val)
#             display(df_oof.head(10))
#         elif key == "lgb":
#             model = best_model.fit(X_train,
#                       Y_train,
#                       eval_names=['train', 'valid'],
#                       eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                       verbose=20,
#                       eval_metric= metric,
#                       early_stopping_rounds=50)
#             pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
#             pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             df_oof["pred_range"] = classify_predict_vectorize(pred_val)
#             display(df_oof.head(10))
        
#         elif key == "rf":
#             model = best_model.fit(X_train,Y_train)
#             pred_val = model.predict(X_val)
#             pred_test = model.predict(test_feats[feature_names])
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             display(df_oof.head(10))
#         elif key == "tpot":
#             model = best_model.fit(X_train,
#                       Y_train,
#                       eval_names=['train', 'valid'],
#                       eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                       verbose=20,
#                       eval_metric= metric,
#                       early_stopping_rounds=50)
#             pred_val = model.predict(X_val, num_iteration=model.best_iteration_)
#             pred_test = model.predict(test_feats[feature_names], num_iteration=model.best_iteration_)
#             prediction[obj_column] += pred_test / kfold.n_splits
#             df_oof = train_feats.iloc[val_idx][[exp_key_column, obj_column]].copy()
#             df_oof['pred'] = pred_val
#             oof.append(df_oof)
#             df_oof["pred_range"] = classify_predict_vectorize(pred_val)
#             display(df_oof.head(10))
            
#         models[key]["prediction"] = prediction
#         if key in ["lgb","xgb"]:
#             df_importance = pd.DataFrame({
#                 'column': feature_names,
#                 'importance': model.feature_importances_,
#             })
#             df_importance_list.append(df_importance)
#         else:
#             df_importance = pd.DataFrame([])
#         del model, pred_val, pred_test
#         gc.collect()
#         df_oof = pd.concat(oof)
#         rmse = mean_squared_error(df_oof[obj_column], np.clip(df_oof['pred'], a_min=0.5, a_max=6.0), squared=False)
#         print('rmse:', rmse)
#         models[key]["score"] = rmse

In [51]:
for key, value in models.items():
    print(key)
#     print(value.values())
#     print(value["model"])
    print(value["score"])

lgb
0.6353058038226609
xgb
0.6308719208910265


# 3. submission

In [52]:
prediction

Unnamed: 0,id,score
0,0000aaaa,0.701999
1,2222bbbb,1.108444
2,4444cccc,1.016786


In [53]:
# prediction['score'] = np.clip(prediction['score'], a_min=0.5, a_max=6.0)
display(models["lgb"]["prediction"])
prediction['score'] = classify_predict_vectorize(models["lgb"]["prediction"]["score"])
display(prediction)
prediction.to_csv('submission.csv', index=False)

Unnamed: 0,id,score
0,0000aaaa,1.760326
1,2222bbbb,1.865967
2,4444cccc,1.860676


Unnamed: 0,id,score
0,0000aaaa,2.0
1,2222bbbb,2.0
2,4444cccc,2.0
