# exp076

exp074(exp075)のoof修正版 meta_featureにも影響あり

In [1]:
import os
import sys
import traceback
import gc
import time
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

import warnings
warnings.simplefilter('ignore')



In [2]:
@dataclass
class Cfg:
    """Experiment-wide configuration.

    Every attribute carries a type annotation so that ``@dataclass`` actually
    registers it as a field. Without annotations the decorator sees no fields
    at all: ``Cfg(seed=1)`` raises ``TypeError`` and ``repr(cfg)`` is empty.
    Defaults are unchanged, so ``Cfg()`` behaves exactly as before.
    """

    mode: str = "local_cv"  # "local_cv" or "kaggle_inf"
    exp_name: str = "exp076"
    input_dir: str = "/mnt/predict-student-performance-from-game-play/input/"
    output_dir: str = "/mnt/predict-student-performance-from-game-play/output/"
    prep_dir: str = "/mnt/predict-student-performance-from-game-play/prep/"
    seed: int = 42
    n_splits: int = 5
    best_threshold: float = 0.630  # fill in with the result of the local_cv run
    base_exp: "str | None" = None  # experiment whose feature importance is reused
    n_features: int = 500  # number of features to keep when reducing features
cfg = Cfg()

# Local CV runs need <output_dir>/<exp_name>/ and its "cache" sub-directory;
# Kaggle inference runs import the jo_wilder_310 module instead.
if cfg.mode == "local_cv":
    exp_output_dir = os.path.join(cfg.output_dir, cfg.exp_name)
    for required_dir in (exp_output_dir, os.path.join(exp_output_dir, "cache")):
        os.makedirs(required_dir, exist_ok=True)

elif cfg.mode == "kaggle_inf":
    import jo_wilder_310

In [3]:
# LightGBM parameters: binary objective evaluated with binary log-loss.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'metric': 'binary_logloss',
    'seed': cfg.seed,
    'feature_pre_filter': False,
    'lambda_l1': 4.134488140102331,
    'lambda_l2': 0.007775200046481757,
    'num_leaves': 75,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.7036110805680353,
    'bagging_freq': 3,
    # NOTE(review): this dict used to also set 'min_child_samples': 100.
    # LightGBM documents `min_child_samples` as an alias of `min_data_in_leaf`;
    # when both are given, the primary name is kept and the alias is ignored
    # (with a warning). Only the value that was effectively used stays here.
    'min_data_in_leaf': 50,
}

In [4]:
# Names of the level groups handled by this notebook.
level_group_list = ['0-4', '5-12', '13-22']

# Question key ("q1" .. "q18") -> name of the level group it belongs to.
# Questions 1-3 map to "0-4", 4-13 to "5-12" and 14-18 to "13-22".
level_group_map = {
    f"q{question_no}": group_name
    for group_name, first_q, last_q in (("0-4", 1, 3), ("5-12", 4, 13), ("13-22", 14, 18))
    for question_no in range(first_q, last_q + 1)
}

In [5]:
# Load the pickled `cat_col_lists` object. The feature classes index it as
# cat_col_lists[level_group][column_name] to get the categories to expand.
# The file location depends on the run mode; any other mode loads nothing.
# NOTE: pickle.load executes arbitrary code - only point this at trusted files.
if cfg.mode == "local_cv":
    cat_col_lists_path = cfg.prep_dir + 'cat_col_lists_v2.pkl'
elif cfg.mode == "kaggle_inf":
    cat_col_lists_path = "/kaggle/input/psp-cat-col-lists/cat_col_lists_v2.pkl"
else:
    cat_col_lists_path = None

if cat_col_lists_path is not None:
    with open(cat_col_lists_path, 'rb') as f:
        cat_col_lists = pickle.load(f)

In [6]:
def transform_labels_df_train(labels_):
    """Reshape the raw training labels so they can be joined with features.

    ``labels_["session_id"]`` holds strings shaped like
    ``"<session>_q<question>"``. A copy of the frame is returned in which
    ``session_id`` is the integer session part, ``question`` is the integer
    question number and ``level_group`` names the level group of that
    question. The input frame itself is left untouched.
    """
    labels = labels_.copy()

    id_parts = labels["session_id"].map(lambda raw_id: raw_id.split("_"))
    labels["question"] = id_parts.map(lambda parts: parts[1].replace("q", "")).astype(int)
    labels["session_id"] = id_parts.map(lambda parts: parts[0]).astype(int)

    # Column used to join with the train features: the level group that each
    # question belongs to (<=3 -> "0-4", 4..13 -> "5-12", >=14 -> "13-22").
    question_no = labels["question"]
    labels["level_group"] = np.select(
        [question_no <= 3, question_no <= 13],
        ["0-4", "5-12"],
        default="13-22",
    )

    return labels


def transform_labels_df_inf(labels_):
    """Reshape the raw inference labels.

    Works on a copy of ``labels_``: the ``"<session>_q<question>"`` strings in
    ``session_id`` are split into an integer ``session_id`` and an integer
    ``question`` column. No ``level_group`` column is added here.
    """
    labels = labels_.copy()
    raw_ids = labels["session_id"]

    def question_of(raw_id):
        return raw_id.split("_")[1].replace("q", "")

    def session_of(raw_id):
        return raw_id.split("_")[0]

    labels["question"] = raw_ids.map(question_of).astype(int)
    labels["session_id"] = raw_ids.map(session_of).astype(int)

    return labels

In [7]:
class FeaturesTrain:
    """Build training features, one row per (session_id, level_group).

    ``sessions_df`` holds the event rows; the level-group name used as the
    feature-name prefix is read from its first row. ``labels`` is the label
    frame that the finished feature table is left-joined onto in
    ``get_train``. Category vocabularies are read from the module-level
    ``cat_col_lists[level_group][column]``.
    """

    def __init__(self, sessions_df, labels):
        # Sorting by `index` inside each session/level group makes the
        # shift()-based features below compare consecutive events.
        self.sessions_df = sessions_df.sort_values(["session_id", "level_group", "index"], ignore_index=True)
        self.features = self.sessions_df[["session_id", "level_group"]].drop_duplicates().copy()
        self.result = labels
        self.group = sessions_df["level_group"].values[0]

    def _prep(self):
        """Add the derived columns (`time_diff` and two combined keys)."""
        time_diff = self.sessions_df["elapsed_time"] - self.sessions_df.groupby(["session_id", "level_group"])["elapsed_time"].shift(1)
        # Negative gaps are clamped to 0; the first event of each group has no
        # predecessor (NaN) and is set to 0 as well.
        time_diff = np.where(time_diff < 0, 0, time_diff)
        # `nan` has to be a keyword: the second positional argument of
        # np.nan_to_num is `copy`, not the replacement value.
        self.sessions_df["time_diff"] = np.nan_to_num(time_diff, nan=0)
        self.sessions_df["event_name+name"] = self.sessions_df["event_name"] + "_" + self.sessions_df["name"]
        self.sessions_df["event_name+room_fqid"] = self.sessions_df["event_name"] + "_" + self.sessions_df["room_fqid"]

    def _total_record_cnt(self):
        """Number of records per level_group."""
        add_features = self.sessions_df.groupby(["session_id", "level_group"])["index"].count().reset_index().rename(columns={"index":f"{self.group}_record_cnt"})
        self.features = self.features.merge(add_features, on=["session_id", "level_group"], how="left")

    def _group_elapsed_time(self):
        """Per level_group, max - min of elapsed_time (time spent)."""
        feat_name = f"{self.group}_group_elapsed_time"
        # Aggregations are named by string; passing the builtins max/min to
        # agg() is deprecated in recent pandas.
        add_features = self.sessions_df.groupby(["session_id", "level_group"])["elapsed_time"].agg(["max", "min"]).reset_index()
        add_features[feat_name] = (add_features["max"] - add_features["min"]).astype(np.float32)
        add_features = add_features[["session_id", "level_group", feat_name]].copy()
        self.features = self.features.merge(add_features, on=["session_id", "level_group"], how="left")

    def _cat_record_cnt(self, cat_col):
        """Per level_group, number of records for each category of `cat_col`."""
        cat_list = cat_col_lists[self.group][cat_col]
        add_features = self.sessions_df.groupby(["session_id", "level_group", cat_col])["index"].count().reset_index().rename(columns={"index":"cnt"})
        for cat in cat_list:
            feat_name = f"{self.group}_{cat_col}_{str(cat)}_record_cnt"
            tmp = add_features[add_features[cat_col]==cat][["session_id", "level_group", "cnt"]].copy()
            if len(tmp) > 0:
                tmp = tmp.rename(columns={"cnt": feat_name})
                self.features = self.features.merge(tmp, on=["session_id", "level_group"], how="left")
                # Sessions that never hit this category get a count of 0.
                self.features[feat_name] = self.features[feat_name].fillna(0).astype(int)
            else:
                # Category absent from the whole frame: constant 0 column.
                self.features[feat_name] = 0

    def _cat_col_nunique(self, cat_col):
        """Per level_group, number of distinct non-null values of `cat_col`."""
        feat_name = f"{self.group}_{cat_col}_nunique"
        add_features = self.sessions_df.dropna(subset=[cat_col]).drop_duplicates(["session_id", "level_group", cat_col])
        add_features = add_features.groupby(["session_id", "level_group"])["index"].count().reset_index().rename(columns={"index": feat_name})
        self.features = self.features.merge(add_features, on=["session_id", "level_group"], how="left")
        # A session whose `cat_col` is entirely null disappears in dropna()
        # and would come back as NaN from the left join. FeaturesInf reports
        # 0 for the same situation (Series.nunique()), so use 0 here too to
        # keep train and inference features consistent.
        self.features[feat_name] = self.features[feat_name].fillna(0).astype(int)

    def _agg_features(self, val_cols, aggs):
        """Per level_group, apply every aggregation in `aggs` to every column in `val_cols`."""
        # groupby(...).agg(list) yields columns ordered as product(val_cols, aggs).
        new_cols = [f"{self.group}_{v}_{a}" for v,a in itertools.product(val_cols, aggs)]
        add_features = self.sessions_df.groupby(["session_id", "level_group"])[val_cols].agg(aggs).reset_index()
        add_features.columns = ["session_id", "level_group"] + new_cols
        add_features[new_cols] = add_features[new_cols].astype(np.float32)
        self.features = self.features.merge(add_features, on=["session_id", "level_group"], how="left")

    def _cat_agg_features(self, val_cols, aggs, cat_col, not_use_cats=None):
        """Like `_agg_features`, but separately for each category of `cat_col`.

        Categories listed in `not_use_cats` are skipped. Missing values (the
        session has no row of that category, or the aggregate itself is NaN)
        are encoded as -1.
        """
        add_features = self.sessions_df.groupby(["session_id", "level_group", cat_col])[val_cols].agg(aggs).reset_index()

        if not_use_cats is not None:
            cat_list = [c for c in cat_col_lists[self.group][cat_col] if c not in not_use_cats]
        else:
            cat_list = cat_col_lists[self.group][cat_col]

        for cat in cat_list:
            new_cols = [f"{self.group}_{cat_col}_{cat}_{v}_{a}" for v,a in itertools.product(val_cols, aggs)]
            tmp = add_features[add_features[cat_col]==cat].copy()
            if len(tmp) > 0:
                tmp.columns = ["session_id", "level_group", cat_col] + new_cols
                tmp = tmp.drop(columns=[cat_col])
                self.features = self.features.merge(tmp, on=["session_id", "level_group"], how="left")
                self.features[new_cols] = self.features[new_cols].fillna(-1)
            else:
                self.features[new_cols] = -1
            self.features[new_cols] = self.features[new_cols].astype(np.float32)

    def _cat_change_cnt(self, cat_col):
        """Number of times `cat_col` changes between consecutive rows.

        The first row of each group has no predecessor and therefore counts
        as one change.
        """
        feat_name = f"{self.group}_{cat_col}_change_cnt"
        tmp = self.sessions_df[["session_id", "level_group", cat_col]].copy()
        # Nulls become the literal string "nan" so that a run of nulls is not
        # counted as a change on every row.
        tmp[cat_col] = tmp[cat_col].fillna("nan")
        tmp[feat_name] = (tmp[cat_col] != tmp.groupby(["session_id", "level_group"])[cat_col].shift(1)).astype(int)
        add_features = tmp.groupby(["session_id", "level_group"])[feat_name].sum().reset_index()
        self.features = self.features.merge(add_features, on=["session_id", "level_group"], how="left")

    @staticmethod
    def _minigame_stats(session_events, start_fqid, end_fqid):
        """Summarise one session's events between the minigame start and end.

        The start is the first "navigate_click" on `start_fqid`, the end the
        first "object_click" on `end_fqid`. Returns the tuple
        (record_cnt, total_duration, total_hover_duration, hover_cnt, click_cnt),
        or all zeros when either marker is missing or the start does not come
        before the end.
        """
        is_start = (session_events["event_name"] == "navigate_click") & (session_events["fqid"] == start_fqid)
        is_end = (session_events["event_name"] == "object_click") & (session_events["fqid"] == end_fqid)
        start_indexes = session_events.loc[is_start, "index"].values
        end_indexes = session_events.loc[is_end, "index"].values
        if len(start_indexes) == 0 or len(end_indexes) == 0:
            return 0, 0, 0, 0, 0

        start_index, end_index = start_indexes[0], end_indexes[0]
        if not start_index < end_index:
            return 0, 0, 0, 0, 0

        # The start event itself is excluded, the end event is included.
        in_game = session_events[(session_events["index"] > start_index) & (session_events["index"] <= end_index)]
        record_cnt = len(in_game)
        total_duration = in_game["time_diff"].sum()
        total_hover_duration = in_game["hover_duration"].sum()
        hover_cnt = int((in_game["event_name"] == "object_hover").sum())
        click_cnt = int((in_game["event_name"] == "object_click").sum())
        return record_cnt, total_duration, total_hover_duration, hover_cnt, click_cnt

    def _add_minigame_features(self, start_fqid, end_fqid):
        """Add per-session statistics of the minigame named after `start_fqid`."""
        game_name = start_fqid
        prefix = f"{self.group}_minigame_{game_name}"
        columns = ["session_id", f"{prefix}_record_cnt", f"{prefix}_total_duration", f"{prefix}_total_hover_duration",
                   f"{prefix}_hover_cnt", f"{prefix}_click_cnt"]
        # One groupby pass replaces the former per-session boolean filter over
        # the whole frame, which was quadratic in the number of sessions.
        rows = [
            [session_id, *self._minigame_stats(session_events, start_fqid, end_fqid)]
            for session_id, session_events in tqdm(self.sessions_df.groupby("session_id", sort=False))
        ]
        add_features = pd.DataFrame(rows, columns=columns)
        self.features = self.features.merge(add_features, on="session_id", how="left")

    def get_train(self):
        """Run every feature step and return `labels` left-joined with the features.

        The call order below determines the column order of the result.
        """
        self._prep()
        self._total_record_cnt()
        self._group_elapsed_time()
        self._cat_record_cnt("event_name")
        self._cat_record_cnt("name")
        self._cat_record_cnt("page")
        self._cat_record_cnt("level")
        self._cat_record_cnt("room_fqid")
        self._cat_record_cnt("fqid")
        self._cat_record_cnt("text_fqid")
        self._cat_record_cnt("event_name+name")
        self._cat_record_cnt("event_name+room_fqid")
        self._cat_col_nunique("text")
        self._cat_col_nunique("text_fqid")
        self._cat_col_nunique("room_fqid")
        self._cat_col_nunique("fqid")

        self._agg_features(val_cols=["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"],
                           aggs=["mean"])
        self._agg_features(val_cols=["time_diff", "hover_duration"],
                           aggs=["mean", "max", "min", "std", "sum"])

        self._agg_features(val_cols=["elapsed_time", "index"],
                           aggs=["max", "min"])

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name")
        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="room_fqid")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="fqid")
        self._cat_agg_features(val_cols=["elapsed_time"],
                               aggs=["max", "min"],
                               cat_col="fqid")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="text_fqid")
        self._cat_agg_features(val_cols=["elapsed_time", "index"],
                               aggs=["max", "min"],
                               cat_col="text_fqid")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="level")
        self._cat_agg_features(val_cols=["elapsed_time", "index"],
                               aggs=["max", "min"],
                               cat_col="level")

        self._cat_agg_features(val_cols=["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"],
                               aggs=["mean"],
                               cat_col="event_name",
                               not_use_cats=['checkpoint', 'map_hover', 'object_hover'])
        self._cat_agg_features(val_cols=["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"],
                               aggs=["mean"],
                               cat_col="name")

        self._cat_agg_features(val_cols=["hover_duration"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name",
                               not_use_cats=['cutscene_click', 'person_click', 'navigate_click',
                                             'observation_click', 'notification_click', 'object_click',
                                             'map_click', 'checkpoint', 'notebook_click'])

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name+name")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name+room_fqid")

        self._cat_change_cnt("text_fqid")
        self._cat_change_cnt("room_fqid")

        if self.group == "0-4":
            self._add_minigame_features("tunic", "tunic.hub.slip")
            self._add_minigame_features("plaque", "plaque.face.date")

        elif self.group == "5-12":
            self._add_minigame_features("businesscards", "businesscards.card_bingo.bingo")
            self._add_minigame_features("logbook", "logbook.page.bingo")
            self._add_minigame_features("reader", "reader.paper2.bingo")
            self._add_minigame_features("journals", "journals.pic_2.bingo")

        elif self.group == "13-22":
            self._add_minigame_features("tracks", "tracks.hub.deer")
            self._add_minigame_features("reader_flag", "reader_flag.paper2.bingo")
            self._add_minigame_features("journals_flag", "journals_flag.pic_0.bingo")

        self.result = self.result.merge(self.features, on=["session_id", "level_group"], how="left")
        return self.result

In [8]:
class FeaturesInf:
    """Build inference-time features from one frame of event rows.

    Unlike ``FeaturesTrain`` nothing is grouped by session: every feature is
    computed over all rows of ``sessions_df`` and written as a constant
    column onto ``labels`` (which is modified in place and returned by
    ``get_test``). The level-group name used as feature-name prefix is read
    from the first row of ``sessions_df``. Category vocabularies come from
    the module-level ``cat_col_lists[level_group][column]``.

    When ``feature_select`` is true, only features whose name is contained in
    ``need_create_features`` are computed (the minigame features are always
    computed).
    """

    # NaN-aware reducers for the aggregation names accepted by
    # `_agg_features` and `_cat_agg_features`.
    _NAN_AGGS = {
        "mean": np.nanmean,
        "max": np.nanmax,
        "min": np.nanmin,
        "std": lambda values: np.nanstd(values, ddof=1),
        "sum": np.nansum,
        "median": np.nanmedian,
    }

    def __init__(self, sessions_df, labels, feature_select=False, need_create_features=None):
        self.sessions_df = sessions_df.sort_values(["index"], ignore_index=True)
        self.result = labels
        self.group = sessions_df["level_group"].values[0]
        self.use_cols = [
            "elapsed_time", "event_name", "name", "level", "page", "index",
            "room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y",
            "hover_duration", "text", "fqid", "room_fqid", "text_fqid", "event_name+name", "event_name+room_fqid", "time_diff"
        ]
        self.feature_select = feature_select
        # None replaces the former mutable default ([]), which was one list
        # object shared by every instance created without the argument.
        self.need_create_features = [] if need_create_features is None else need_create_features

    def _is_skipped(self, feat_name):
        """Return True when feature selection is on and `feat_name` is not requested."""
        # `and` short-circuits, so the membership test only runs when feature
        # selection is enabled (the former bitwise `&` always evaluated it).
        return bool(self.feature_select and feat_name not in self.need_create_features)

    @classmethod
    def _reducer(cls, agg):
        """Look up the reducer for `agg`; raise ValueError for an unknown name.

        Previously an unknown name fell through the if/elif chain and either
        raised UnboundLocalError or silently reused the previous value.
        """
        try:
            return cls._NAN_AGGS[agg]
        except KeyError:
            raise ValueError(f"Unsupported aggregation: {agg!r}") from None

    def _prep(self):
        """Add the derived columns and cache every used column as a numpy array."""
        self.sessions_df["event_name+name"] = self.sessions_df["event_name"] + "_" + self.sessions_df["name"]
        self.sessions_df["event_name+room_fqid"] = self.sessions_df["event_name"] + "_" + self.sessions_df["room_fqid"]
        time_diff = self.sessions_df["elapsed_time"] - self.sessions_df["elapsed_time"].shift(1).values
        # Negative gaps are clamped to 0; the first row (no predecessor, NaN)
        # is set to 0 as well.
        time_diff = np.where(time_diff < 0, 0, time_diff)
        # `nan` has to be a keyword: the second positional argument of
        # np.nan_to_num is `copy`, not the replacement value.
        self.sessions_df["time_diff"] = np.nan_to_num(time_diff, nan=0)
        # Keep each column of the dataframe as a numpy array.
        self.sessions = {}
        for c in self.use_cols:
            self.sessions[c] = self.sessions_df[c].values

    def _total_record_cnt(self):
        """Number of records in the level_group."""
        feat_name = f"{self.group}_record_cnt"
        if self._is_skipped(feat_name):
            return
        self.result[feat_name] = len(self.sessions["elapsed_time"])

    def _group_elapsed_time(self):
        """Max - min of elapsed_time in the level_group (time spent)."""
        feat_name = f"{self.group}_group_elapsed_time"
        if self._is_skipped(feat_name):
            return
        elapsed = self.sessions["elapsed_time"]
        self.result[feat_name] = np.float32(np.max(elapsed) - np.min(elapsed))

    def _cat_record_cnt(self, cat_col):
        """Number of records for each category of `cat_col`."""
        cat_list = cat_col_lists[self.group][cat_col]
        for cat in cat_list:
            feat_name = f"{self.group}_{cat_col}_{str(cat)}_record_cnt"
            if self._is_skipped(feat_name):
                continue
            self.result[feat_name] = (self.sessions[cat_col] == cat).astype(int).sum()

    def _cat_col_nunique(self, cat_col):
        """Number of distinct non-null values of `cat_col`."""
        feat_name = f"{self.group}_{cat_col}_nunique"
        if self._is_skipped(feat_name):
            return
        self.result[feat_name] = self.sessions_df[cat_col].dropna().nunique()

    def _agg_features(self, val_cols, aggs):
        """Apply every aggregation in `aggs` to every column in `val_cols`.

        Raises:
            ValueError: if an aggregation name is not supported.
        """
        for val_col, agg in itertools.product(val_cols, aggs):
            reducer = self._reducer(agg)
            feat_name = f"{self.group}_{val_col}_{agg}"
            if self._is_skipped(feat_name):
                continue
            self.result[feat_name] = np.float32(reducer(self.sessions[val_col]))

    def _cat_agg_features(self, val_cols, aggs, cat_col, not_use_cats=None):
        """Like `_agg_features`, but separately for each category of `cat_col`.

        Categories listed in `not_use_cats` are skipped. A category with no
        rows, or an aggregate that is NaN, is encoded as -1.

        Raises:
            ValueError: if an aggregation name is not supported.
        """
        if not_use_cats is not None:
            cat_list = [c for c in cat_col_lists[self.group][cat_col] if c not in not_use_cats]
        else:
            cat_list = cat_col_lists[self.group][cat_col]

        for cat in cat_list:
            idx = self.sessions[cat_col] == cat
            has_rows = idx.sum() != 0

            for val_col, agg in itertools.product(val_cols, aggs):
                reducer = self._reducer(agg)
                feat_name = f"{self.group}_{cat_col}_{cat}_{val_col}_{agg}"
                if self._is_skipped(feat_name):
                    continue
                if not has_rows:
                    self.result[feat_name] = np.float32(-1)
                    continue
                add_feature = reducer(self.sessions[val_col][idx])
                if np.isnan(add_feature):
                    self.result[feat_name] = np.float32(-1)
                else:
                    self.result[feat_name] = np.float32(add_feature)

    def _cat_change_cnt(self, cat_col):
        """Number of times `cat_col` changes between consecutive rows.

        The first row has no predecessor and therefore counts as one change.
        """
        feat_name = f"{self.group}_{cat_col}_change_cnt"
        if self._is_skipped(feat_name):
            return
        # Nulls become the literal string "nan" so that a run of nulls is not
        # counted as a change on every row.
        tmp = self.sessions_df[cat_col].fillna("nan")
        self.result[feat_name] = (tmp != tmp.shift(1)).sum()

    def _add_minigame_features(self, start_fqid, end_fqid):
        """Add statistics of the minigame named after `start_fqid`.

        The minigame spans from the first "navigate_click" on `start_fqid`
        (exclusive) to the first "object_click" on `end_fqid` (inclusive).
        All five features are 0 when either marker is missing or the start
        does not come before the end. These features are written regardless
        of `feature_select`.
        """
        game_name = start_fqid
        events = self.sessions_df
        start_indexes = events[(events["event_name"]=="navigate_click")&(events["fqid"]==start_fqid)]["index"].values
        end_indexes = events[(events["event_name"]=="object_click")&(events["fqid"]==end_fqid)]["index"].values

        record_cnt = 0
        total_duration = 0
        total_hover_duration = 0
        hover_cnt = 0
        click_cnt = 0

        if len(start_indexes) > 0 and len(end_indexes) > 0 and start_indexes[0] < end_indexes[0]:
            start_index, end_index = start_indexes[0], end_indexes[0]
            in_game = events[(events["index"]>start_index)&(events["index"]<=end_index)]
            record_cnt = len(in_game)
            total_duration = in_game["time_diff"].sum()
            total_hover_duration = in_game["hover_duration"].sum()
            hover_cnt = int((in_game["event_name"]=="object_hover").sum())
            click_cnt = int((in_game["event_name"]=="object_click").sum())

        self.result[f"{self.group}_minigame_{game_name}_record_cnt"] = record_cnt
        self.result[f"{self.group}_minigame_{game_name}_total_duration"] = total_duration
        self.result[f"{self.group}_minigame_{game_name}_total_hover_duration"] = total_hover_duration
        self.result[f"{self.group}_minigame_{game_name}_hover_cnt"] = hover_cnt
        self.result[f"{self.group}_minigame_{game_name}_click_cnt"] = click_cnt

    def get_test(self):
        """Run every feature step and return `labels` with the feature columns added.

        The call order below determines the column order of the result.
        """
        self._prep()
        self._total_record_cnt()
        self._group_elapsed_time()
        self._cat_record_cnt("event_name")
        self._cat_record_cnt("name")
        self._cat_record_cnt("page")
        self._cat_record_cnt("level")
        self._cat_record_cnt("room_fqid")
        self._cat_record_cnt("fqid")
        self._cat_record_cnt("text_fqid")
        self._cat_record_cnt("event_name+name")
        self._cat_record_cnt("event_name+room_fqid")
        self._cat_col_nunique("text")
        self._cat_col_nunique("text_fqid")
        self._cat_col_nunique("room_fqid")
        self._cat_col_nunique("fqid")

        self._agg_features(val_cols=["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"],
                           aggs=["mean"])
        self._agg_features(val_cols=["time_diff", "hover_duration"],
                           aggs=["mean", "max", "min", "std", "sum"])

        self._agg_features(val_cols=["elapsed_time", "index"],
                           aggs=["max", "min"])

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name")
        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="room_fqid")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="fqid")
        self._cat_agg_features(val_cols=["elapsed_time"],
                               aggs=["max", "min"],
                               cat_col="fqid")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="text_fqid")
        self._cat_agg_features(val_cols=["elapsed_time", "index"],
                               aggs=["max", "min"],
                               cat_col="text_fqid")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="level")
        self._cat_agg_features(val_cols=["elapsed_time", "index"],
                               aggs=["max", "min"],
                               cat_col="level")

        self._cat_agg_features(val_cols=["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"],
                               aggs=["mean"],
                               cat_col="event_name",
                               not_use_cats=['checkpoint', 'map_hover', 'object_hover'])
        self._cat_agg_features(val_cols=["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"],
                               aggs=["mean"],
                               cat_col="name")

        self._cat_agg_features(val_cols=["hover_duration"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name",
                               not_use_cats=['cutscene_click', 'person_click', 'navigate_click',
                                             'observation_click', 'notification_click', 'object_click',
                                             'map_click', 'checkpoint', 'notebook_click'])

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name+name")

        self._cat_agg_features(val_cols=["time_diff"],
                               aggs=["mean", "max", "min", "std", "sum"],
                               cat_col="event_name+room_fqid")

        self._cat_change_cnt("text_fqid")
        self._cat_change_cnt("room_fqid")

        if self.group == "0-4":
            self._add_minigame_features("tunic", "tunic.hub.slip")
            self._add_minigame_features("plaque", "plaque.face.date")

        elif self.group == "5-12":
            self._add_minigame_features("businesscards", "businesscards.card_bingo.bingo")
            self._add_minigame_features("logbook", "logbook.page.bingo")
            self._add_minigame_features("reader", "reader.paper2.bingo")
            self._add_minigame_features("journals", "journals.pic_2.bingo")

        elif self.group == "13-22":
            self._add_minigame_features("tracks", "tracks.hub.deer")
            self._add_minigame_features("reader_flag", "reader_flag.paper2.bingo")
            self._add_minigame_features("journals_flag", "journals_flag.pic_0.bingo")

        return self.result

In [9]:
def get_train_dataset(sessions, labels):
    """Return the training table for one frame of sessions.

    The raw labels are reshaped with ``transform_labels_df_train``, the
    features are built by ``FeaturesTrain`` and joined onto them, and the
    ``question`` column is converted to the pandas ``category`` dtype.
    """
    formatted_labels = transform_labels_df_train(labels)
    train = FeaturesTrain(sessions, formatted_labels).get_train()
    train["question"] = train["question"].astype("category")
    return train

def get_test_dataset(sessions, labels, feature_select=False, need_create_features=None):
    """Return the inference table for one frame of sessions.

    Args:
        sessions: event rows passed to ``FeaturesInf``.
        labels: raw label frame; reshaped with ``transform_labels_df_inf``.
        feature_select: forwarded to ``FeaturesInf``; when true only the
            features named in ``need_create_features`` are computed.
        need_create_features: names of the features to compute. ``None``
            (the default) means an empty list.

    Returns:
        The label frame with the feature columns added and ``question``
        converted to the pandas ``category`` dtype.
    """
    # A fresh list per call replaces the former mutable default argument
    # ([]), which was a single list object shared across all calls.
    if need_create_features is None:
        need_create_features = []

    # Reshape the label data
    labels = transform_labels_df_inf(labels)

    # Build the features
    feat = FeaturesInf(sessions, labels, feature_select, need_create_features)
    test = feat.get_test()
    test["question"] = test["question"].astype("category")

    return test   

In [10]:
def calc_metrics(oof):
    """Print out-of-fold metrics and return the best decision threshold.

    Args:
        oof: frame with the columns ``correct`` (true label), ``pred``
            (predicted probability) and ``question`` (question number).

    Returns:
        The threshold in ``np.arange(0.4, 0.81, 0.01)`` with the highest
        macro F1 over all rows (0 if no threshold scores above 0).

    Prints the log-loss, the best macro F1 with its threshold, and the macro
    F1 of each question 1..18 evaluated at that best threshold.
    """
    logloss = log_loss(oof["correct"], oof["pred"])

    y_true = oof["correct"].values
    y_prob = oof["pred"].values

    # find best th
    best_score = 0
    best_threshold = 0
    for threshold in np.arange(0.4, 0.81, 0.01):
        score = f1_score(y_true, (y_prob > threshold).astype(int), average='macro')
        if score > best_score:
            best_score = score
            best_threshold = threshold
    print("logloss", format(logloss, ".6f"))
    print("best_score", format(best_score, ".6f"))
    print("best_threshold", format(best_threshold, ".3f"))

    # Score per question. This used to binarise with `threshold`, i.e. the
    # last value left over from the search loop above (about 0.80), instead
    # of the threshold that was just selected.
    print("---"*10)
    for q in range(1, 19):
        q_rows = oof[oof["question"] == q]
        preds = (q_rows["pred"].values > best_threshold).astype(int)
        m = f1_score(q_rows["correct"].values, preds, average='macro')
        print(f"Q{q} : F1 = {format(m, '.6f')}")
    return best_threshold

In [11]:
class FeaturesSelect:
    """Remove features that are highly correlated with another feature.

    Args:
        df: DataFrame containing at least the columns in ``init_features``.
        init_features: Candidate feature column names.
        not_drop_cols: Optional column names that must never be dropped.
        corr_th: Absolute-correlation threshold above which a feature counts
            as redundant.
    """

    def __init__(self, df, init_features, not_drop_cols=None, corr_th=0.99):
        self.init_features = init_features
        self.df = df
        self.corr_th = corr_th
        self.drop_cols = []
        self.not_drop_cols = not_drop_cols

    def _high_corr_features_drop(self):
        """Drop every unprotected feature too correlated with an earlier kept one."""
        # Absolute correlation matrix between the candidate features.
        corr_matrix = self.df[self.init_features].corr().abs()
        # Keep only the strict upper triangle: the matrix is symmetric, so this
        # removes duplicated pairs and the diagonal. The builtin ``bool`` is
        # used because the ``np.bool`` alias no longer exists in NumPy >= 1.24.
        upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper = corr_matrix.where(upper_mask)

        # Walk the columns in their original order. Iterating a set here would
        # make the outcome depend on string hash order, because each drop
        # removes a row that later columns are compared against.
        protected = set(self.not_drop_cols) if self.not_drop_cols else set()
        cols = [c for c in upper.columns if c not in protected]

        drop_cols = []
        for c in cols:
            if (upper[c] > self.corr_th).any():
                drop_cols.append(c)
                # A dropped feature must not cause later features to be dropped.
                upper = upper.drop(index=c)
        print(f"特徴量間の相関性が高い特徴量を{str(len(drop_cols))}個削除")
        self.df = self.df.drop(columns=drop_cols)
        self.drop_cols = self.drop_cols + drop_cols

    def features_select(self):
        """Run the selection and return the surviving features.

        Returns:
            List of unique feature names from ``init_features`` that were not
            dropped, in their original order.
        """
        self._high_corr_features_drop()
        dropped = set(self.drop_cols)
        # dict.fromkeys de-duplicates while preserving order.
        selected_features = [f for f in dict.fromkeys(self.init_features) if f not in dropped]
        return selected_features

In [12]:
def run_train():
    """Train the per-level-group LightGBM models and evaluate the OOF predictions.

    For each entry of ``level_group_list`` this function:
      1. reads the prepared sessions/labels CSVs from ``cfg.prep_dir`` and
         builds the training table,
      2. left-joins the features (and OOF meta features) kept from the
         previous level group,
      3. selects features (correlation filter, or the top ``cfg.n_features``
         of ``cfg.base_exp``'s importance file),
      4. runs ``GroupKFold`` by ``session_id``, training one model per fold or
         loading it when the model file already exists,
      5. writes the mean gain importance to ``fi_{group}.csv``.

    Finally the concatenated OOF frame is scored with ``calc_metrics``, the
    best threshold is stored in ``cfg.best_threshold`` and the OOF predictions
    are written to ``oof.csv.gz``.
    """
    oofs = []
    # Features carried over to the next level_group. "0-4" has no predecessor,
    # hence the initial None.
    prev_features_df = None
    for group in level_group_list:
        print(group)
        # Load data
        train_sessions = pd.read_csv(cfg.prep_dir + f"train{group}_cleaned.csv")
        labels = pd.read_csv(cfg.prep_dir + f"train_labels{group}.csv")
        train = get_train_dataset(train_sessions, labels)

        # Add the features of the previous level_group
        if prev_features_df is not None:
            train = train.merge(prev_features_df, on=["session_id"], how="left")

        # Elapsed time / index spent in the previous level_group's question part
        if group == "5-12":
            train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
            train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
        elif group == "13-22":
            train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
            train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]

        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in train.columns if c not in not_use_cols]

        # Feature selection
        if cfg.base_exp is None:
            # These columns feed the duration features of later groups, so they
            # are protected from the correlation filter.
            not_drop_cols = ["0-4_elapsed_time_max", "0-4_index_max", "5-12_elapsed_time_max", "5-12_index_max", "13-22_elapsed_time_max", "13-22_index_max",
                            "0-4_elapsed_time_min", "0-4_index_min", "5-12_elapsed_time_min", "5-12_index_min", "13-22_elapsed_time_min", "13-22_index_min"]
            features = FeaturesSelect(train, features, not_drop_cols).features_select()
        else:
            # Take the top features from the base experiment's importance file
            features = pd.read_csv(cfg.output_dir + f"{cfg.base_exp}/fi_{group}.csv").head(cfg.n_features)["feature"].tolist()

        gkf = GroupKFold(n_splits=cfg.n_splits)
        fis = []

        oof_groups = []
        for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
            model_path = cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{group}_{i}.lgb"

            print(f"fold : {i}")
            tr_x, tr_y = train.iloc[tr_idx][features], train.iloc[tr_idx][target]
            vl_x, vl_y = train.iloc[vl_idx][features], train.iloc[vl_idx][target]
            tr_data = lgb.Dataset(tr_x, label=tr_y)
            vl_data = lgb.Dataset(vl_x, label=vl_y)

            if os.path.exists(model_path):
                print(f"modelが既に存在するのでロード : {model_path}")
                model = lgb.Booster(model_file=model_path)
            else:
                # NOTE(review): the `early_stopping_rounds` / `verbose_eval`
                # keyword arguments are not accepted by LightGBM >= 4.0, which
                # expects callbacks instead - confirm the pinned version.
                model = lgb.train(params, tr_data, valid_sets=[tr_data, vl_data],
                                num_boost_round=20000, early_stopping_rounds=100, verbose_eval=100)
                # Persist only freshly trained models; a model that was just
                # loaded from this very path does not need to be rewritten.
                model.save_model(model_path)

            # Validation-fold predictions
            oof_fold = train.iloc[vl_idx].copy()
            oof_fold["pred"] = model.predict(vl_x, num_iteration=model.best_iteration)
            oof_groups.append(oof_fold)

            # Feature importance
            fi_fold = pd.DataFrame()
            fi_fold["feature"] = model.feature_name()
            fi_fold["importance"] = model.feature_importance(importance_type="gain")
            fi_fold["fold"] = i
            fis.append(fi_fold)

        fi = pd.concat(fis)
        fi = fi.groupby("feature")["importance"].mean().reset_index()
        fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
        fi.to_csv(cfg.output_dir + f"{cfg.exp_name}/fi_{group}.csv", index=False)

        oof_group = pd.concat(oof_groups)
        oofs.append(oof_group)

        # Keep one row of features per session for use in the next level_group
        prev_features_df = train.groupby("session_id").head(1).drop(columns=["question", "correct", "level_group"])

        # Attach meta features: per-session statistics of the OOF predictions
        meta_df = oof_group.groupby("session_id")["pred"].agg(["mean", "max", "min", "std"]).reset_index()
        meta_df = meta_df.rename(columns={"mean":f"{group}_pred_mean", "max":f"{group}_pred_max", "min":f"{group}_pred_min", "std":f"{group}_pred_std"})
        prev_features_df = prev_features_df.merge(meta_df, on="session_id", how="left")

    # cv
    oof = pd.concat(oofs)
    best_threshold = calc_metrics(oof)
    cfg.best_threshold = best_threshold
    oof[["session_id", "question", "pred", "correct"]].to_csv(cfg.output_dir + f"{cfg.exp_name}/oof.csv.gz", compression="gzip", index=False)

In [13]:
def get_mock_iter_train():
    """Return test_sample data split the way the training data is iterated.

    The old ``test.csv`` / ``sample_submission.csv`` files are each split into
    one frame per level group, and the two lists are zipped together.
    """
    test = pd.read_csv(cfg.input_dir + "_old/test.csv")
    sub = pd.read_csv(cfg.input_dir + "_old/sample_submission.csv")
    sub["level_group"] = sub["session_level"].apply(lambda x: x.split("_")[-1])

    def _split_by_level_group(frame):
        # groupby sorts its keys as strings, where "13-22" would come before
        # "5-12"; renaming it to "6" keeps the level groups in play order.
        frame["level_group2"] = frame["level_group"].str.replace("13-22", "6")
        return [
            part.drop(columns=["session_level", "level_group2"]).reset_index(drop=True)
            for _, part in frame.groupby("level_group2")
        ]

    tests = _split_by_level_group(test)
    subs = _split_by_level_group(sub)
    return zip(tests, subs)

def get_mock_iter_test():
    """Return test_sample data split the way the test data is iterated.

    The old ``test.csv`` / ``sample_submission.csv`` files are each split into
    one frame per ``session_level`` value, and the two lists are zipped
    together.
    """
    test = pd.read_csv(cfg.input_dir + "_old/test.csv")
    sub = pd.read_csv(cfg.input_dir + "_old/sample_submission.csv")

    def _split_by_session_level(frame):
        # groupby sorts its keys as strings; rewriting "13-22" as "6" keeps the
        # level groups of a session in play order.
        frame["session_level"] = frame["session_level"].str.replace("13-22", "6")
        return [
            part.drop(columns="session_level").reset_index(drop=True)
            for _, part in frame.groupby("session_level")
        ]

    tests = _split_by_session_level(test)
    subs = _split_by_session_level(sub)
    return zip(tests, subs)

In [14]:
def inference(mode):
    """Predict every (session, level_group) batch with the fold-averaged models.

    Args:
        mode: ``"local_cv"`` iterates over the mock iterator from
            ``get_mock_iter_test`` and prints the predictions and the elapsed
            time; ``"kaggle_inf"`` iterates over the ``jo_wilder_310``
            environment and submits the predictions through ``env.predict``.

    Raises:
        ValueError: If ``mode`` is neither ``"local_cv"`` nor ``"kaggle_inf"``.
    """
    # Fail early with a clear message; otherwise an unknown mode only surfaces
    # later as an unrelated undefined-name error.
    if mode not in ("local_cv", "kaggle_inf"):
        raise ValueError(f'mode must be "local_cv" or "kaggle_inf", got {mode!r}')

    if mode == "local_cv":
        # Prepare a mock iterator that imitates the time series API
        iter_test = get_mock_iter_test()
        start_time = time.time()
    elif mode == "kaggle_inf":
        env = jo_wilder_310.make_env()
        iter_test = env.iter_test()

    # Load all fold models per level group; the feature list of each group is
    # taken from its first fold model.
    model_dict = {}
    features_dict = {}
    for g in level_group_list:
        if mode == "local_cv":
            model_paths = [cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{g}_{i}.lgb" for i in range(cfg.n_splits)]
        elif mode == "kaggle_inf":
            model_paths = [f"/kaggle/input/jo-wilder-{cfg.exp_name}/{cfg.exp_name}_model_{g}_{i}.lgb" for i in range(cfg.n_splits)]
        model_dict[g] = [lgb.Booster(model_file=p) for p in model_paths]
        features_dict[g] = model_dict[g][0].feature_name()
    need_create_features = features_dict["0-4"] + features_dict["5-12"] + features_dict["13-22"]
    # The min/max columns are inputs of the duration features below, so they
    # are requested regardless of whether a model uses them directly.
    not_drop_cols = ["0-4_elapsed_time_max", "0-4_index_max", "5-12_elapsed_time_max", "5-12_index_max", "13-22_elapsed_time_max", "13-22_index_max",
                     "0-4_elapsed_time_min", "0-4_index_min", "5-12_elapsed_time_min", "5-12_index_min", "13-22_elapsed_time_min", "13-22_index_min"]
    need_create_features = need_create_features + not_drop_cols
    need_create_features = list(set(need_create_features))

    prev_features_df = None
    for (test_sessions, sample_submission) in iter_test:
        level_group = test_sessions["level_group"].values[0]
        test = get_test_dataset(test_sessions, sample_submission, feature_select=True, need_create_features=need_create_features)
        features = features_dict[level_group]
        preds = np.zeros(len(test))

        # Every group after "0-4" reuses the features kept from the previous batch
        if level_group != "0-4":
            test = test.merge(prev_features_df, on=["session_id"], how="left")

        # Elapsed time / index spent in the previous level_group's question part
        if level_group == "5-12":
            test["0-4_question_duration_time"] = test["5-12_elapsed_time_min"] - test["0-4_elapsed_time_max"]
            test["0-4_question_duration_index"] = test["5-12_index_min"] - test["0-4_index_max"]
        elif level_group == "13-22":
            test["5-12_question_duration_time"] = test["13-22_elapsed_time_min"] - test["5-12_elapsed_time_max"]
            test["5-12_question_duration_index"] = test["13-22_index_min"] - test["5-12_index_max"]

        prev_features_df = test.groupby("session_id").head(1).drop(columns=["question", "correct"])

        # Average the fold models' predicted probabilities
        for i in range(cfg.n_splits):
            model = model_dict[level_group][i]
            preds += model.predict(test[features], num_iteration=model.best_iteration) / cfg.n_splits
        test["pred"] = preds
        preds = (preds>cfg.best_threshold).astype(int)
        sample_submission["correct"] = preds

        # Attach meta features: per-session statistics of this group's predictions
        meta_df = test.groupby("session_id")["pred"].agg(["mean", "max", "min", "std"]).reset_index()
        meta_df = meta_df.rename(columns={"mean":f"{level_group}_pred_mean", "max":f"{level_group}_pred_max", "min":f"{level_group}_pred_min", "std":f"{level_group}_pred_std"})
        prev_features_df = prev_features_df.merge(meta_df, on="session_id", how="left")

        if mode == "local_cv":
            print(sample_submission["correct"].values)
        elif mode == "kaggle_inf":
            env.predict(sample_submission)
    if mode == "local_cv":
        process_time = format(time.time() - start_time, ".1f")
        print("sample_inf処理時間 : ", process_time, "秒")

In [15]:
def valid_train_test_process_identity():
    """Check that the train and inference pipelines build identical features.

    The same sample data is pushed through ``get_train_dataset`` (one batch per
    level group) and ``get_test_dataset`` (one batch per session and level
    group). For every level group the resulting feature frames, sorted by
    ``session_id`` and ``question``, must be equal; otherwise an
    ``AssertionError`` is raised.
    """
    excluded = ["correct", "session_id", "level_group"]

    def _add_question_duration(df, group):
        # Elapsed time / index spent in the previous level_group's question part
        if group == "5-12":
            df["0-4_question_duration_time"] = df["5-12_elapsed_time_min"] - df["0-4_elapsed_time_max"]
            df["0-4_question_duration_index"] = df["5-12_index_min"] - df["0-4_index_max"]
        elif group == "13-22":
            df["5-12_question_duration_time"] = df["13-22_elapsed_time_min"] - df["5-12_elapsed_time_max"]
            df["5-12_question_duration_index"] = df["13-22_index_min"] - df["5-12_index_max"]

    def _first_row_per_session(df, cols):
        # One row per session, without the per-question column
        return df[["session_id"] + cols].groupby("session_id").head(1).drop(columns="question")

    iter_train = get_mock_iter_train()
    iter_test = get_mock_iter_test()

    print("train_iter")
    train_df_dict = {}
    train_features_dict = {}
    carry = None
    for sessions, sub in iter_train:
        group = sessions["level_group"].values[0]
        print(group)
        train = get_train_dataset(sessions, sub)
        if carry is not None:
            train = train.merge(carry, on=["session_id"], how="left")
        _add_question_duration(train, group)
        features = [col for col in train.columns if col not in excluded]
        train_df_dict[group] = train[["session_id"] + features].sort_values(["session_id", "question"], ignore_index=True)
        carry = _first_row_per_session(train, features)
        train_features_dict[group] = features

    print("test_iter")
    test_parts = {"0-4": [], "5-12": [], "13-22": []}
    carry = None
    for test_sessions, sample_submission in iter_test:
        level_group = test_sessions["level_group"].values[0]
        session_id = test_sessions["session_id"].values[0]
        print(session_id, level_group)
        # Looked up as in the train pass; replaced by the test-side list below.
        features = train_features_dict[level_group]
        test = get_test_dataset(test_sessions, sample_submission)

        if level_group != "0-4":
            test = test.merge(carry, on=["session_id"], how="left")
        _add_question_duration(test, level_group)

        features = [col for col in test.columns if col not in excluded]
        carry = _first_row_per_session(test, features)
        if level_group in test_parts:
            test_parts[level_group].append(test[["session_id"] + features])

    # Concatenate all groups first, then compare them one by one.
    test_df_dict = {
        group: pd.concat(parts, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)
        for group, parts in test_parts.items()
    }
    for group in ("0-4", "5-12", "13-22"):
        cols = train_features_dict[group]
        assert train_df_dict[group][cols].equals(test_df_dict[group][cols])

In [16]:
# Notebook driver: in local_cv mode, first assert that the train and inference
# feature pipelines agree, then train the models; inference runs in every mode.
if cfg.mode == "local_cv":
    valid_train_test_process_identity()
    run_train()
inference(cfg.mode)

train_iter
0-4


100%|██████████| 3/3 [00:00<00:00, 302.27it/s]
100%|██████████| 3/3 [00:00<00:00, 281.74it/s]


5-12


100%|██████████| 3/3 [00:00<00:00, 346.17it/s]
100%|██████████| 3/3 [00:00<00:00, 376.43it/s]
100%|██████████| 3/3 [00:00<00:00, 382.29it/s]
100%|██████████| 3/3 [00:00<00:00, 376.84it/s]


13-22


100%|██████████| 3/3 [00:00<00:00, 336.12it/s]
100%|██████████| 3/3 [00:00<00:00, 334.66it/s]
100%|██████████| 3/3 [00:00<00:00, 387.76it/s]


test_iter
20090109393214576 0-4
20090109393214576 5-12
20090109393214576 13-22
20090312143683264 0-4
20090312143683264 5-12
20090312143683264 13-22
20090312331414616 0-4
20090312331414616 5-12
20090312331414616 13-22
0-4


100%|██████████| 23562/23562 [01:44<00:00, 224.45it/s]
100%|██████████| 23562/23562 [01:40<00:00, 234.55it/s]


特徴量間の相関性が高い特徴量を572個削除
fold : 0
[LightGBM] [Info] Number of positive: 49821, number of negative: 6726
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121993
[LightGBM] [Info] Number of data points in the train set: 56547, number of used features: 622
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.881055 -> initscore=2.002456
[LightGBM] [Info] Start training from score 2.002456
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.284241	valid_1's binary_logloss: 0.305585
[200]	training's binary_logloss: 0.253209	valid_1's binary_logloss: 0.284847
[300]	training's binary_logloss: 0.234366	valid_1's binary_logloss: 0.275961
[400]	training's binary_logloss: 0.22049	valid_1's binary_logloss: 0.271781
[500]	training's binary_logloss: 0.209145	valid_1's binary_logloss: 0.270015
[600]	training's binary_logloss: 0.199145	valid_1's binary_logloss: 0.269314
[700]	training's binary_logloss: 0.190478	valid_1's bina

100%|██████████| 23562/23562 [02:52<00:00, 136.80it/s]
100%|██████████| 23562/23562 [02:49<00:00, 138.83it/s]
100%|██████████| 23562/23562 [02:49<00:00, 138.61it/s]
100%|██████████| 23562/23562 [02:51<00:00, 137.54it/s]


特徴量間の相関性が高い特徴量を1364個削除
fold : 0
[LightGBM] [Info] Number of positive: 122655, number of negative: 65835
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 354878
[LightGBM] [Info] Number of data points in the train set: 188490, number of used features: 1762
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.650724 -> initscore=0.622224
[LightGBM] [Info] Start training from score 0.622224
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.569844	valid_1's binary_logloss: 0.577237
[200]	training's binary_logloss: 0.540663	valid_1's binary_logloss: 0.55378
[300]	training's binary_logloss: 0.524789	valid_1's binary_logloss: 0.543751
[400]	training's binary_logloss: 0.513956	valid_1's binary_logloss: 0.53859
[500]	training's binary_logloss: 0.505769	valid_1's binary_logloss: 0.536073
[600]	training's binary_logloss: 0.498932	valid_1's binary_logloss: 0.534722
[700]	training's binary_logloss: 0.492827	valid_1's 

100%|██████████| 23562/23562 [03:47<00:00, 103.67it/s]
100%|██████████| 23562/23562 [03:45<00:00, 104.44it/s]
100%|██████████| 23562/23562 [03:41<00:00, 106.28it/s]


特徴量間の相関性が高い特徴量を2288個削除
fold : 0
[LightGBM] [Info] Number of positive: 67313, number of negative: 26932
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 657722
[LightGBM] [Info] Number of data points in the train set: 94245, number of used features: 3222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.714234 -> initscore=0.916038
[LightGBM] [Info] Start training from score 0.916038
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.530844	valid_1's binary_logloss: 0.547823
[200]	training's binary_logloss: 0.497631	valid_1's binary_logloss: 0.523811
[300]	training's binary_logloss: 0.478987	valid_1's binary_logloss: 0.514525
[400]	training's binary_logloss: 0.464834	valid_1's binary_logloss: 0.509444
[500]	training's binary_logloss: 0.453613	valid_1's binary_logloss: 0.506956
[600]	training's binary_logloss: 0.443993	valid_1's binary_logloss: 0.505442
[700]	training's binary_logloss: 0.435152	valid_1's 