In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, cohen_kappa_score
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import seaborn as sns
import scipy as sp
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
def qwk(a1, a2):
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return np.round(1 - o / e, 8)

In [3]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        
    def predict(self, X, coef):
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        return self.coef_['x']

# install

In [4]:
%%time
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
#specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 1min 5s, sys: 7.7 s, total: 1min 12s
Wall time: 1min 12s


In [5]:
keep_id = train[train.type == "Assessment"][['installation_id']].drop_duplicates()
train = pd.merge(train, keep_id, on="installation_id", how="inner")
train = train[train.installation_id.isin(train_labels.installation_id.unique())]

In [6]:
assess_title = ['Mushroom Sorter (Assessment)', 'Bird Measurer (Assessment)',
       'Cauldron Filler (Assessment)', 'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)']
def remove_index_calc(df):
    additional_remove_index = []
    for i, session in df.groupby('installation_id', sort=False):
        last_row = session.index[-1]
        session = session[session.title.isin(assess_title)]
        first_row = session.index[-1] + 1
        for j in range(first_row, last_row+1):
            additional_remove_index.append(j)                
    return additional_remove_index
additional_remove_index = remove_index_calc(train)
train = train[~train.index.isin(additional_remove_index)]

# Preprocess and Feature engineering

In [7]:
length_list = [19, 17,  61 ,-999,61, -999, 109, 25, -999, 80, -999, -999, 156, 26, -999, 126, -999, -999, 20, -999, 60, -999, 22, -999,
              -999, -999, -999, -999, 18, -999, 72, -999, 118, 24, 142, -999, -999, -999, 19, 61] 
title_list = ["Welcome to Lost Lagoon!",  "Tree Top City - Level 1", "Ordering Spheres", "All Star Sorting",  "Costume Box", "Fireworks (Activity)", 
              "12 Monkeys", "Tree Top City - Level 2", "Flower Waterer (Activity)", "Pirate's Tale",  "Mushroom Sorter (Assessment)",  "Air Show",  
              "Treasure Map", "Tree Top City - Level 3", "Crystals Rule", "Rulers", "Bug Measurer (Activity)",  "Bird Measurer (Assessment)",  
              "Magma Peak - Level 1",   "Sandcastle Builder (Activity)",  "Slop Problem",  "Scrub-A-Dub",  "Watering Hole (Activity)", 
              "Magma Peak - Level 2" , "Dino Drink", "Bubble Bath",  "Bottle Filler (Activity)",  "Dino Dive", "Cauldron Filler (Assessment)", 
              "Crystal Caves - Level 1", "Chow Time", "Balancing Act",  "Chicken Balancer (Activity)",  "Lifting Heavy Things", 
              "Crystal Caves - Level 2" , "Honey Cake" ,  "Happy Camel", "Cart Balancer (Assessment)",  "Leaf Leader",  "Crystal Caves - Level 3",  
              "Heavy, Heavier, Heaviest"]
title_list = pd.DataFrame(title_list)
length_list = pd.DataFrame(length_list)
length_df = pd.concat([title_list, length_list], axis=1)
length_df.columns = ["title", "length"]

In [8]:
%%time
magma_short_video = [28, 2]
magma_long_video = [22]
tree_short_video = [38, 5, 24]
tree_long_video = [43, 39, 36, 40, 31, 21]
crys_short_video =  [26, 15, 25]
crys_long_video = [18, 27, 29, 19]

def encode_title(train, test):
    train = pd.merge(train, length_df, on="title", how="left")
    test = pd.merge(test, length_df, on="title", how="left")
    train.fillna(-999)
    test.fillna(-999)
    
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    list_of_title_eventcode = list(set(train['title_event_code'].unique()).union(set(test['title_event_code'].unique())))

    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))

    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)

    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    train['date'] = train['timestamp'].dt.date
    test['date'] = test['timestamp'].dt.date
    
    train["x"] = train["event_data"].apply(lambda x: json.loads(x)["coordinates"]["x"] if "\"coordinates\"" in x else -999)
    train["y"] = train["event_data"].apply(lambda x: json.loads(x)["coordinates"]["y"] if "\"coordinates\"" in x else -999)
    test["x"] = test["event_data"].apply(lambda x: json.loads(x)["coordinates"]["x"] if "\"coordinates\"" in x else -999)
    test["y"] = test["event_data"].apply(lambda x: json.loads(x)["coordinates"]["y"] if "\"coordinates\"" in x else -999)

    #train["stage_height"] = train["event_data"].apply(lambda x: json.loads(x)["coordinates"]["stage_height"] if "\"coordinates\"" in x else -999)
    #test["stage_height"] = test["event_data"].apply(lambda x: json.loads(x)["coordinates"]["stage_height"] if "\"coordinates\"" in x else -999)
    #train["stage_width"] = train["event_data"].apply(lambda x: json.loads(x)["coordinates"]["stage_width"] if "\"coordinates\"" in x else -999)
    #test["stage_width"] = test["event_data"].apply(lambda x: json.loads(x)["coordinates"]["stage_width"] if "\"coordinates\"" in x else -999)
    #train["growth"] = train["event_data"].apply(lambda x: sum(json.loads(x)["growth"]) if "\"growth\"" in x else np.nan)
    #test["growth"] = test["event_data"].apply(lambda x: sum(json.loads(x)["growth"]) if "\"growth\"" in x else np.nan)
    
    train["target_distances"] = train["event_data"].apply(lambda x: sum(json.loads(x)["target_distances"]) if "\"target_distances\"" in x else np.nan)
    test["target_distances"] = test["event_data"].apply(lambda x: sum(json.loads(x)["target_distances"]) if "\"target_distances\"" in x else np.nan)
    train["distance"] = train["event_data"].apply(lambda x: json.loads(x)["distance"] if "\"distance\"" in x else np.nan)
    test["distance"] = test["event_data"].apply(lambda x: json.loads(x)["distance"] if "\"distance\"" in x else np.nan)
    train["misses"] = train["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    test["misses"] = test["event_data"].apply(lambda x: json.loads(x)["misses"] if "\"misses\"" in x else np.nan)
    train["time_played"] = train["event_data"].apply(lambda x: json.loads(x)["time_played"] if "\"time_played\"" in x else np.nan)
    test["time_played"] = test["event_data"].apply(lambda x: json.loads(x)["time_played"] if "\"time_played\"" in x else np.nan)
    #train["group"] = train["event_data"].apply(lambda x: json.loads(x)["group"] if "\"group\"" in x else np.nan)
    #test["group"] = test["event_data"].apply(lambda x: json.loads(x)["group"] if "\"group\"" in x else np.nan)
    
    train_timediff = train['timestamp'].diff(1) / np.timedelta64(1,'s')
    train_timediff = list(train_timediff[1:])
    train_timediff.append(0)
    train["timediff"] = train_timediff
    test_timediff = test['timestamp'].diff(1) / np.timedelta64(1,'s')
    test_timediff = list(test_timediff[1:])
    test_timediff.append(0)
    test["timediff"] = test_timediff
    
    train["clip_video"] = train["title"].apply(lambda x: "magma_intro" if x in magma_short_video else(
                                                    "magma_long" if x in magma_long_video else(
                                                    "tree_intro" if x in tree_short_video else(
                                                    "tree_long" if x in tree_long_video else(
                                                    "crys_intro" if x in crys_short_video else(
                                                    "crys_long" if x in crys_long_video else "Non_video"))))))
    test["clip_video"] = test["title"].apply(lambda x: "magma_intro" if x in magma_short_video else(
                                                    "magma_long" if x in magma_long_video else(
                                                    "tree_intro" if x in tree_short_video else(
                                                    "tree_long" if x in tree_long_video else(
                                                    "crys_intro" if x in crys_short_video else(
                                                    "crys_long" if x in crys_long_video else "Non_video"))))))
    
    train["click"] = train["event_data"].apply(lambda x: 1 if "\"coordinates\"" in x else 0)
    test["click"] = test["event_data"].apply(lambda x: 1 if "\"coordinates\"" in x else 0)
    
    train["media"] = train["event_data"].apply(lambda x: json.loads(x)["media_type"] if "\"media_type\"" in x else "None")
    test["media"] = test["event_data"].apply(lambda x: json.loads(x)["media_type"] if "\"media_type\"" in x else "None")
                
    return train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode

train, test, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, activities_world, list_of_title_eventcode = encode_title(train, test)

CPU times: user 3min 28s, sys: 8.55 s, total: 3min 36s
Wall time: 3min 36s


In [9]:
def date_calc(tmp):
    tmp = list(tmp)
    all_login_days = len(tmp)
    max_cont_days = 1
    count = 1
    prev = tmp[0]
    for i in range(1,len(tmp)):
        if (tmp[i] - prev).days == 1:
            count += 1
            max_cont_days = max(max_cont_days, count)
        else:
            count = 1
        prev = tmp[i]
    return all_login_days, max_cont_days

def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    event_code_count = {str(eve): 0 for eve in list_of_event_code}
    user_world_count = {"world_"+str(wor) : 0 for wor in activities_world.values()} #0: None, 1:CRY, 2:TREE, 3:MEGA
    clip_videos = {"magma_intro":0, "magma_long":0, "tree_intro": 0, "tree_long": 0, "crys_intro":0, "crys_long":0}
    title_eventcode_count = {str(ele): 0 for ele in list_of_title_eventcode}
    media_count = {'accumulated_audio':0, 'accumulated_animation': 0}
    
    last_session_time_sec = 0
    all_assessments = []
    accuracy_groups = {"0":0, "1":0, "2":0, "3":0}
    accumulated_accuracy_group = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    click_counts = 0
    accumulated_clicks = 0
    counter = 0
    cumulative_gametime = 0
    xrange = 0
    yrange = 0 
    play_game = []
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    timediff = np.array([])
    session_udates = set()
    complete_videos = 0
    miss = 0
    #total_growth = 0
    total_target_distances = 0
    total_distance = 0
    total_time_played = 0
        
    for i, session in user_sample.groupby('game_session', sort=False):      
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_world = session["world"].iloc[0]
        
        # get current session time in seconds
        if session_type != 'Assessment':
            if session_type == "Clip":
                n_of_clip_videos = Counter(session['clip_video'])
                for key in n_of_clip_videos.keys():
                    if key != "Non_video":
                        clip_videos[str(key)] += n_of_clip_videos[key]
                
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent
            user_world_count["world_"+str(session_world)] += session.shape[0]
            
            click_counts += np.sum(session["click"])
            
            n_of_medias = Counter(session['media'])
            for key in n_of_medias.keys():
                if key != "None":
                    media_count["accumulated_"+str(key)] += n_of_medias[key]
            
            cumulative_gametime += session.iloc[-1]["game_time"] / 1000
            
            play_game.append(session_title)
            
            if session[session.x != -999].shape[0] >= 2:
                tmp_xrange = session["x"].max() - session[session.x != -999]["x"].min()
                tmp_yrange = session["y"].max() - session[session.x != -999]["y"].min()       
            else:
                tmp_xrange = 0
                tmp_yrange = 0
            xrange = max(tmp_xrange, xrange)
            yrange = max(tmp_yrange, yrange)
            
            timediff = np.concatenate([timediff, session["timediff"].values]) 
            
            complete_videos += session[(session["length"] >=0) & (session["timediff"] >= session["length"])].shape[0]
            
        if (session_type == 'Assessment') & (test_set or len(session)>1): # test set or session in train_label
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum() # true in target assess
            false_attempts = all_attempts['event_data'].str.contains('false').sum() # false in target assessment
            
            # from start of installation_id to the start of target assessment ------------------------
            features = user_activities_count.copy() # appearance of each type without duplicates
            features.update(time_spent_each_act.copy()) # cumulative gameplay time in each title
            features.update(event_code_count.copy()) # cumulative appearance of each event_code with duplicates
            features.update(user_world_count.copy()) # appearance of world with duplicates
            features.update(clip_videos.copy()) # appearance of each clip video with duplicates
            features.update(title_eventcode_count.copy()) # apperance of combi of title and event_code
            features.update(clip_videos.copy())
            features.update(media_count.copy())
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            accumulated_clicks = click_counts # num of clicks
            features['accumulated_clicks'] = accumulated_clicks
            features['accumulated_actions'] = accumulated_actions # num of rows
            if len(session_udates) == 0:
                features["total_login_days"] = 0
                features["cont_login_days"] = 0
            else:
                all_login_days, cont_login_days = date_calc(session_udates)
                features["total_login_days"] = all_login_days
                features["cont_login_days"] = cont_login_days
                
            features["complete_videos"] = complete_videos
            features["misses"] = miss
            #features["total_growth"] = total_growth
            features["total_target_distances"] = total_target_distances
            features["total_distance"] = total_distance
            features["total_time_played"] = total_time_played
            
            # from the end of previous assessment to the start of next assessment --------------------------
            features["total_gametime_bet_assess"] = cumulative_gametime # total gameplay time
            cumulative_gametime = 0
            features["no_playgames_bet_assess"] = len(set(play_game)) # num of kinds of games
            play_game.clear()
            features["xrange_bet_assess"] = xrange
            features["yrange_bet_assess"] = yrange
            xrange = 0
            yrange = 0
            features["touch_range"] = features["xrange_bet_assess"] * features["yrange_bet_assess"] 
            if len(timediff) == 0 or len(timediff) == 1: # mean and std of timespan between each activity
                features['timediff_mean_bet_assess'] = 0
                features["timediff_std_bet_assess"] = 0
                features["timediff_max_bet_assess"] = 0 
            else:
                features['timediff_mean_bet_assess'] = np.mean(timediff)
                features["timediff_std_bet_assess"] = np.std(timediff)
                features["timediff_max_bet_assess"] = np.max(timediff)
            timediff = np.array([])
                        
            # unique type --------------------------------------------------------
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            
            # nums in target assessment data ------------------------------------------
            if durations == []: #span of timestamp in target assessment
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds) 
            
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[str(features['accuracy_group'])] += 1
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
        
        n_of_event_codes = Counter(session['event_code']) # output dic of each event_code counts
        for key in n_of_event_codes.keys():
            event_code_count[str(key)] += n_of_event_codes[key]
            
        n_of_title_eventcode = Counter(session['title_event_code']) 
        for key in n_of_title_eventcode.keys():
            title_eventcode_count[str(key)] += n_of_title_eventcode[key]
        miss += np.sum(session["misses"])
        total_target_distances += np.sum(session["target_distances"])
        total_distance += np.sum(session["distance"])
        total_time_played += np.sum(session["time_played"])

        #total_growth += np.sum(session["growth"])
        
        session_udates = session_udates.union(session["date"].unique())
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type
    if test_set:
        return all_assessments[-1]
    return all_assessments

In [10]:
new_train = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    new_train += get_data(user_sample)
new_train = pd.DataFrame(new_train)
print(new_train.shape)

HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…


(17690, 527)


In [11]:
del train

In [12]:
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)   
new_test = pd.DataFrame(new_test)
print(new_test.shape)

HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…


(1000, 527)


In [13]:
del test

# Feature selection and modelling

In [14]:
correlations = new_train.corr().abs()
correlations = correlations.mask(np.tril(np.ones(correlations.shape)).astype(np.bool))
correlations = correlations.stack().reset_index()
corr_columns = ["level_0", "level_1", "value"]
correlations.columns = corr_columns
correlations = correlations.sort_values("value", ascending=False).reset_index(drop=True)

high_corr = correlations[correlations["value"] == 1]

high_corr_features = []
for i in range(high_corr.shape[0]):
    if high_corr.iloc[i]["level_0"] not in high_corr_features and high_corr.iloc[i]["level_1"] not in high_corr_features:
        high_corr_features.append(high_corr.iloc[i]["level_0"])
    elif high_corr.iloc[i]["level_0"] in high_corr_features and high_corr.iloc[i]["level_1"] not in high_corr_features:
        high_corr_features.append(high_corr.iloc[i]["level_1"])
    elif high_corr.iloc[i]["level_0"] not in high_corr_features and high_corr.iloc[i]["level_1"] in high_corr_features:
        high_corr_features.append(high_corr.iloc[i]["level_0"])

In [15]:
X_train = new_train.drop(['accuracy_group'],axis=1) 
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train["installation_id"]))
X_train["installation_id"] = lbl.transform(list(X_train["installation_id"]))
remove_features = ["4235", "2070", "cont_login_days"]
for i in X_train.columns:
    if X_train[i].std() == 0 and i not in remove_features:
        remove_features.append(i)
X_train = X_train.drop(remove_features, axis=1)
X_train = X_train[sorted(X_train.columns.tolist())]
y_train = new_train.accuracy_group
print(X_train.shape)

(17690, 497)


In [16]:
n_folds=5
groups = np.array(X_train.installation_id)
gkf=GroupKFold(n_splits = n_folds)
coefficients = []
models = []
train_qwk_scores = []
test_qwk_scores = []
lgbm_params = {
    "objective" : "regression",
    "metric" : "rmse",
    "tree_learner": "serial",
    "max_depth" : 5,
    "boosting": 'gbdt',
    "num_leaves" : 13,
    "learning_rate" : 0.01,
}
features_list = [i for i in X_train.columns if i != "installation_id"]
feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
for i , (train_index, test_index) in enumerate(gkf.split(X_train, y_train, groups)):
    optR = OptimizedRounder()
    X_train2 = X_train.iloc[train_index,:]
    y_train2 = y_train.iloc[train_index]
    X_train2 = X_train2.drop(['installation_id'],axis=1)
    
    X_test2 = X_train.iloc[test_index,:]
    y_test2 = y_train.iloc[test_index]
    test2 = pd.concat([X_test2, y_test2], axis=1)
    test2 = test2.groupby('installation_id').apply(lambda x: x.sample(1, random_state=1223)).reset_index(drop=True)
    X_test2 = test2.drop(["accuracy_group", "installation_id"], axis=1)
    y_test2 = test2["accuracy_group"]
    
    lgb_train = lgb.Dataset(X_train2, y_train2)
    lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
    
    clf = lgb.train(
        lgbm_params, lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=100000,
        early_stopping_rounds=10,
    )
    
    models.append(clf)
    train_predict = clf.predict(X_train2, num_iteration = clf.best_iteration)
    test_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
    
    optR.fit(train_predict.reshape(-1,), y_train2)
    tmp_coefficients = optR.coefficients()
    print("fold_"+str(i)+" coefficients: ", tmp_coefficients)
    opt_train_preds = optR.predict(train_predict.reshape(-1, ), tmp_coefficients)
    train_qwk_score = qwk(y_train2, opt_train_preds)
    print("training qwk: ", train_qwk_score)
    opt_test_preds = optR.predict(test_predict.reshape(-1, ), tmp_coefficients)
    test_qwk_score = qwk(y_test2, opt_test_preds)
    print("validation qwk: ", test_qwk_score)
    train_qwk_scores.append(train_qwk_score)
    test_qwk_scores.append(test_qwk_score)
    coefficients.append(tmp_coefficients)
    
    feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
    
feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

print("                             ")
print("-----------------------------")
print('coefficients: ', coefficients)
print('train qwk list:', train_qwk_scores)
print('train qwk average score:',np.mean(train_qwk_scores))
print('valid qwk list: ', test_qwk_scores)
print('valid qwk average score:',np.mean(test_qwk_scores))

[1]	valid_0's rmse: 1.27633
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 1.27331
[3]	valid_0's rmse: 1.27033
[4]	valid_0's rmse: 1.26729
[5]	valid_0's rmse: 1.26442
[6]	valid_0's rmse: 1.26158
[7]	valid_0's rmse: 1.25877
[8]	valid_0's rmse: 1.25613
[9]	valid_0's rmse: 1.25335
[10]	valid_0's rmse: 1.25081
[11]	valid_0's rmse: 1.24809
[12]	valid_0's rmse: 1.24548
[13]	valid_0's rmse: 1.24285
[14]	valid_0's rmse: 1.24047
[15]	valid_0's rmse: 1.23793
[16]	valid_0's rmse: 1.2356
[17]	valid_0's rmse: 1.23335
[18]	valid_0's rmse: 1.23093
[19]	valid_0's rmse: 1.22876
[20]	valid_0's rmse: 1.22642
[21]	valid_0's rmse: 1.22425
[22]	valid_0's rmse: 1.22202
[23]	valid_0's rmse: 1.2198
[24]	valid_0's rmse: 1.21784
[25]	valid_0's rmse: 1.21569
[26]	valid_0's rmse: 1.2136
[27]	valid_0's rmse: 1.21154
[28]	valid_0's rmse: 1.20967
[29]	valid_0's rmse: 1.20768
[30]	valid_0's rmse: 1.20572
[31]	valid_0's rmse: 1.20399
[32]	valid_0's rmse: 1.20209
[33]	valid_0's rmse: 1.

In [17]:
feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True).head(500)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,session_title,1051,1103,1131,1040,1059,1076.8,34.52767,0.032065
1,accumulated_accuracy_group,402,459,440,505,438,448.8,33.605952,0.07488
2,4070,210,180,199,206,201,199.2,10.342147,0.051918
3,Chow Time_4070,151,119,104,211,160,149.0,37.132196,0.249209
4,3121,117,151,140,146,185,147.8,21.939918,0.148443
5,3020,137,138,105,186,107,134.6,29.316207,0.217802
6,Chest Sorter (Assessment)_3021,110,143,135,146,108,128.4,16.255461,0.1266
7,Clip,135,103,135,133,131,127.4,12.289833,0.096467
8,misses,107,111,83,189,125,123.0,35.665109,0.28996
9,All Star Sorting_2025,89,48,114,118,104,94.6,25.358233,0.268057


In [18]:
print(X_train.shape)
print("")
print(", ".join('%s'%x for x in X_train.columns))

(17690, 497)

0, 1, 12 Monkeys_2000, 2, 2000, 2010, 2020, 2025, 2030, 2035, 2040, 2050, 2060, 2075, 2080, 2081, 2083, 3, 3010, 3020, 3021, 3110, 3120, 3121, 4010, 4020, 4021, 4022, 4025, 4030, 4031, 4035, 4040, 4045, 4050, 4070, 4080, 4090, 4095, 4100, 4110, 4220, 4230, 5000, 5010, Activity, Air Show, Air Show_2000, Air Show_2020, Air Show_2030, Air Show_2060, Air Show_2070, Air Show_2075, Air Show_3010, Air Show_3020, Air Show_3021, Air Show_3110, Air Show_3120, Air Show_3121, Air Show_4010, Air Show_4020, Air Show_4070, Air Show_4090, Air Show_4100, Air Show_4110, All Star Sorting, All Star Sorting_2000, All Star Sorting_2020, All Star Sorting_2025, All Star Sorting_2030, All Star Sorting_3010, All Star Sorting_3020, All Star Sorting_3021, All Star Sorting_3110, All Star Sorting_3120, All Star Sorting_3121, All Star Sorting_4010, All Star Sorting_4020, All Star Sorting_4030, All Star Sorting_4035, All Star Sorting_4070, All Star Sorting_4080, All Star Sorting_4090, All Star Sorting_4

# prediction

In [19]:
X_test = new_test.drop(["installation_id", "accuracy_group"], axis=1)
X_test = X_test.drop(remove_features, axis=1)
X_test = X_test[sorted(X_test.columns.tolist())]
pred_value = np.zeros([X_test.shape[0]])
test_coefficients = np.mean(coefficients, axis=0)
for model in models:
    pred_value += model.predict(X_test, num_iteration = model.best_iteration) /n_folds
test_pred_class= optR.predict(pred_value.reshape(-1, ), test_coefficients)
sample_submission["accuracy_group"] = test_pred_class
sample_submission.to_csv('submission.csv', index=False)

In [20]:
sample_submission["accuracy_group"].value_counts()

3    455
2    287
1    145
0    113
Name: accuracy_group, dtype: int64