In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
train = pd.read_csv('./data/train.csv')
labels = pd.read_csv('./data/train_labels.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv')

# Feature Engineering

In [27]:
room_fq_cols = ['historicalsociety_closet', 'historicalsociety_basement',
       'historicalsociety_entry', 'historicalsociety_collection',
       'historicalsociety_stacks', 'kohlcenter_halloffame',
       'capitol_0_hall', 'historicalsociety_closet_dirty',
       'historicalsociety_frontdesk', 'humanecology_frontdesk',
       'drycleaner_frontdesk', 'library_frontdesk', 'library_microfiche',
       'capitol_1_hall', 'historicalsociety_cage',
       'historicalsociety_collection_flag', 'wildlife_center',
       'flaghouse_entry', 'capitol_2_hall']

def generate_elapsed_time_feature(df, df_merged):
    temp = df.groupby(['session_id', 'event_name', 'level_group'])['elapsed_time'].agg(['sum']).reset_index()
    temp = temp.rename(columns={'sum': 'elapsed_time'})
    temp = temp.pivot(index='session_id', columns='event_name', values='elapsed_time')
    temp.columns = [col + '_elapsed_time' for col in temp.columns]
    temp = temp.reset_index()
    df_merged = temp

    temp = df.groupby(['session_id', 'room_fqid', 'level_group'])['elapsed_time'].agg(['sum']).reset_index()
    temp = temp.rename(columns={'sum': 'elapsed_time'})
    temp = temp.pivot(index='session_id', columns='room_fqid', values='elapsed_time')
    temp.columns = [col + '_elapsed_time' for col in temp.columns]
    temp = temp.reset_index()

    df_merged = pd.merge(df_merged, temp, how='left', on='session_id')

    for room_col in [col + '_elapsed_time' for col in room_fq_cols]:
        if room_col not in df_merged.columns:
            df_merged[room_col] = 0

    return df_merged

def generate_feature_counts(df, df_merged):
    temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()
    df_merged = pd.merge(df_merged, temp, how='left', on='session_id')

    temp = df.groupby(['session_id', 'level'])['level'].agg(['count']).reset_index()
    temp = temp.rename(columns={'count': 'level_counts'})
    temp = temp.pivot(index='session_id', columns='level', values='level_counts')
    temp.columns = ['level_' + str(col) + '_counts' for col in temp.columns]
    temp = temp.reset_index()
    df_merged = pd.merge(df_merged, temp, how='left', on='session_id')

    temp = df.groupby(['session_id', 'event_name'])['event_name'].agg(['count']).reset_index()
    temp = temp.rename(columns={'count': 'event_count'})
    temp = temp.pivot(index='session_id', columns='event_name', values='event_count')
    temp.columns = [col + '_counts' for col in temp.columns]
    temp = temp.reset_index()

    df_merged = pd.merge(df_merged, temp, how='left', on='session_id')
    return df_merged

def generate_notebook_feature(df, df_merged):
    temp = df.groupby(['session_id', 'event_name', 'name'])['elapsed_time'].agg(['count']).reset_index()
    temp = temp.rename(columns={'count': 'event_count'})
    temp = temp[(temp['name'].isin(['open', 'prev', 'next']))]
    temp = temp.pivot(index='session_id', columns='name', values='event_count')
    temp.columns = ['notebook_' + col + '_counts' for col in temp.columns]
    temp = temp.reset_index()
    df_merged = pd.merge(df_merged, temp, how='left', on='session_id')

    if 'notebook_open_counts' not in df_merged.columns:
        df_merged['notebook_open_counts'] = 0
    if 'notebook_prev_counts' not in df_merged.columns:
        df_merged['notebook_prev_counts'] = 0
    if 'notebook_next_counts' not in df_merged.columns:
        df_merged['notebook_next_counts'] = 0

    return df_merged

def generate_hover_duration(df, df_merged):
    temp = df.groupby(['session_id', 'event_name'])['hover_duration'].agg(['sum']).reset_index()
    temp = temp.rename(columns={'sum': 'hover_duration'})
    temp = temp[temp['hover_duration'] > 0]
    temp = temp.pivot(index='session_id', columns='event_name', values='hover_duration')
    temp.columns = [col + '_duration' for col in temp.columns]
    temp = temp.reset_index()
    df_merged = pd.merge(df_merged, temp, how='left', on='session_id')
    return df_merged

def generate_features(df, grp):
    df_merged = pd.DataFrame()
    df_merged = generate_elapsed_time_feature(df, df_merged)
    df_merged = generate_feature_counts(df, df_merged)
    df_merged = generate_notebook_feature(df, df_merged)
    df_merged = generate_hover_duration(df, df_merged)
    df_merged['level_group'] = grp

    return df_merged

def feature_engineering(df, grp):

    final_df = pd.DataFrame()

    col_use = ['session_id', 'elapsed_time', 'event_name', 'name', 'level',
        'hover_duration', 'room_fqid', 'fullscreen', 'hq', 'music', 'level_group']

    df = df[col_use]
    
    df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')

    final_df = pd.concat([final_df, generate_features(df, grp)], ignore_index=True)

    final_df.fillna(0, inplace=True)
    return final_df

In [28]:
train['room_fqid'] = train['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')

train['room_fqid'].unique()

  train['room_fqid'] = train['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  train['room_fqid'] = train['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')


array(['historicalsociety_closet', 'historicalsociety_basement',
       'historicalsociety_entry', 'historicalsociety_collection',
       'historicalsociety_stacks', 'kohlcenter_halloffame',
       'capitol_0_hall', 'historicalsociety_closet_dirty',
       'historicalsociety_frontdesk', 'humanecology_frontdesk',
       'drycleaner_frontdesk', 'library_frontdesk', 'library_microfiche',
       'capitol_1_hall', 'historicalsociety_cage',
       'historicalsociety_collection_flag', 'wildlife_center',
       'flaghouse_entry', 'capitol_2_hall'], dtype=object)

In [29]:
labels[['session_id', 'question_number']] = labels['session_id'].str.split('_', 1, expand=True)
labels['question_number'] = labels['question_number'].apply(lambda x: int(x[1:]))


  labels[['session_id', 'question_number']] = labels['session_id'].str.split('_', 1, expand=True)


# Train LightGBM Baseline

In [30]:
from sklearn.metrics import f1_score

def f1_score_lgb(preds, dtrain):
    y_true = dtrain.get_label()
    y_pred = np.round(preds)
    return 'f1', f1_score(y_true, y_pred), True

In [31]:
from sklearn.model_selection import StratifiedKFold

# We use stratifiedKFold since data is inbalance
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=320)

In [32]:
# For baseline model simplified the parameter
params = {'objective': 'binary',
          'learning_rate': 0.01,
          'force_row_wise': True,
          'random_state': 0}

In [33]:
oof_val_preds = {}
# oof_test_preds = {}
for i in range(1,19):
    oof_val_preds[i] = np.zeros(int(train.shape[0] / 3))
#     oof_test_preds[i] = np.zeros(int(test_df.shape[0] / 3)) 

In [34]:
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

# For testing we only use level group 0-4 only, which is question 1 to 3
for i in range(1, 19):
    print(f"training/predicting question {i}")
    # Train, validate, and predict models in an OOF way

    if i <= 3:
        X = train[train['level_group'] == '0-4']
        X = feature_engineering(X, '0-4')
#         X_test = test_df[test_df['level_group'] == '0-4']
    elif i > 3 and i <=13:
        X = train[train['level_group'] == '5-12']
        X = feature_engineering(X, '5-12')
#         X_test = test_df[test_df['level_group'] == '5-12']
    else:
        X = train[train['level_group'] == '13-22']
        X = feature_engineering(X, '13-22')
#         X_test = test_df[test_df['level_group'] == '13-22']
    
    y = labels[labels['question_number'] == i]['correct'].values
    X.set_index('session_id', inplace=True)
    X.drop('level_group', axis=1, inplace=True)
#     X_test.set_index('session_id', inplace=True)
#     X_test.drop('level_group', axis=1, inplace=True)

    for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        # Output a phrase that identifies each fold
        print('#'*40, f'fold {idx+1} / fold {folds.n_splits}', '#'*40)
        
        # Train data
        X_train, y_train = X.iloc[train_idx], y[train_idx]
        # Validation data
        X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

        # Convert them to LightGBM dataset
        dtrain = lgb.Dataset(X_train, y_train)
        dvalid = lgb.Dataset(X_valid, y_valid)


        lgb_model = lgb.train(params=params,
                            train_set=dtrain,
                            num_boost_round=1000,
                            valid_sets=dvalid,
                            feval=f1_score_lgb,
                            callbacks=[early_stopping(stopping_rounds=100),
                                        log_evaluation(100)])
        
        # OOF prediction using test data
#         oof_test_preds[i] += lgb_model.predict(X_test)/folds.n_splits
        
        # Prediction of validation data target value for model performance evaluation
        oof_val_preds[i][valid_idx] += lgb_model.predict(X_valid)
        
        # Normalized Gini coefficient for prediction probability of validation data
        y_pred = np.round(oof_val_preds[i][valid_idx])
        score = f1_score(y_valid, y_pred)
        lgb_model.save_model(f'basline_{i}.txt')
        print(f'fold {idx+1} f1 score : {score}\n')

training/predicting question 1


  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13712, number of negative: 5137
[LightGBM] [Info] Total Bins 6887
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.727466 -> initscore=0.981802
[LightGBM] [Info] Start training from score 0.981802
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.558215	valid_0's f1: 0.841768
[200]	valid_0's binary_logloss: 0.549288	valid_0's f1: 0.84314
Early stopping, best iteration is:
[191]	valid_0's binary_logloss: 0.549793	valid_0's f1: 0.843649
fold 1 f1 score : 0.8436490181456625

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13713, number of negative: 5136
[LightGBM] [Info] Total Bins 6875
[LightGBM] [Info] Number of data points in th

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 18450, number of negative: 399
[LightGBM] [Info] Total Bins 6872
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.978832 -> initscore=3.833858
[LightGBM] [Info] Start training from score 3.833858
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0953564	valid_0's f1: 0.989277
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.10255	valid_0's f1: 0.989277
fold 1 f1 score : 0.9892772892987347

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 18450, number of negative: 399
[LightGBM] [Info] Total Bins 6880
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 42
[LightGBM] [Info] [

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 17605, number of negative: 1244
[LightGBM] [Info] Total Bins 6884
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.934002 -> initscore=2.649851
[LightGBM] [Info] Start training from score 2.649851
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.232789	valid_0's f1: 0.96588
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.242812	valid_0's f1: 0.96588
fold 1 f1 score : 0.9658804168952276

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 17605, number of negative: 1244
[LightGBM] [Info] Total Bins 6886
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 42
[LightGBM] [Info] [

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 15046, number of negative: 3803
[LightGBM] [Info] Total Bins 9757
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798239 -> initscore=1.375322
[LightGBM] [Info] Start training from score 1.375322
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.457691	valid_0's f1: 0.889573
[200]	valid_0's binary_logloss: 0.446111	valid_0's f1: 0.892627
[300]	valid_0's binary_logloss: 0.441489	valid_0's f1: 0.893643
[400]	valid_0's binary_logloss: 0.439814	valid_0's f1: 0.893874
Early stopping, best iteration is:
[343]	valid_0's binary_logloss: 0.44062	valid_0's f1: 0.894452
fold 1 f1 score : 0.8944517992538211

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Num

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 10334, number of negative: 8515
[LightGBM] [Info] Total Bins 9766
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.548252 -> initscore=0.193610
[LightGBM] [Info] Start training from score 0.193610
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.656123	valid_0's f1: 0.690989
Early stopping, best iteration is:
[21]	valid_0's binary_logloss: 0.675876	valid_0's f1: 0.713812
fold 1 f1 score : 0.7138124729943828

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 10334, number of negative: 8515
[LightGBM] [Info] Total Bins 9751
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 14626, number of negative: 4223
[LightGBM] [Info] Total Bins 9767
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.775956 -> initscore=1.242255
[LightGBM] [Info] Start training from score 1.242255
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.495927	valid_0's f1: 0.874102
[200]	valid_0's binary_logloss: 0.490025	valid_0's f1: 0.876555
[300]	valid_0's binary_logloss: 0.488887	valid_0's f1: 0.876457
Early stopping, best iteration is:
[274]	valid_0's binary_logloss: 0.489116	valid_0's f1: 0.877098
fold 1 f1 score : 0.8770983948045583

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 14626, number of negative: 4223
[LightGBM] [In

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13874, number of negative: 4975
[LightGBM] [Info] Total Bins 9698
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.736060 -> initscore=1.025591
[LightGBM] [Info] Start training from score 1.025591
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.550699	valid_0's f1: 0.847818
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.576562	valid_0's f1: 0.847959
fold 1 f1 score : 0.8479589342459056

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13874, number of negative: 4975
[LightGBM] [Info] Total Bins 9765
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info]

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 11634, number of negative: 7215
[LightGBM] [Info] Total Bins 9763
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.617221 -> initscore=0.477770
[LightGBM] [Info] Start training from score 0.477770
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.654085	valid_0's f1: 0.765901
Early stopping, best iteration is:
[95]	valid_0's binary_logloss: 0.654246	valid_0's f1: 0.766291
fold 1 f1 score : 0.7662906436629064

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 11634, number of negative: 7215
[LightGBM] [Info] Total Bins 9756
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13878, number of negative: 4971
[LightGBM] [Info] Total Bins 9754
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.736272 -> initscore=1.026684
[LightGBM] [Info] Start training from score 1.026684
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.548304	valid_0's f1: 0.849147
[200]	valid_0's binary_logloss: 0.543166	valid_0's f1: 0.850392
Early stopping, best iteration is:
[155]	valid_0's binary_logloss: 0.544598	valid_0's f1: 0.851122
fold 1 f1 score : 0.8511218544688236

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13878, number of negative: 4971
[LightGBM] [Info] Total Bins 9752
[LightGBM] [Info] Number of data points in t

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 9527, number of negative: 9322
[LightGBM] [Info] Total Bins 9754
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505438 -> initscore=0.021753
[LightGBM] [Info] Start training from score 0.021753
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.660749	valid_0's f1: 0.646581
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.691044	valid_0's f1: 0.682488
fold 1 f1 score : 0.682487978904917

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 9527, number of negative: 9322
[LightGBM] [Info] Total Bins 9717
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [b

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 12132, number of negative: 6717
[LightGBM] [Info] Total Bins 9751
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.643642 -> initscore=0.591205
[LightGBM] [Info] Start training from score 0.591205
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.633066	valid_0's f1: 0.784329
Early stopping, best iteration is:
[79]	valid_0's binary_logloss: 0.634625	valid_0's f1: 0.785649
fold 1 f1 score : 0.7856490541422049

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 12132, number of negative: 6717
[LightGBM] [Info] Total Bins 9753
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 16266, number of negative: 2583
[LightGBM] [Info] Total Bins 9753
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.862964 -> initscore=1.840126
[LightGBM] [Info] Start training from score 1.840126
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.387878	valid_0's f1: 0.926424
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.399345	valid_0's f1: 0.926424
fold 1 f1 score : 0.9264236902050114

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 16266, number of negative: 2583
[LightGBM] [Info] Total Bins 9756
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info]

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 5185, number of negative: 13664
[LightGBM] [Info] Total Bins 9755
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.275081 -> initscore=-0.968995
[LightGBM] [Info] Start training from score -0.968995
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.556703	valid_0's f1: 0.0197869
[200]	valid_0's binary_logloss: 0.550579	valid_0's f1: 0.0877067
[300]	valid_0's binary_logloss: 0.548914	valid_0's f1: 0.127248
[400]	valid_0's binary_logloss: 0.548084	valid_0's f1: 0.145847
[500]	valid_0's binary_logloss: 0.548121	valid_0's f1: 0.163782
Early stopping, best iteration is:
[483]	valid_0's binary_logloss: 0.548021	valid_0's f1: 0.162775
fold 1 f1 score : 0.1627751834556371

######################################## fold 2 / 

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13339, number of negative: 5510
[LightGBM] [Info] Total Bins 11651
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.707677 -> initscore=0.884127
[LightGBM] [Info] Start training from score 0.884127
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.577364	valid_0's f1: 0.830089
[200]	valid_0's binary_logloss: 0.57141	valid_0's f1: 0.828776
Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.575484	valid_0's f1: 0.831126
fold 1 f1 score : 0.831126245428175

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13339, number of negative: 5510
[LightGBM] [Info] Total Bins 11633
[LightGBM] [Info] Number of data points in t

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 9067, number of negative: 9782
[LightGBM] [Info] Total Bins 11633
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.481033 -> initscore=-0.075903
[LightGBM] [Info] Start training from score -0.075903
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.658875	valid_0's f1: 0.593382
[200]	valid_0's binary_logloss: 0.651256	valid_0's f1: 0.605612
[300]	valid_0's binary_logloss: 0.649021	valid_0's f1: 0.608883
[400]	valid_0's binary_logloss: 0.648561	valid_0's f1: 0.611279
[500]	valid_0's binary_logloss: 0.64881	valid_0's f1: 0.613798
Early stopping, best iteration is:
[409]	valid_0's binary_logloss: 0.648538	valid_0's f1: 0.612403
fold 1 f1 score : 0.6124031007751939

######################################## fold 2 / fol

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13852, number of negative: 4997
[LightGBM] [Info] Total Bins 11637
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.734893 -> initscore=1.019592
[LightGBM] [Info] Start training from score 1.019592
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.576711	valid_0's f1: 0.847114
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.578356	valid_0's f1: 0.847114
fold 1 f1 score : 0.8471135029354209

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 13852, number of negative: 4997
[LightGBM] [Info] Total Bins 11644
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Inf

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 12964, number of negative: 5885
[LightGBM] [Info] Total Bins 11630
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.687782 -> initscore=0.789770
[LightGBM] [Info] Start training from score 0.789770
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.612266	valid_0's f1: 0.815085
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.620596	valid_0's f1: 0.815085
fold 1 f1 score : 0.8150848522941546

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 12965, number of negative: 5884
[LightGBM] [Info] Total Bins 11632
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Inf

  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['room_fqid'] = df['room_fqid'].str.replace('tunic.', '').str.replace('.', '_')
  temp = df.groupby(['session_id'])['fullscreen','hq', 'music'].sum().reset_index()


######################################## fold 1 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 17919, number of negative: 930
[LightGBM] [Info] Total Bins 11641
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.950661 -> initscore=2.958432
[LightGBM] [Info] Start training from score 2.958432
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.184886	valid_0's f1: 0.974655
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.196498	valid_0's f1: 0.974655
fold 1 f1 score : 0.9746546285217013

######################################## fold 2 / fold 5 ########################################
[LightGBM] [Info] Number of positive: 17919, number of negative: 930
[LightGBM] [Info] Total Bins 11631
[LightGBM] [Info] Number of data points in the train set: 18849, number of used features: 57
[LightGBM] [Info]