# Feature Generating and Engineering
This notebook is for generating the features put into the model.

In [1]:
# Packages Load
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
import time
from multiprocessing import Pool

In [2]:
# Read Files
events = pd.read_pickle('data/events.pkl')
attr = pd.read_pickle('data/attr_long.pkl')
session = pd.read_pickle('data/session.pkl')

In [3]:
submission = pd.read_csv('data/submission.csv')

In [4]:
events.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,2201961907282901522,4,1543713091129,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,2201961907282901522,6,1543713093116,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


Since our predictions are based on users, we need to create features on user base.

In [5]:
session.head()

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,1477540082628742048,1540120743010,Asia/Manila,28800000.0,13499724,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,13,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,8184875317380844086,1542671625528,Asia/Manila,28800000.0,32788010,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,41,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,4706180700083856343,1538997913013,Asia/Manila,28800000.0,5872534,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,4,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [6]:
print(datetime.utcfromtimestamp(session.start_timestamp.min()/1000).strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.utcfromtimestamp(session.start_timestamp.max()/1000).strftime('%Y-%m-%d %H:%M:%S'))

2018-10-01 07:00:04
2018-12-14 23:59:59


We will create labels during period Dec 1st and Dec 14th, and use features from Oct 1st to Nov 30th to train the model.

In [7]:
# Get the time stamp to split data.
print(datetime(2018,11,30,23,59,59).timestamp()*1000)
print(datetime(2018,12,7,23,59,59).timestamp()*1000)


1543622399000.0
1544227199000.0


In [8]:
# Split the data specifically for df feature generation
events_training = events[events.event_timestamp<=1543622399000]
session_training = session[session.start_timestamp<=1543622399000]

## Training Labels

In [9]:
# Label1 7-day purchase
purchase_user_7 = set(events[(events.event=='8') & (events.event_timestamp>1543622399000) & (events.event_timestamp<=1544227199000)].user_id_hash)
# Label2 14-day purchase
purchase_user_14 = set(events[(events.event=='8') & (events.event_timestamp>1543622399000)].user_id_hash)

In [10]:
len(purchase_user_7)

4729

In [11]:
len(purchase_user_14)

6126

#### Create Data Frame for df

In [43]:
training = pd.DataFrame(data = list(set(events.user_id_hash)&set(session.user_id_hash)), 
                        columns = ['user_id_hash'])

#### Create Labels

In [44]:
def label_create(df, label_1='user_purchase_binary_7_days', label_2='user_purchase_binary_14_days'):
    df[label_1] = df['user_id_hash'].apply(lambda x: x in purchase_user_7)
    df[label_2] = df['user_id_hash'].apply(lambda x: x in purchase_user_14)
    return df

In [45]:
training  = label_create(training)

In [46]:
training.shape

(619423, 3)

In [47]:
training.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,a1b7e06ceb4d33e7e7152ccca344c793700d52c8464e9c...,False,False
1,4ae81c836e37bdde4408585ced58a5602e0f946286e36e...,False,False
2,b524a272f9dc01b6dfa57ecd23fe8a85b628c7ac8cac7e...,False,False
3,ad244155c390b9b6c87da920c3d68cbafb7fd7968efa9c...,False,False
4,a632aad4c752164a6835f25eb325151f661b1ab549482c...,False,False


## Training features
We will create user-based features one by one.

In [48]:
day_gap = (1544227199000-1543622399000)/7

In [51]:
def feature_generate(df, events_df, session_df):
    window_max = events_df['event_timestamp'].max()
    window_1_week_before = window_max - 7*day_gap
    window_2_week_before = window_max - 14*day_gap
    window_3_week_before = window_max - 21*day_gap
    
    print(df.shape)
    # Feature one: event_count
    count = events_df.groupby('user_id_hash').event.count()
    df['event_count'] = df.user_id_hash.map(count)
    
    # Feature two: purchase_count_1_week
    count = events_df[(events_df.event=='8')&(events_df.event_timestamp>window_1_week_before)].groupby('user_id_hash').event.count()
    df['purchase_count_1_week'] = df.user_id_hash.map(count)
    # Feature two: purchase_count_2_week
    count = events_df[(events_df.event=='8')&(events_df.event_timestamp>window_2_week_before)&(events_df.event_timestamp<=window_1_week_before)].groupby('user_id_hash').event.count()
    df['purchase_count_2_week'] = df.user_id_hash.map(count)
    # Feature two: purchase_count_3_week
    count = events_df[(events_df.event=='8')&(events_df.event_timestamp>window_3_week_before)&(events_df.event_timestamp<=window_2_week_before)].groupby('user_id_hash').event.count()
    df['purchase_count_3_week'] = df.user_id_hash.map(count)
    # Feature two: purchase_count_4_week
    count = events_df[(events_df.event=='8')&(events_df.event_timestamp<=window_3_week_before)].groupby('user_id_hash').event.count()
    df['purchase_count_4_week'] = df.user_id_hash.map(count)
    print(df.shape)
    # Feature three: session_count    
    count = events_df.groupby('user_id_hash').session_id.count()
    df['session_count'] = df.user_id_hash.map(count)
    
    # Feature four: country
    country = session_df.groupby('user_id_hash').country.first()
    df['country'] = df.user_id_hash.map(country)
    
    # Feature five: OS
    os = session_df.groupby('user_id_hash').os_name.first()
    df['os_name'] = df.user_id_hash.map(os)

    # Feature 11: city
    
    city = session_df.groupby('user_id_hash').city.first()
    df['city'] = df.user_id_hash.map(city)
    
    
    # Feature 24: region
    
    region = session_df.groupby('user_id_hash').region.first()
    df['region'] = df.user_id_hash.map(region)
    print(df.shape)    
    # Feature six: session_duration
    duration = session_df.groupby('user_id_hash').previous_sessions_duration.mean()
    df['mean_sessions_duration'] = df.user_id_hash.map(duration)
    
    # Feature seven: spend
    spend = events_df.groupby('user_id_hash').event_value.sum()
    df['spend'] = df.user_id_hash.map(spend)
    
    # Feature eight: event_gap    
    event_gap = events_df.groupby('user_id_hash').event_timestamp.max()
    event_gap = window_max - event_gap
    df['event_gap'] = df.user_id_hash.map(event_gap)
        
    # Feature nine: session_gap
    session_gap = session_df.groupby('user_id_hash').start_timestamp.max()
    session_gap = window_max - session_gap
    df['session_gap'] = df.user_id_hash.map(session_gap)    

    # Feature 25: purchase_gap
    purchase_gap = events_df[events_df.event=='8'].groupby('user_id_hash').event_timestamp.max()
    purchase_gap = window_max - purchase_gap
    df['purchase_gap'] = df.user_id_hash.map(purchase_gap) 
    
    # Feature ten: life_time
    life_time = session_df.groupby('user_id_hash').user_created_timestamp.max()
    life_time = window_max - life_time
    df['life_time'] = df.user_id_hash.map(life_time)     
    print(df.shape)    
#     # Feature 12: num_places
#     num_places = session_df.groupby('user_id_hash').latitude.nunique('latitude')
#     df['num_places'] = df.user_id_hash.map(num_places) 
    
    # Feature 13: event_45    
    count = events_df[events_df.event=='45'].groupby('user_id_hash').event.count()
    df['event_45'] = df.user_id_hash.map(count)
    
    # Feature 14: event_1
    count = events_df[events_df.event=='1'].groupby('user_id_hash').event.count()
    df['event_1'] = df.user_id_hash.map(count)
    
    # Feature 15: event_5
    count = events_df[events_df.event=='5'].groupby('user_id_hash').event.count()
    df['event_5'] = df.user_id_hash.map(count)
    
    # Feature 16: event_6
    count = events_df[events_df.event=='6'].groupby('user_id_hash').event.count()
    df['event_6'] = df.user_id_hash.map(count)
    
    # Feature 17: event_14
    count = events_df[events_df.event=='14'].groupby('user_id_hash').event.count()
    df['event_14'] = df.user_id_hash.map(count)
    
    # Feature 18: event_4
    count = events_df[events_df.event=='4'].groupby('user_id_hash').event.count()
    df['event_4'] = df.user_id_hash.map(count)
    
    # Feature 19: event_40
    count = events_df[events_df.event=='40'].groupby('user_id_hash').event.count()
    df['event_40'] = df.user_id_hash.map(count)
    
    # Feature 20: event_7
    count = events_df[events_df.event=='7'].groupby('user_id_hash').event.count()
    df['event_7'] = df.user_id_hash.map(count)
    
    # Feature 21: event_41
    count = events_df[events_df.event=='41'].groupby('user_id_hash').event.count()
    df['event_41'] = df.user_id_hash.map(count)
    
    # Feature 22: event_3
    count = events_df[events_df.event=='3'].groupby('user_id_hash').event.count()
    df['event_3'] = df.user_id_hash.map(count)
    
    # Feature 23: event_42
    count = events_df[events_df.event=='42'].groupby('user_id_hash').event.count()
    df['event_42'] = df.user_id_hash.map(count)
    print(df.shape)    
    # Feature 23: event_42
    attr_1 = attr[attr.attribute==1].groupby('user_id_hash').attribute_value.astype(np.float32).mean()
    df['attr_1'] = df.user_id_hash.map(attr_1)
    
    attr_13 = attr[attr.attribute==13].groupby('user_id_hash').attribute_value.astype(np.float32).mean()
    df['attr_13'] = df.user_id_hash.map(attr_13)
    
    attr_14 = attr[attr.attribute==14].groupby('user_id_hash').attribute_value.astype(np.float32).mean()
    df['attr_14'] = df.user_id_hash.map(attr_14)
    
    attr_15 = attr[attr.attribute==15].groupby('user_id_hash').attribute_value.astype(np.float32).mean()
    df['attr_15'] = df.user_id_hash.map(attr_15)
    
    attr_16 = attr[attr.attribute==16].groupby('user_id_hash').attribute_value.astype(np.float32).mean()
    df['attr_16'] = df.user_id_hash.map(attr_16)

    return df

In [52]:
training = feature_generate(training, events_training, session_training)

(619423, 5)
(619423, 8)
(619423, 13)
(619423, 19)
(619423, 30)


AttributeError: Cannot access callable attribute 'astype' of 'SeriesGroupBy' objects, try using the 'apply' method

In [None]:
training.to_pickle('training_att.pkl')

## Prediction Features

Lets wrap up all feature generating procedures and use it in prediction dataset.

In [67]:
submit = feature_generate(submission, events, session)

In [68]:
submit2 = feature_encoding(submit, events, session)

In [69]:
submit.to_pickle('prediction.pkl')

In [41]:
submission.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02


## Model

In [202]:
X = training.drop(columns =['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name','city','region'])
y = training['user_purchase_binary_7_days']
X_test = submit2.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name','city','region'])

In [204]:
X.shape

(619423, 25)

In [205]:
X_test.shape

(312568, 25)

In [206]:
params = {'num_leaves': 8,
         'min_data_in_leaf': 40,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.01,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8,
         'bagging_seed': 11,
         'reg_alpha': 1.7,
         'reg_lambda': 5,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'subsample': 0.8,
#          'min_gain_to_split': 0.01,
#          'min_child_weight': 20,
         'num_threads': 8}

In [207]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [None]:
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
        
    model = lgb.train(params,train_data,num_boost_round=20000,
                      valid_sets = [train_data, valid_data], 
                      verbose_eval=300, 
                      early_stopping_rounds = 200)
            
    #y_pred_valid = model.predict(X_valid)
    prediction += model.predict(X_test, num_iteration=model.best_iteration)/n_fold

Fold 0 started at Tue Feb 19 00:27:27 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.975062	valid_1's auc: 0.9722
[600]	training's auc: 0.977128	valid_1's auc: 0.973809
[900]	training's auc: 0.978228	valid_1's auc: 0.974338
[1200]	training's auc: 0.978891	valid_1's auc: 0.974539
[1500]	training's auc: 0.97951	valid_1's auc: 0.974745
[1800]	training's auc: 0.980019	valid_1's auc: 0.974835
[2100]	training's auc: 0.980511	valid_1's auc: 0.974906
[2400]	training's auc: 0.981	valid_1's auc: 0.975005
[2700]	training's auc: 0.981468	valid_1's auc: 0.975052
[3000]	training's auc: 0.981894	valid_1's auc: 0.97512


In [None]:
np.save('lgb_prediction_1', prediction1)


In [122]:
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
        
    model = lgb.train(params,train_data,num_boost_round=20000,
                      valid_sets = [train_data, valid_data], 
                      verbose_eval=300, 
                      early_stopping_rounds = 200)
            
    #y_pred_valid = model.predict(X_valid)
    prediction += model.predict(X_test, num_iteration=model.best_iteration)/n_fold

Fold 0 started at Mon Feb 18 04:12:04 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.974345	valid_1's auc: 0.971998
[600]	training's auc: 0.976271	valid_1's auc: 0.973568
[900]	training's auc: 0.977183	valid_1's auc: 0.97389
[1200]	training's auc: 0.978002	valid_1's auc: 0.974197
[1500]	training's auc: 0.978594	valid_1's auc: 0.974382
[1800]	training's auc: 0.979117	valid_1's auc: 0.974516
[2100]	training's auc: 0.979524	valid_1's auc: 0.974564
[2400]	training's auc: 0.979943	valid_1's auc: 0.974607
Early stopping, best iteration is:
[2344]	training's auc: 0.979854	valid_1's auc: 0.97465
Fold 1 started at Mon Feb 18 04:14:35 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.974003	valid_1's auc: 0.974068
[600]	training's auc: 0.975958	valid_1's auc: 0.975329
[900]	training's auc: 0.976866	valid_1's auc: 0.975858
[1200]	training's auc: 0.977602	valid_1's auc: 0.976209
[1500]	training's auc: 0.978206	v

In [None]:
X = training.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name', 'city', 'region'])
y = training['user_purchase_binary_14_days']
X_test = submit.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name', 'city', 'region'])

In [None]:
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
        
    model = lgb.train(params,train_data,num_boost_round=20000,
                      valid_sets = [train_data, valid_data], 
                      verbose_eval=300, 
                      early_stopping_rounds = 200)
            
    #y_pred_valid = model.predict(X_valid)
    prediction += model.predict(X_test, num_iteration=model.best_iteration)/n_fold

In [None]:
prediction2 = prediction
np.save('lgb_prediction_2', prediction2)

In [148]:
params = {'num_leaves': 8,
         'min_data_in_leaf': 40,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.01,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8,
         'bagging_seed': 11,
         'reg_alpha': 1.7,
         'reg_lambda': 5,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'subsample': 0.8,
#          'min_gain_to_split': 0.01,
#          'min_child_weight': 20,
         'num_threads': 8}
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [181]:
X = training.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name', 'city', 'region'])
y = training['user_purchase_binary_7_days']
X_test = submit.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name', 'city', 'region'])

In [182]:
import xgboost as xgb
import catboost as cab

In [183]:
cab.__version__

'0.12.2'

In [None]:
model = cab.CatBoostClassifier(loss_function="Logloss",
                           eval_metric="AUC",
                           task_type="CPU",
                           learning_rate=0.01,
                           iterations=10000,
                           random_seed=42,
                           od_type="Iter",
                           depth=10,
                           early_stopping_rounds=300
                          )

In [None]:
from sklearn.metrics import roc_auc_score
target = y.apply(lambda x: int(x))
y_valid_pred = 0 * target
y_test_pred = 0
for idx, (train_index, valid_index) in enumerate(folds.split(X,y)):
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_train, X_valid = X.iloc[train_index,:], X.iloc[valid_index,:]
    _train = cab.Pool(X_train, label=y_train)
    _valid = cab.Pool(X_valid, label=y_valid)
    print( "\nFold ", idx)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=200,
                          plot=True
                         )
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "auc = ", roc_auc_score(y_valid, pred) )
    y_valid_pred.iloc[valid_index] = pred
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
y_test_pred /= n_split

In [None]:
np.save('cab_prediction_1', y_test_pred)


In [None]:
y = training['user_purchase_binary_14_days']
from sklearn.metrics import roc_auc_score
target = y.apply(lambda x: int(x))
y_valid_pred = 0 * target
y_test_pred = 0
for idx, (train_index, valid_index) in enumerate(folds.split(X,y)):
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_train, X_valid = X.iloc[train_index,:], X.iloc[valid_index,:]
    _train = cab.Pool(X_train, label=y_train)
    _valid = cab.Pool(X_valid, label=y_valid)
    print( "\nFold ", idx)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=200,
                          plot=True
                         )
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "auc = ", roc_auc_score(y_valid, pred) )
    y_valid_pred.iloc[valid_index] = pred
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
y_test_pred /= n_split

In [None]:
np.save('cab_prediction_2', y_test_pred)

In [187]:
X_test.head()

Unnamed: 0,event_count,purchase_count,session_count,mean_sessions_duration,spend,event_gap,session_gap,life_time,num_places,event_45,...,event_5,event_6,event_14,event_4,event_40,event_7,event_41,event_3,event_42,purchase_gap
0,26.0,0.0,2.0,356544.0,0.0,4169025000.0,4169058000.0,4219028000.0,1.0,17.0,...,1.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,
1,50.0,0.0,1.0,0.0,0.0,1962124000.0,2068505000.0,2068521000.0,1.0,31.0,...,3.0,3.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,
2,31.0,0.0,4.0,388631.25,0.0,3259103000.0,3448174000.0,4132000000.0,2.0,19.0,...,1.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,
3,207.0,0.0,10.0,9200582.9,0.0,2712367000.0,2714804000.0,4368928000.0,2.0,142.0,...,9.0,8.0,6.0,7.0,5.0,4.0,5.0,4.0,2.0,
4,5.0,0.0,1.0,0.0,0.0,5332535000.0,5507037000.0,5507037000.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [None]:
np.save('lgb_prediction_1', prediction1)y_test_pred

In [None]:
X = training.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name', 'city', 'region'])
y = training['user_purchase_binary_7_days']
X_test = submit.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name', 'city', 'region'])

In [127]:
submission['user_purchase_binary_7_days'] = prediction1
submission['user_purchase_binary_14_days'] = prediction2

In [128]:
submission.to_csv('submission0217.csv', index=False)

In [129]:
!kaggle competitions submit predict-in-app-purchase -f submission0217.csv -m "Using full 11 features and model on lgbm"

100%|██████████████████████████████████████| 32.6M/32.6M [00:02<00:00, 16.1MB/s]
Successfully submitted to Predict in-app purchases