In [119]:
from __future__ import division

import pandas as pd
import numpy as np
import graphlab as gl
import matplotlib.pyplot as plt

from  more_itertools import unique_everseen

%matplotlib inline

In [55]:
# Reference date for the data set.
# NOTE(review): presumably the origin that the integer date columns
# (DISPFROM, VALIDFROM, ...) are counted from -- confirm against the
# preprocessing code.
start_time = pd.Timestamp('2011-06-27')

# Cross-validation layout: number of folds, and number of coupons that go
# into the validation part of each fold.
valid_num, valid_size = 5, 310

# Load training data

In [56]:
# Read the prepared training SFrame from disk ...
train = gl.load_sframe('big_train.data')

# ... and preview its first five rows.
train.head(5)

CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPPERIOD,VALIDFROM
Food,Food,50,3000,1500,11,1,13
Food,Food,50,8000,3980,4,2,8
Other,Other coupon,74,3835,980,14,2,17
Delivery service,Delivery service,62,4000,1500,10,2,10
Delivery service,Delivery service,66,2980,999,12,2,12

VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT
151.0,1.0,1.0,1.0,1.0,0.0,0.0
179.0,1.0,1.0,1.0,1.0,1.0,1.0
90.0,1.0,1.0,1.0,1.0,1.0,1.0
2.0,1.0,1.0,1.0,1.0,1.0,1.0
2.0,1.0,1.0,1.0,1.0,1.0,1.0

USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,large_area_name,ken_name
1.0,1.0,0.0,関東,埼玉県
1.0,1.0,1.0,関東,埼玉県
1.0,1.0,1.0,関西,大阪府
1.0,1.0,1.0,関西,大阪府
1.0,1.0,1.0,関東,東京都

small_area_name,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,USER_ID_hash,COUPON_ID_hash
埼玉,0,1,42,552,2190cfb528ddc82f2bbb78616 3231ae5 ...,6b263844241eea98c5a97f133 5ea82af ...
埼玉,0,1,42,552,2190cfb528ddc82f2bbb78616 3231ae5 ...,c5e7bccd11979d5378fb58ee6 266e692 ...
ミナミ他,0,1,42,552,2190cfb528ddc82f2bbb78616 3231ae5 ...,9ea331b5d1983df6f0af7df46 3f5798f ...
ミナミ他,0,1,42,552,2190cfb528ddc82f2bbb78616 3231ae5 ...,e14345c8b3d5aaac2b47747bb 5f54dee ...
銀座・新橋・東京・上野 ...,0,1,42,552,2190cfb528ddc82f2bbb78616 3231ae5 ...,ec39e0718619dcd2cc402ddbc 39ce4ff ...

PURCHASE_FLG
0
0
0
0
0


# Load model

In [57]:
# Restore the previously saved GraphLab model from the 'model' directory.
model = gl.load_model('model')
# Display the name of the restored model class.
model.name()

'BoostedTreesClassifier'

In [58]:
# Show the hyper-parameters the saved model currently holds.
model.get_current_options()

{'class_weights': 'auto',
 'column_subsample': 0.8,
 'max_depth': 6,
 'max_iterations': 100,
 'min_child_weight': 0.1,
 'min_loss_reduction': 0.0,
 'random_seed': None,
 'row_subsample': 0.8,
 'step_size': 0.3}

In [62]:
# List the feature columns recorded in the saved model.
model.get('features')

['CAPSULE_TEXT',
 'GENRE_NAME',
 'PRICE_RATE',
 'CATALOG_PRICE',
 'DISCOUNT_PRICE',
 'DISPFROM',
 'DISPPERIOD',
 'VALIDFROM',
 'VALIDPERIOD',
 'USABLE_DATE_MON',
 'USABLE_DATE_TUE',
 'USABLE_DATE_WED',
 'USABLE_DATE_THU',
 'USABLE_DATE_FRI',
 'USABLE_DATE_SAT',
 'USABLE_DATE_SUN',
 'USABLE_DATE_HOLIDAY',
 'USABLE_DATE_BEFORE_HOLIDAY',
 'large_area_name',
 'ken_name',
 'small_area_name',
 'REG_DATE',
 'SEX_ID',
 'AGE',
 'WITHDRAW_DATE']

# Validation sets

In [123]:
# Build the ordered list of distinct coupon ids that were purchased at least once:
#   1. keep only rows with a non-zero PURCHASE_FLG,
#   2. order those rows by the DISPFROM column,
#   3. de-duplicate the coupon ids while preserving that order.
purchased_rows = train[train['PURCHASE_FLG'] != 0]
ordered_coupon_ids = purchased_rows.sort('DISPFROM')['COUPON_ID_hash']
coupons = list(unique_everseen(ordered_coupon_ids))

In [181]:
def _make_fold(k):
    """Return the coupon split for fold ``k``.

    Fold ``k`` validates on the ``k``-th block of ``valid_size`` coupons
    counted from the end of ``coupons`` and trains on everything before
    that block.
    """
    start = -valid_size * (k + 1)
    # Fold 0 needs an open-ended slice: a stop of ``-valid_size * 0`` is 0,
    # which would select nothing.
    stop = -valid_size * k if k else None
    return {'validation' : coupons[start : stop], 'train' : coupons[ : start]}

# The first fold is always present; the remaining ones depend on valid_num.
valid_coupons = [_make_fold(0)] + [_make_fold(k) for k in range(1, valid_num)]

# Answers

In [183]:
# Ground truth: every (user, coupon) pair whose PURCHASE_FLG is non-zero.
purchase_rows = train[train['PURCHASE_FLG'] != 0]
true = purchase_rows[['USER_ID_hash', 'COUPON_ID_hash']]

In [184]:
# Preview the first five ground-truth (user, coupon) pairs.
true.head(5)

USER_ID_hash,COUPON_ID_hash
2190cfb528ddc82f2bbb78616 3231ae5 ...,eec8856cd2792f55bb363ee8f c1e3898 ...
2190cfb528ddc82f2bbb78616 3231ae5 ...,dce5527b5da0fca4ad4dd795d b03268a ...
2190cfb528ddc82f2bbb78616 3231ae5 ...,3d9029d3ec66802b11ee2645d c16e8cb ...
462d299e58f26fc825d7579fb 89f9d8d ...,0ef236f6fa6343763c4a06fde 508dad5 ...
462d299e58f26fc825d7579fb 89f9d8d ...,2438cf398f5fad0f2b65221e4 3cf97b7 ...


# Single hold-out

In [194]:
# Use fold 0 (the block taken from the tail of the DISPFROM-ordered coupon
# list) as a single hold-out split.
holdout = valid_coupons[0]
eval_coupons, train_coupons = holdout['validation'], holdout['train']

# Ground-truth purchases restricted to the hold-out coupons.
eval_true = true.filter_by(eval_coupons, 'COUPON_ID_hash')

# Rows to fit on, and rows to score; the label column is removed from the latter.
eval_train = train.filter_by(train_coupons, 'COUPON_ID_hash')
eval_valid = train.filter_by(eval_coupons, 'COUPON_ID_hash').remove_column('PURCHASE_FLG')

# Evaluation

In [200]:
# Load a separate handle on the saved model for the evaluation experiments.
model1 = gl.load_model('model')
# Inspect the object that was just loaded (model1), not the earlier `model`
# variable, so this cell does not depend on the "Load model" section.
model1.name()

'BoostedTreesClassifier'

In [202]:
# Capture the saved model's hyper-parameters as a dict so they can be reused,
# then display them.
param = model1.get_current_options()
param

{'class_weights': 'auto',
 'column_subsample': 0.8,
 'max_depth': 6,
 'max_iterations': 100,
 'min_child_weight': 0.1,
 'min_loss_reduction': 0.0,
 'random_seed': None,
 'row_subsample': 0.8,
 'step_size': 0.3}

In [12]:
# Retrain a boosted-trees classifier on the hold-out training rows only,
# reusing the saved model's feature list and hyper-parameters.
# NOTE(review): this cell was left as an incomplete assignment (a syntax
# error); the training call below is the presumed intent -- confirm the
# target column and options before relying on the results.
model2 = gl.boosted_trees_classifier.create(eval_train,
                                            target = 'PURCHASE_FLG',
                                            features = model1.get('features'),
                                            **param)

In [13]:
# Pair every scored (user, coupon) row with its predicted probability and
# move the result into pandas for the ranking steps.
# NOTE(review): `test` and `probabilities` are not defined in any cell visible
# here -- confirm they are created earlier on a fresh kernel run.
prediction_columns = {'USER_ID_hash'   : test['USER_ID_hash'],
                      'COUPON_ID_hash' : test['COUPON_ID_hash'],
                      'probability'    : probabilities}

predictions = gl.SFrame(prediction_columns).to_dataframe()

In [14]:
# Fix the column order, then rank each user's coupons from most to least
# likely to be purchased.
# DataFrame.sort was deprecated and later removed from pandas; sort_values
# is the equivalent call.
ranking_columns = ['USER_ID_hash', 'COUPON_ID_hash', 'probability']
predictions = predictions[ranking_columns]
predictions = predictions.sort_values(by = ['USER_ID_hash', 'probability'], ascending = [True, False])
predictions.head(10)

Unnamed: 0,USER_ID_hash,COUPON_ID_hash,probability
1322088,0000b53e182165208887ba65c079fc21,5e47b887e154f746883013f863c3ffe1,0.772535
3838118,0000b53e182165208887ba65c079fc21,42cc500acba3c79883cfd40adcd5ae96,0.75272
2534357,0000b53e182165208887ba65c079fc21,9193590f0f6d2f9ea8467cfe52295107,0.691288
5096133,0000b53e182165208887ba65c079fc21,27741884a086e2864936d7ef680becc2,0.688815
3563642,0000b53e182165208887ba65c079fc21,79de77aa8c36fdf17cb3366e2084e353,0.684892
4798784,0000b53e182165208887ba65c079fc21,df7d72a87f2c99634766f0f3bef141ef,0.677157
1459326,0000b53e182165208887ba65c079fc21,c9e1dcbd8c98f919bf85ab5f2ea30a9d,0.675628
5667958,0000b53e182165208887ba65c079fc21,281326ffac6d5dd2eec24f7bde0078d7,0.67444
6697243,0000b53e182165208887ba65c079fc21,fc5f052a1bd97696fbcab35d8d974b73,0.668088
3998229,0000b53e182165208887ba65c079fc21,c8ede88786a1cb6295ac2392c6093c5b,0.667218


In [15]:
# Summarise row count, column dtypes and memory footprint of the predictions frame.
predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7090630 entries, 1322088 to 3290817
Data columns (total 3 columns):
USER_ID_hash      object
COUPON_ID_hash    object
probability       float64
dtypes: float64(1), object(2)
memory usage: 216.4+ MB


In [16]:
# Keep the first 10 rows per user. head() preserves the existing row order,
# so this relies on `predictions` already being sorted by descending
# probability within each user.
predictions = predictions.groupby(by = 'USER_ID_hash').head(n = 10)

In [17]:
# Preview only the first rows instead of rendering the whole frame.
predictions.head(10)

Unnamed: 0,USER_ID_hash,COUPON_ID_hash,probability
1322088,0000b53e182165208887ba65c079fc21,5e47b887e154f746883013f863c3ffe1,0.772535
3838118,0000b53e182165208887ba65c079fc21,42cc500acba3c79883cfd40adcd5ae96,0.752720
2534357,0000b53e182165208887ba65c079fc21,9193590f0f6d2f9ea8467cfe52295107,0.691288
5096133,0000b53e182165208887ba65c079fc21,27741884a086e2864936d7ef680becc2,0.688815
3563642,0000b53e182165208887ba65c079fc21,79de77aa8c36fdf17cb3366e2084e353,0.684892
4798784,0000b53e182165208887ba65c079fc21,df7d72a87f2c99634766f0f3bef141ef,0.677157
1459326,0000b53e182165208887ba65c079fc21,c9e1dcbd8c98f919bf85ab5f2ea30a9d,0.675628
5667958,0000b53e182165208887ba65c079fc21,281326ffac6d5dd2eec24f7bde0078d7,0.674440
6697243,0000b53e182165208887ba65c079fc21,fc5f052a1bd97696fbcab35d8d974b73,0.668088
3998229,0000b53e182165208887ba65c079fc21,c8ede88786a1cb6295ac2392c6093c5b,0.667218


In [18]:
# The probability was only needed for ranking. Select the two remaining
# columns explicitly instead of dropping in place: the cell can then be
# re-run safely and never mutates a frame derived from groupby().head().
predictions = predictions[ ['USER_ID_hash', 'COUPON_ID_hash'] ]

In [19]:
# Collapse each user's coupons into one space-separated string, keeping the
# row order already present in `predictions`.
submission = predictions.groupby('USER_ID_hash')['COUPON_ID_hash'].apply( lambda coupon_ids : ' '.join(coupon_ids))
# Turn the grouped Series back into a two-column frame and give the coupon
# column the name PURCHASED_COUPONS. One rename is enough; a second identical
# rename would find no 'COUPON_ID_hash' column and do nothing.
submission = submission.reset_index().rename(columns = {'COUPON_ID_hash' : 'PURCHASED_COUPONS'})
submission.head()

Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,5e47b887e154f746883013f863c3ffe1 42cc500acba3c...
1,00035b86e6884589ec8d28fbf2fe7757,79de77aa8c36fdf17cb3366e2084e353 c9e1dcbd8c98f...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,5e47b887e154f746883013f863c3ffe1 c9e1dcbd8c98f...
3,000cc06982785a19e2a2fdb40b1c9d59,46da51ba6dd20c514c2802f79a4e94b2 5e47b887e154f...
4,0013518e41c416cd6a181d277dd8ca0b,79de77aa8c36fdf17cb3366e2084e353 c9e1dcbd8c98f...


In [20]:
# Output names used for earlier submissions, kept for reference (disabled):
# submission.to_csv('sub_logreg_gl.csv', index = False)
# submission.to_csv('sub_logreg_gl_200iter_weights.csv', index = False)

# Write the current submission without the pandas index column.
# NOTE(review): the file name mentions "200iter" while the saved model's options
# show max_iterations = 100, and "rows0,8" uses a comma where "cols0.8" uses a
# dot -- confirm which model produced these predictions and whether the name
# is intended.
submission.to_csv('sub_boosttree_gl_200iter_weights_cols0.8_rows0,8.csv', index = False)