In [15]:
from __future__ import division

import pandas as pd
import numpy as np
import graphlab as gl
import sys

sys.path.append('/home/galina/PythonProjects/common_functions')
import preprocess

In [16]:
start_time = pd.to_datetime('2011-06-27')
end_time = pd.to_datetime('2012-12-30')
time_length = int((end_time - start_time).days)

## user_list.csv

In [17]:
users = pd.read_csv('user_list.csv', index_col = None)
users.drop('PREF_NAME', axis = 1, inplace = True)
users['SEX_ID'] = users['SEX_ID'].map({'f':0, 'm': 1})

users['REG_DATE'] = pd.to_datetime(users['REG_DATE'])
users['WITHDRAW_DATE'] = pd.to_datetime(users['WITHDRAW_DATE'])
users['WITHDRAW_DATE'] =users['WITHDRAW_DATE'].fillna(end_time)

# REG_DATE to number of days since start time. Negative numbers to zero.
users['REG_DATE'] = users['REG_DATE'].map(lambda x : 0 if x < start_time else (x - start_time).days)
users['WITHDRAW_DATE'] = users['WITHDRAW_DATE'].map(lambda x : (x - start_time).days)

In [18]:
# users that deactivated their accounts before test coupons became valid. Can be shown nothing (or anything?)
users_withdraw = set(users[users['WITHDRAW_DATE'] < 367])
len(users_withdraw)

5

## coupon_list_train_translated.csv
## coupon_list_test_translated.csv

In [19]:
train = pd.read_csv('coupon_list_train_translated.csv', index_col = None)
train['test'] = 0
test = pd.read_csv('coupon_list_test_translated.csv', index_col = None)
test['test'] = 1

joined = pd.concat([train, test])

In [20]:
# USABLE_DATE_SOMETHING: replace 2 and NaN with 1
for col in joined.iloc[:, 11:20]:
    joined[col] = joined[col].map({0:0, 1:1, 2:1})
    joined[col] = joined[col].fillna(1)

In [21]:
# drop DISPEND and VALIDEND (because start and length are known)
joined.drop('DISPEND', axis = 1, inplace = True)
joined.drop('VALIDEND', axis = 1, inplace = True)

In [22]:
# when valid period unknown, assume it's the same as dispensing period
joined['VALIDFROM'] = joined['VALIDFROM'].fillna(joined['DISPFROM'])
joined['VALIDPERIOD'] = joined['VALIDPERIOD'].fillna(joined['DISPPERIOD'])

In [23]:
# DISPFROM and VALIDFROM to number of days since start time
joined['DISPFROM'] = pd.to_datetime(joined['DISPFROM'])
joined['DISPFROM'] = joined['DISPFROM'].map(lambda x: (x - start_time).days)

joined['VALIDFROM'] = pd.to_datetime(joined['VALIDFROM'])
joined['VALIDFROM'] = joined['VALIDFROM'].map(lambda x: (x - start_time).days)

In [24]:
joined.head()

Unnamed: 0,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPPERIOD,VALIDFROM,VALIDPERIOD,USABLE_DATE_MON,...,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,large_area_name,ken_name,small_area_name,COUPON_ID_hash,test
0,Food,Food,50,3000,1500,11,1,13,151,1,...,0,0,1,1,0,関東,埼玉県,埼玉,6b263844241eea98c5a97f1335ea82af,0
1,Food,Food,51,2080,1000,4,1,6,154,1,...,1,1,1,1,1,関東,千葉県,千葉,cc031f250e8bad1e24060263b9fc0ddd,0
2,Food,Food,50,7000,3500,15,3,19,179,0,...,1,1,1,1,1,関東,千葉県,千葉,ba5e9b7453ca52ff711635a5d2e8102d,0
3,Food,Food,50,3000,1500,12,2,15,142,1,...,0,0,1,1,1,関東,千葉県,千葉,3e1ffbedca3569f9e8032d401e8cb4e6,0
4,Food,Food,50,2000,1000,8,1,10,176,1,...,0,0,1,1,0,関東,千葉県,千葉,782934b6c815b4030ea204eef7d4a734,0


## coupon_visit_train.csv

In [25]:
visit = pd.read_csv('coupon_visit_train.csv')
visit.drop(['PAGE_SERIAL', 'REFERRER_hash', 'SESSION_ID_hash', 'I_DATE'], axis = 1, inplace = True)

visit = visit.drop_duplicates()

In [26]:
index_id = visit.drop_duplicates().groupby(['USER_ID_hash', 'VIEW_COUPON_ID_hash']).sum().reset_index()
index_id['PURCHASE_FLG'] = index_id['PURCHASE_FLG'].map(lambda x: 1 if x > 0 else 0)
index_id.rename(columns={'VIEW_COUPON_ID_hash' : 'COUPON_ID_hash'}, inplace = True)
index_id.head()

Unnamed: 0,USER_ID_hash,COUPON_ID_hash,PURCHASE_FLG
0,0000b53e182165208887ba65c079fc21,0645faa156f34104e6d8910160868f9f,0
1,0000b53e182165208887ba65c079fc21,18097cd25ab6b7e8eb0481b0e3a3cfd8,0
2,0000b53e182165208887ba65c079fc21,1b581f2ed53f2f2eafbc1560db640194,0
3,0000b53e182165208887ba65c079fc21,1d04e76c44c231d5d05dc1634d20fe8c,0
4,0000b53e182165208887ba65c079fc21,2962b9f2ec7ecde9daddf53dd3118526,0


## coupon_detail_train.csv

In [27]:
detail = pd.read_csv('coupon_detail_train.csv')
detail.drop(['ITEM_COUNT', 'I_DATE', 'SMALL_AREA_NAME', 'PURCHASEID_hash'], axis = 1, inplace = True)
detail['PURCHASE_FLG'] = 1
detail.head()

Unnamed: 0,USER_ID_hash,COUPON_ID_hash,PURCHASE_FLG
0,d9dca3cb44bab12ba313eaa681f663eb,34c48f84026e08355dc3bd19b427f09a,1
1,560574a339f1b25e57b0221e486907ed,767673b7a777854a92b73b0934ddfae7,1
2,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2,1
3,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2,1
4,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2,1


In [28]:
index_id = pd.concat([index_id, detail]).drop_duplicates()

In [29]:
# users that only viewed coupons but didn't purchase anything.

users_view = index_id.groupby('USER_ID_hash').sum().reset_index()
users_view = users_view[users_view['PURCHASE_FLG'] == 0]
users_view = set(users_view['USER_ID_hash'])
len(users_view)

51

In [30]:
big = joined.merge(index_id)
big = big.merge(users)

# Create new training data

In [58]:
big_train = big[big['test'] == 0]

# number of used coupons in train set
len(big_train[big_train['PURCHASE_FLG'] == 1]['COUPON_ID_hash'].unique())

19368

In [59]:
big_train.drop(['test'], axis = 1, inplace = True)

cols = list(big_train.columns.values)

cols.remove('COUPON_ID_hash')
cols.remove('USER_ID_hash')
cols.remove('PURCHASE_FLG')

cols = cols + ['USER_ID_hash', 'COUPON_ID_hash', 'PURCHASE_FLG']

big_train = big_train[cols]

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [60]:
big_train.head()

Unnamed: 0,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPPERIOD,VALIDFROM,VALIDPERIOD,USABLE_DATE_MON,...,large_area_name,ken_name,small_area_name,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,USER_ID_hash,COUPON_ID_hash,PURCHASE_FLG
0,Food,Food,50,3000,1500,11,1,13,151,1,...,関東,埼玉県,埼玉,0,1,42,552,2190cfb528ddc82f2bbb786163231ae5,6b263844241eea98c5a97f1335ea82af,0
1,Food,Food,50,8000,3980,4,2,8,179,1,...,関東,埼玉県,埼玉,0,1,42,552,2190cfb528ddc82f2bbb786163231ae5,c5e7bccd11979d5378fb58ee6266e692,0
2,Other,Other coupon,74,3835,980,14,2,17,90,1,...,関西,大阪府,ミナミ他,0,1,42,552,2190cfb528ddc82f2bbb786163231ae5,9ea331b5d1983df6f0af7df463f5798f,0
3,Delivery service,Delivery service,62,4000,1500,10,2,10,2,1,...,関西,大阪府,ミナミ他,0,1,42,552,2190cfb528ddc82f2bbb786163231ae5,e14345c8b3d5aaac2b47747bb5f54dee,0
4,Delivery service,Delivery service,66,2980,999,12,2,12,2,1,...,関東,東京都,銀座・新橋・東京・上野,0,1,42,552,2190cfb528ddc82f2bbb786163231ae5,ec39e0718619dcd2cc402ddbc39ce4ff,0


In [61]:
big_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1654063 entries, 0 to 1654687
Data columns (total 28 columns):
CAPSULE_TEXT                  1654063 non-null object
GENRE_NAME                    1654063 non-null object
PRICE_RATE                    1654063 non-null int64
CATALOG_PRICE                 1654063 non-null int64
DISCOUNT_PRICE                1654063 non-null int64
DISPFROM                      1654063 non-null int64
DISPPERIOD                    1654063 non-null int64
VALIDFROM                     1654063 non-null int64
VALIDPERIOD                   1654063 non-null float64
USABLE_DATE_MON               1654063 non-null float64
USABLE_DATE_TUE               1654063 non-null float64
USABLE_DATE_WED               1654063 non-null float64
USABLE_DATE_THU               1654063 non-null float64
USABLE_DATE_FRI               1654063 non-null float64
USABLE_DATE_SAT               1654063 non-null float64
USABLE_DATE_SUN               1654063 non-null float64
USABLE_DATE_HOLIDAY  

In [62]:
for col in big_train:
    print col, len(big_train[col].unique()), big_train[col].dtype

CAPSULE_TEXT 24 object
GENRE_NAME 13 object
PRICE_RATE 71 int64
CATALOG_PRICE 2415 int64
DISCOUNT_PRICE 1115 int64
DISPFROM 363 int64
DISPPERIOD 18 int64
VALIDFROM 380 int64
VALIDPERIOD 180 float64
USABLE_DATE_MON 2 float64
USABLE_DATE_TUE 2 float64
USABLE_DATE_WED 2 float64
USABLE_DATE_THU 2 float64
USABLE_DATE_FRI 2 float64
USABLE_DATE_SAT 2 float64
USABLE_DATE_SUN 2 float64
USABLE_DATE_HOLIDAY 2 float64
USABLE_DATE_BEFORE_HOLIDAY 2 float64
large_area_name 9 object
ken_name 47 object
small_area_name 55 object
REG_DATE 364 int64
SEX_ID 2 int64
AGE 66 int64
WITHDRAW_DATE 293 int64
USER_ID_hash 22833 object
COUPON_ID_hash 19413 object
PURCHASE_FLG 2 int64


In [63]:
big_train = gl.SFrame(big_train)
big_train.save('big_train.data', format = 'binary')

# Create new testing data

In [38]:
test = joined[joined['test'] == 1]
users['test'] = 1

big_test = pd.merge(test, users, on = 'test', how = 'outer').drop('test', axis = 1)

users.drop('test', axis = 1, inplace = True)

In [39]:
# big_test.drop('small_area_name', axis = 1, inplace= True)
big_test = big_test[ [col for col in big_test if col != 'COUPON_ID_hash'] + ['COUPON_ID_hash'] ]
big_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7090630 entries, 0 to 7090629
Data columns (total 27 columns):
CAPSULE_TEXT                  object
GENRE_NAME                    object
PRICE_RATE                    int64
CATALOG_PRICE                 int64
DISCOUNT_PRICE                int64
DISPFROM                      int64
DISPPERIOD                    int64
VALIDFROM                     int64
VALIDPERIOD                   float64
USABLE_DATE_MON               float64
USABLE_DATE_TUE               float64
USABLE_DATE_WED               float64
USABLE_DATE_THU               float64
USABLE_DATE_FRI               float64
USABLE_DATE_SAT               float64
USABLE_DATE_SUN               float64
USABLE_DATE_HOLIDAY           float64
USABLE_DATE_BEFORE_HOLIDAY    float64
large_area_name               object
ken_name                      object
small_area_name               object
REG_DATE                      int64
SEX_ID                        int64
AGE                           int6

In [40]:
for col in big_test:
    print col, len(big_test[col].unique()), big_test[col].dtype

CAPSULE_TEXT 17 object
GENRE_NAME 12 object
PRICE_RATE 41 int64
CATALOG_PRICE 229 int64
DISCOUNT_PRICE 142 int64
DISPFROM 7 int64
DISPPERIOD 7 int64
VALIDFROM 28 int64
VALIDPERIOD 83 float64
USABLE_DATE_MON 2 float64
USABLE_DATE_TUE 2 float64
USABLE_DATE_WED 2 float64
USABLE_DATE_THU 2 float64
USABLE_DATE_FRI 2 float64
USABLE_DATE_SAT 2 float64
USABLE_DATE_SUN 2 float64
USABLE_DATE_HOLIDAY 2 float64
USABLE_DATE_BEFORE_HOLIDAY 2 float64
large_area_name 9 object
ken_name 34 object
small_area_name 42 object
REG_DATE 369 int64
SEX_ID 2 int64
AGE 66 int64
WITHDRAW_DATE 293 int64
USER_ID_hash 22873 object
COUPON_ID_hash 310 object


In [41]:
# dirty, dirty hack.
# I split set into for conversion into graphlab frame, because whole set is too large
split = 3545315
big_test_0 = big_test[:split]
big_test_1 = big_test[split:]

big_test_0 = gl.SFrame(big_test_0)
big_test_1 = gl.SFrame(big_test_1)

big_test = big_test_0.append(big_test_1)
big_test.save('big_test.data', format = 'binary')