In [1]:
from __future__ import division

import pandas as pd
import numpy as np

Earliest coupon distribution:
2011-06-27

Earliest coupon valid period beginning:
2011-07-01

Latest coupon valid period end:
2012-12-30

In [3]:
start_time = pd.to_datetime('2011-06-27')
end_time = pd.to_datetime('2012-12-30')
time_length = int((end_time - start_time).days)

## user_list.csv

In [4]:
users = pd.read_csv('user_list.csv', index_col = None)
users.drop('PREF_NAME', axis = 1, inplace = True)
users['SEX_ID'] = users['SEX_ID'].map({'f':0, 'm': 1})

users['REG_DATE'] = pd.to_datetime(users['REG_DATE'])
users['WITHDRAW_DATE'] = pd.to_datetime(users['WITHDRAW_DATE'])
users['WITHDRAW_DATE'] =users['WITHDRAW_DATE'].fillna(end_time)

users.head()

Unnamed: 0,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,USER_ID_hash
0,2012-03-28 14:14:18,0,25,2012-12-30,d9dca3cb44bab12ba313eaa681f663eb
1,2011-05-18 00:41:48,0,34,2012-12-30,560574a339f1b25e57b0221e486907ed
2,2011-06-13 16:36:58,1,41,2012-12-30,e66ae91b978b3229f8fd858c80615b73
3,2012-02-08 12:56:15,1,25,2012-12-30,43fc18f32eafb05713ec02935e2c2825
4,2011-05-22 23:43:56,1,62,2012-12-30,dc6df8aa860f8db0d710ce9d4839840f


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22873 entries, 0 to 22872
Data columns (total 5 columns):
REG_DATE         22873 non-null datetime64[ns]
SEX_ID           22873 non-null int64
AGE              22873 non-null int64
WITHDRAW_DATE    22873 non-null datetime64[ns]
USER_ID_hash     22873 non-null object
dtypes: datetime64[ns](2), int64(2), object(1)
memory usage: 1.0+ MB


In [6]:
print 'Earliest registration', min(users['REG_DATE'])
print 'Time inside active period:', min(users['REG_DATE']) >= start_time and min(users['REG_DATE']) <= end_time

Earliest registration 2010-07-21 13:44:02
Time inside active period: False


In [7]:
print 'Latest registration', max(users['REG_DATE'])
print 'Time inside active period:',max(users['REG_DATE']) >= start_time and max(users['REG_DATE']) <= end_time

Latest registration 2012-06-30 23:29:01
Time inside active period: True


In [8]:
print 'Earliest withdrawal', min(users['WITHDRAW_DATE'].dropna())
print 'Time inside active period:', min(users['WITHDRAW_DATE']) >= start_time and min(users['WITHDRAW_DATE']) <= end_time

Earliest withdrawal 2011-07-07 19:51:30
Time inside active period: True


In [9]:
print 'Latest withdrawal', max(users['WITHDRAW_DATE'].dropna())
print 'Time inside active period:', max(users['WITHDRAW_DATE']) >= start_time and max(users['WITHDRAW_DATE']) <= end_time

Latest withdrawal 2012-12-30 00:00:00
Time inside active period: True


In [10]:
# REG_DATE to number of days since start time. Negative numbers to zero.
users['REG_DATE'] = users['REG_DATE'].map(lambda x : 0 if x < start_time else (x - start_time).days)
users['WITHDRAW_DATE'] = users['WITHDRAW_DATE'].map(lambda x : (x - start_time).days)

users.head()

Unnamed: 0,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,USER_ID_hash
0,275,0,25,552,d9dca3cb44bab12ba313eaa681f663eb
1,0,0,34,552,560574a339f1b25e57b0221e486907ed
2,0,1,41,552,e66ae91b978b3229f8fd858c80615b73
3,226,1,25,552,43fc18f32eafb05713ec02935e2c2825
4,0,1,62,552,dc6df8aa860f8db0d710ce9d4839840f


In [11]:
users_userID = set(users['USER_ID_hash'].unique())

## coupon_list_train_translated.csv
## coupon_list_test_translated.csv

In [12]:
train = pd.read_csv('coupon_list_train_translated.csv', index_col = None)
train['test'] = 0
test = pd.read_csv('coupon_list_test_translated.csv', index_col = None)
test['test'] = 1

joined = pd.concat([train, test])
joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19723 entries, 0 to 309
Data columns (total 25 columns):
CAPSULE_TEXT                  19723 non-null object
GENRE_NAME                    19723 non-null object
PRICE_RATE                    19723 non-null int64
CATALOG_PRICE                 19723 non-null int64
DISCOUNT_PRICE                19723 non-null int64
DISPFROM                      19723 non-null object
DISPEND                       19723 non-null object
DISPPERIOD                    19723 non-null int64
VALIDFROM                     13480 non-null object
VALIDEND                      13480 non-null object
VALIDPERIOD                   13480 non-null float64
USABLE_DATE_MON               12617 non-null float64
USABLE_DATE_TUE               12617 non-null float64
USABLE_DATE_WED               12617 non-null float64
USABLE_DATE_THU               12617 non-null float64
USABLE_DATE_FRI               12617 non-null float64
USABLE_DATE_SAT               12617 non-null float64
USABLE

In [125]:
print 'Earliest start of coupon valid period in train:', min(train['VALIDFROM'].dropna())
print 'Latest end of coupon valid period in train:', max(train['VALIDEND'].dropna()), '\n'

print 'Earliest start of coupon valid period in test:', min(test['VALIDFROM'].dropna())
print 'Latest end of coupon valid period in test:', max(test['VALIDEND'].dropna()), '\n'

print 'In submission, can drop all users that withdrew before 2012-06-28 or registered after 2012-12-31.'
print 'Withdraw date should be more than', (pd.to_datetime( min(test['VALIDFROM'].dropna()) ) - start_time).days
print 'Registration date should be less then', (pd.to_datetime( max(test['VALIDEND'].dropna()) ) - start_time).days

Earliest start of coupon valid period in train: 2011-07-02
Latest end of coupon valid period in train: 2012-12-25 

Earliest start of coupon valid period in test: 2012-06-28
Latest end of coupon valid period in test: 2012-12-31 

In submission, can drop all users that withdrew before 2012-06-28 or registered after 2012-12-31.
Withdraw date should be more than 367
Registration date should be less then 553


In [126]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 310 entries, 0 to 309
Data columns (total 25 columns):
CAPSULE_TEXT                  310 non-null object
GENRE_NAME                    310 non-null object
PRICE_RATE                    310 non-null int64
CATALOG_PRICE                 310 non-null int64
DISCOUNT_PRICE                310 non-null int64
DISPFROM                      310 non-null object
DISPEND                       310 non-null object
DISPPERIOD                    310 non-null int64
VALIDFROM                     214 non-null object
VALIDEND                      214 non-null object
VALIDPERIOD                   214 non-null float64
USABLE_DATE_MON               209 non-null float64
USABLE_DATE_TUE               209 non-null float64
USABLE_DATE_WED               209 non-null float64
USABLE_DATE_THU               209 non-null float64
USABLE_DATE_FRI               209 non-null float64
USABLE_DATE_SAT               209 non-null float64
USABLE_DATE_SUN               209 non-null

In [13]:
# USABLE_DATE_SOMETHING: replace 2 and NaN with 1
for col in joined.iloc[:, 11:20]:
    joined[col] = joined[col].map({0:0, 1:1, 2:1})
    joined[col] = joined[col].fillna(1)

In [14]:
# VALIDFROM to number of days since start time
# Dirty solution? Assumes coupons with NaN action period are active all time
joined.drop('VALIDEND', axis = 1, inplace = True)
joined['VALIDPERIOD'] = joined['VALIDPERIOD'].fillna(time_length)
joined['VALIDPERIOD'] = joined['VALIDPERIOD'].map(int)

joined['VALIDFROM'] = pd.to_datetime(joined['VALIDFROM'])
joined['VALIDFROM'] = joined['VALIDFROM'].fillna(start_time)
joined['VALIDFROM'] = joined['VALIDFROM'].map(lambda x: (x - start_time).days)

In [15]:
# DISPFROM to number of days since start time
joined.drop('DISPEND', axis = 1, inplace = True)

joined['DISPFROM'] = pd.to_datetime(joined['DISPFROM'])
joined['DISPFROM'] = joined['DISPFROM'].map(lambda x: (x - start_time).days)

In [16]:
joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19723 entries, 0 to 309
Data columns (total 23 columns):
CAPSULE_TEXT                  19723 non-null object
GENRE_NAME                    19723 non-null object
PRICE_RATE                    19723 non-null int64
CATALOG_PRICE                 19723 non-null int64
DISCOUNT_PRICE                19723 non-null int64
DISPFROM                      19723 non-null int64
DISPPERIOD                    19723 non-null int64
VALIDFROM                     19723 non-null int64
VALIDPERIOD                   19723 non-null int64
USABLE_DATE_MON               19723 non-null float64
USABLE_DATE_TUE               19723 non-null float64
USABLE_DATE_WED               19723 non-null float64
USABLE_DATE_THU               19723 non-null float64
USABLE_DATE_FRI               19723 non-null float64
USABLE_DATE_SAT               19723 non-null float64
USABLE_DATE_SUN               19723 non-null float64
USABLE_DATE_HOLIDAY           19723 non-null float64
USABLE_D

In [17]:
joined.head()

Unnamed: 0,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPPERIOD,VALIDFROM,VALIDPERIOD,USABLE_DATE_MON,...,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,large_area_name,ken_name,small_area_name,COUPON_ID_hash,test
0,Food,Food,50,3000,1500,11,1,13,151,1,...,0,0,1,1,0,関東,埼玉県,埼玉,6b263844241eea98c5a97f1335ea82af,0
1,Food,Food,51,2080,1000,4,1,6,154,1,...,1,1,1,1,1,関東,千葉県,千葉,cc031f250e8bad1e24060263b9fc0ddd,0
2,Food,Food,50,7000,3500,15,3,19,179,0,...,1,1,1,1,1,関東,千葉県,千葉,ba5e9b7453ca52ff711635a5d2e8102d,0
3,Food,Food,50,3000,1500,12,2,15,142,1,...,0,0,1,1,1,関東,千葉県,千葉,3e1ffbedca3569f9e8032d401e8cb4e6,0
4,Food,Food,50,2000,1000,8,1,10,176,1,...,0,0,1,1,0,関東,千葉県,千葉,782934b6c815b4030ea204eef7d4a734,0


In [18]:
for col in joined:
    print 'Number of unique entries in', col, len(joined[col].unique())

Number of unique entries in CAPSULE_TEXT 24
Number of unique entries in GENRE_NAME 13
Number of unique entries in PRICE_RATE 71
Number of unique entries in CATALOG_PRICE 2435
Number of unique entries in DISCOUNT_PRICE 1118
Number of unique entries in DISPFROM 370
Number of unique entries in DISPPERIOD 18
Number of unique entries in VALIDFROM 388
Number of unique entries in VALIDPERIOD 181
Number of unique entries in USABLE_DATE_MON 2
Number of unique entries in USABLE_DATE_TUE 2
Number of unique entries in USABLE_DATE_WED 2
Number of unique entries in USABLE_DATE_THU 2
Number of unique entries in USABLE_DATE_FRI 2
Number of unique entries in USABLE_DATE_SAT 2
Number of unique entries in USABLE_DATE_SUN 2
Number of unique entries in USABLE_DATE_HOLIDAY 2
Number of unique entries in USABLE_DATE_BEFORE_HOLIDAY 2
Number of unique entries in large_area_name 9
Number of unique entries in ken_name 47
Number of unique entries in small_area_name 55
Number of unique entries in COUPON_ID_hash 197

In [19]:
for col in joined:
    if joined[col].dtype != 'object' and len(joined[col].unique()) > 2:
        print 'Minimum entry in', col, min(joined[col].unique())

Minimum entry in PRICE_RATE 0
Minimum entry in CATALOG_PRICE 1
Minimum entry in DISCOUNT_PRICE 0
Minimum entry in DISPFROM 0
Minimum entry in DISPPERIOD 0
Minimum entry in VALIDFROM 0
Minimum entry in VALIDPERIOD 0


In [20]:
for col in joined:
    if joined[col].dtype != 'object' and len(joined[col].unique()) > 2:
        print 'Maximum entry in', col, max(joined[col].unique())

Maximum entry in PRICE_RATE 100
Maximum entry in CATALOG_PRICE 680000
Maximum entry in DISCOUNT_PRICE 100000
Maximum entry in DISPFROM 369
Maximum entry in DISPPERIOD 36
Maximum entry in VALIDFROM 433
Maximum entry in VALIDPERIOD 552


In [21]:
for col in joined:
    if joined[col].dtype != 'object' and len(joined[col].unique()) > 2:
        print 'Second largest entry in', col, np.sort(joined[col].unique())[-2]

Second largest entry in PRICE_RATE 97
Second largest entry in CATALOG_PRICE 300000
Second largest entry in DISCOUNT_PRICE 99225
Second largest entry in DISPFROM 368
Second largest entry in DISPPERIOD 29
Second largest entry in VALIDFROM 432
Second largest entry in VALIDPERIOD 179


In [22]:
joined_couponID = set(joined['COUPON_ID_hash'].unique())
train_couponID = set(train['COUPON_ID_hash'].unique())
test_couponID = set(test['COUPON_ID_hash'].unique())

## coupon_visit_train.csv

In [91]:
visit = pd.read_csv('coupon_visit_train.csv')
visit.drop(['PAGE_SERIAL', 'REFERRER_hash', 'SESSION_ID_hash'], axis = 1, inplace = True)

# I_DATE to number of days since start time. Negative numbers to zero.
# visit['I_DATE'] = pd.to_datetime(visit['I_DATE'])
# visit['I_DATE'] = visit['I_DATE'].map(lambda x : 0 if x < start_time else (x - start_time).days)
# print 'Latest visit day:', max(visit['I_DATE'])

# Actually, there is no sense in keeping date, because we don't have viewing info for test set anyway
# I also drop all rows that used to differ by date only and now are duplicates
visit.drop('I_DATE', axis = 1, inplace = True)
visit = visit.drop_duplicates()

In [93]:
visit.head()

Unnamed: 0,PURCHASE_FLG,VIEW_COUPON_ID_hash,USER_ID_hash,PURCHASEID_hash
0,0,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,
2,0,17c450c3b470c045d35ec22b02daa690,d9dca3cb44bab12ba313eaa681f663eb,
3,0,91a15e6a95d09e5e01b50747833b317d,d9dca3cb44bab12ba313eaa681f663eb,
4,0,96fcbc8f6e45d5a2de1661eb140c6e82,d9dca3cb44bab12ba313eaa681f663eb,
7,0,13090b0c75d7d2d51b15c51bfa7a90fe,d9dca3cb44bab12ba313eaa681f663eb,


In [94]:
visit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1965074 entries, 0 to 2833176
Data columns (total 4 columns):
PURCHASE_FLG           int64
VIEW_COUPON_ID_hash    object
USER_ID_hash           object
PURCHASEID_hash        object
dtypes: int64(1), object(3)
memory usage: 75.0+ MB


In [95]:
visit.count()

PURCHASE_FLG           1965074
VIEW_COUPON_ID_hash    1965074
USER_ID_hash           1965074
PURCHASEID_hash         114774
dtype: int64

In [96]:
for col in visit:
    print 'Number of unique entries in', col, len(visit[col].unique())

Number of unique entries in PURCHASE_FLG 2
Number of unique entries in VIEW_COUPON_ID_hash 32628
Number of unique entries in USER_ID_hash 22805
Number of unique entries in PURCHASEID_hash 114775


In [97]:
print 'Number of non-zero PURCHASE_FLG entries:', sum(visit['PURCHASE_FLG'])
print 'Fraction of non-zero PURCHASE_FLG entries:', sum(visit['PURCHASE_FLG']) / len(visit)

Number of non-zero PURCHASE_FLG entries: 114774
Fraction of non-zero PURCHASE_FLG entries: 0.0584069607557


In [98]:
visit_userID = set(visit['USER_ID_hash'].unique())

print 'Unique users in users list:', len(users_userID)
print 'Unique users in visit list:', len(visit_userID)
print 'Intersection beween sets of users:', len(users_userID.intersection(visit_userID))

Unique users in users list: 22873
Unique users in visit list: 22805
Intersection beween sets of users: 22805


In [99]:
visit_couponID = set(visit['VIEW_COUPON_ID_hash'].unique())

print 'Unique coupons in train list:', len(train_couponID)
print 'Unique coupons in test list:', len(test_couponID)
print 'Unique coupons in visit list:', len(visit_couponID)

print 'Intersection beween train and visit sets of coupons:', len(train_couponID.intersection(visit_couponID))
print 'Intersection beween test and visit sets of coupons:', len(test_couponID.intersection(visit_couponID))

Unique coupons in train list: 19413
Unique coupons in test list: 310
Unique coupons in visit list: 32628
Intersection beween train and visit sets of coupons: 19412
Intersection beween test and visit sets of coupons: 39


In [103]:
# Most coupons were viewed, but never used. Each of such coupons now has single record:
# {user, coupon, purchase_flag = 0, purchase_id = NaN}

# Coupons that were used for purchase have either single record:
# {user, coupon, purchase_flag = 1, purchase_id}
# or this record and additional record for viewing without purchasing (same as for non-used coupons)

# I want to drop such records for used coupons, so that there is a single record for each
# {user, coupon} combination.

# I want a dataframe of {user, coupon, coupon_flag}
# I can get it by resetting index of the following frame.
# If more than one purchase was made with a coupon, flags add up to int > 1, but I don't care.
# I'll reset all flags to 0 or 1 later.

visit.drop_duplicates().groupby(['USER_ID_hash', 'VIEW_COUPON_ID_hash']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,PURCHASE_FLG
USER_ID_hash,VIEW_COUPON_ID_hash,Unnamed: 2_level_1
0000b53e182165208887ba65c079fc21,0645faa156f34104e6d8910160868f9f,0
0000b53e182165208887ba65c079fc21,18097cd25ab6b7e8eb0481b0e3a3cfd8,0
0000b53e182165208887ba65c079fc21,1b581f2ed53f2f2eafbc1560db640194,0
0000b53e182165208887ba65c079fc21,1d04e76c44c231d5d05dc1634d20fe8c,0
0000b53e182165208887ba65c079fc21,2962b9f2ec7ecde9daddf53dd3118526,0
0000b53e182165208887ba65c079fc21,2ab16b8f5aeead6d31dbdb9bd59c41db,0
0000b53e182165208887ba65c079fc21,2ccbe8c179e4f956b367ad6c4a74a23e,0
0000b53e182165208887ba65c079fc21,2d231a9ce88beb42b0952f59d3e0e8e8,0
0000b53e182165208887ba65c079fc21,300bffc61bc80717d7a2a2c22965fa58,0
0000b53e182165208887ba65c079fc21,35dd6298ee5e42db692c2c211a5f5eba,0


In [107]:
index_id = visit.drop_duplicates().groupby(['USER_ID_hash', 'VIEW_COUPON_ID_hash']).sum().reset_index()
index_id['PURCHASE_FLG'] = index_id['PURCHASE_FLG'].map(lambda x: 1 if x > 0 else 0)
index_id.rename(columns={'VIEW_COUPON_ID_hash' : 'COUPON_ID_hash'}, inplace = True)
index_id

Unnamed: 0,USER_ID_hash,COUPON_ID_hash,PURCHASE_FLG
0,0000b53e182165208887ba65c079fc21,0645faa156f34104e6d8910160868f9f,0
1,0000b53e182165208887ba65c079fc21,18097cd25ab6b7e8eb0481b0e3a3cfd8,0
2,0000b53e182165208887ba65c079fc21,1b581f2ed53f2f2eafbc1560db640194,0
3,0000b53e182165208887ba65c079fc21,1d04e76c44c231d5d05dc1634d20fe8c,0
4,0000b53e182165208887ba65c079fc21,2962b9f2ec7ecde9daddf53dd3118526,0
5,0000b53e182165208887ba65c079fc21,2ab16b8f5aeead6d31dbdb9bd59c41db,0
6,0000b53e182165208887ba65c079fc21,2ccbe8c179e4f956b367ad6c4a74a23e,0
7,0000b53e182165208887ba65c079fc21,2d231a9ce88beb42b0952f59d3e0e8e8,0
8,0000b53e182165208887ba65c079fc21,300bffc61bc80717d7a2a2c22965fa58,0
9,0000b53e182165208887ba65c079fc21,35dd6298ee5e42db692c2c211a5f5eba,0


In [108]:
index_userID = set(index_id['USER_ID_hash'].unique())
index_couponID = set(index_id['COUPON_ID_hash'].unique()) 

### Summary
One in 19 413 coupons in train test and 271 in 310 coupons in test set were never visited.
Only 122 389 in 2 833 180  PURCHASEID_hash entries are not NaNs.
114 775 of these IDs are unique.

*Important*: looks like only 

## coupon_detail_train.csv

In [35]:
detail = pd.read_csv('coupon_detail_train.csv')
detail.drop('ITEM_COUNT', axis = 1, inplace = True)

# I_DATE to number of days since start time. Negative numbers to zero.
detail['I_DATE'] = pd.to_datetime(detail['I_DATE'])
detail['I_DATE'] = detail['I_DATE'].map(lambda x : 0 if x < start_time else (x - start_time).days)

In [36]:
detail.head()

Unnamed: 0,I_DATE,SMALL_AREA_NAME,PURCHASEID_hash,USER_ID_hash,COUPON_ID_hash
0,275,兵庫,c820a8882374a4e472f0984a8825893f,d9dca3cb44bab12ba313eaa681f663eb,34c48f84026e08355dc3bd19b427f09a
1,7,銀座・新橋・東京・上野,1b4eb2435421ede98c8931c42e8220ec,560574a339f1b25e57b0221e486907ed,767673b7a777854a92b73b0934ddfae7
2,19,恵比寿・目黒・品川,36b5f9ba46c44b65587d0b16f2e4c77f,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2
3,19,恵比寿・目黒・品川,2f30f46937cc9004774e576914b2aa1a,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2
4,19,恵比寿・目黒・品川,4d000c64a55ac573d0ae1a8f03677f50,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2


In [44]:
detail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168996 entries, 0 to 168995
Data columns (total 5 columns):
I_DATE             168996 non-null int64
SMALL_AREA_NAME    168996 non-null object
PURCHASEID_hash    168996 non-null object
USER_ID_hash       168996 non-null object
COUPON_ID_hash     168996 non-null object
dtypes: int64(1), object(4)
memory usage: 7.7+ MB


In [45]:
for col in detail:
    print 'Number of unique entries in', col, len(detail[col].unique())

Number of unique entries in I_DATE 359
Number of unique entries in SMALL_AREA_NAME 55
Number of unique entries in PURCHASEID_hash 168996
Number of unique entries in USER_ID_hash 22782
Number of unique entries in COUPON_ID_hash 19368


In [37]:
print 'Latest transaction day:', max(detail['I_DATE'])

Latest transaction day: 362


In [51]:
detail_couponID = set(detail['COUPON_ID_hash'].unique())

print 'Unique coupons in joined list:', len(joined_couponID)
print 'Unique coupons in visit list:', len(visit_couponID)
print 'Unique coupons in detail list:', len(detail_couponID)
print 'Intersection beween joined and visit sets of users:', len(joined_couponID.intersection(visit_couponID))
print 'Intersection beween joined and detail sets of users:', len(joined_couponID.intersection(detail_couponID))
print 'Intersection beween detail and visit sets of coupons:', len(detail_couponID.intersection(visit_couponID))

 Unique coupons in joined list: 19723
Unique coupons in visit list: 32628
Unique coupons in detail list: 19368
Intersection beween joined and visit sets of users: 19451
Intersection beween joined and detail sets of users: 19368
Intersection beween detail and visit sets of coupons: 19367


In [48]:
detail_userID = set(detail['USER_ID_hash'].unique())

print 'Unique users in users list:', len(users_userID)
print 'Unique users in visit list:', len(visit_userID)
print 'Unique users in detail list:', len(detail_userID)
print 'Intersection beween users and visit sets of users:', len(users_userID.intersection(visit_userID))
print 'Intersection beween users and detail sets of users:', len(users_userID.intersection(detail_userID))
print 'Intersection beween detail and visit sets of users:', len(detail_userID.intersection(visit_userID))

Unique users in users list: 22873
Unique users in visit list: 22805
Unique users in detail list: 22782
Intersection beween users and visit sets of users: 22805
Intersection beween users and detail sets of users: 22782
Intersection beween detail and visit sets of users: 22754


# I don't actually need detail, because:
# - I don't have that data for test
# - visit file is enough to get answers for train set

# See 01 Explore data for cleaned up version

## Sets of users and coupons

In [111]:
print 'Unique users in users list:', len(users_userID)
print 'Unique users in index list:', len(index_userID), '\n'

print 'Intersection beween users and index sets of users:', len(users_userID.intersection(index_userID))
# Users that are not in index_id table should be ones from test set (the ones we don't have answers for).

Unique users in users list: 22873
Unique users in index list: 22805 

Intersection beween users and index sets of users: 22805


In [112]:
print 'Unique coupons in joined list:', len(joined_couponID)
print 'Unique coupons in train list:', len(train_couponID)
print 'Unique coupons in test list:', len(test_couponID)
print 'Unique coupons in index list:', len(index_couponID), '\n'

print 'Intersection beween joined and index sets of coupons:', len(joined_couponID.intersection(index_couponID))
print 'Intersection beween train and index sets of coupons:', len(train_couponID.intersection(index_couponID))
print 'Intersection beween test and index sets of coupons:', len(test_couponID.intersection(index_couponID))

Unique coupons in joined list: 19723
Unique coupons in train list: 19413
Unique coupons in test list: 310
Unique coupons in index list: 32628 

Intersection beween joined and index sets of coupons: 19451
Intersection beween train and index sets of coupons: 19412
Intersection beween test and index sets of coupons: 39


We know for sure that some of the coupons from test set were viewed (and maybe used; should check).
But by which users? If train users, everythin is fine, I'll lose no info when I merge tables.
If by test users, I need to decide whether I want to use this information for prediction.