In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.sparse
pd.set_option("display.max_columns",101)
RANDOM_STATE = 42

In [2]:
DATA_PATH = "../data/instacart/"
orders = pd.read_csv(DATA_PATH + "orders.csv")
products = pd.read_csv(DATA_PATH + "products.csv")
aisles = pd.read_csv(DATA_PATH + "aisles.csv")
departments = pd.read_csv(DATA_PATH + "departments.csv")
prior = pd.read_csv(DATA_PATH + "order_products__prior.csv")

In [3]:
train = pd.read_csv(DATA_PATH + "order_products__train.csv")

#### Build Datasets

In [4]:
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [5]:
departments.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [6]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [7]:
train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [8]:
(orders.eval_set == 'train').value_counts()

False    3289874
True      131209
Name: eval_set, dtype: int64

In [9]:
(orders.eval_set == 'test').value_counts()

False    3346083
True       75000
Name: eval_set, dtype: int64

#### Use prior as train set, and train set as cv set, online data as test set

In [10]:
orders.loc[orders.eval_set == 'test']

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0
152,2161313,15,test,23,1,9,7.0
159,1416320,16,test,7,0,13,7.0
217,1735923,19,test,10,6,17,8.0
222,1980631,20,test,5,1,11,30.0
272,139655,22,test,16,5,6,1.0


#### Build Feature： Use prior to build User\Item\Interactive feature
train build (user-item) True or False Matrix

#### Merge Train and order to build (user_item_model) information

In [4]:
train_order_product = pd.merge(train, orders, on='order_id')

In [12]:
train_order_product.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,49302,1,1,112108,train,4,4,10,9.0
1,1,11109,2,1,112108,train,4,4,10,9.0
2,1,10246,3,0,112108,train,4,4,10,9.0
3,1,49683,4,0,112108,train,4,4,10,9.0
4,1,43633,5,1,112108,train,4,4,10,9.0


In [13]:
train_order_product.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384617 entries, 0 to 1384616
Data columns (total 10 columns):
order_id                  1384617 non-null int64
product_id                1384617 non-null int64
add_to_cart_order         1384617 non-null int64
reordered                 1384617 non-null int64
user_id                   1384617 non-null int64
eval_set                  1384617 non-null object
order_number              1384617 non-null int64
order_dow                 1384617 non-null int64
order_hour_of_day         1384617 non-null int64
days_since_prior_order    1384617 non-null float64
dtypes: float64(1), int64(8), object(1)
memory usage: 116.2+ MB


#### User feature

In [14]:
prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [15]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [5]:
user_order_cnt = pd.DataFrame(orders.loc[orders.eval_set == 'prior'].groupby('user_id')['order_id'].count())

In [17]:
user_order_cnt.head()

Unnamed: 0_level_0,order_id
user_id,Unnamed: 1_level_1
1,10
2,14
3,12
4,5
5,4


In [58]:
user_order_cnt.order_id.ix[1]

10

In [6]:
user_order_dow_cnt = pd.DataFrame(orders.loc[orders.eval_set == 'prior'].groupby(['user_id', 'order_dow'])['order_id'].count())

In [7]:
user_hour_cnt = pd.DataFrame(orders.loc[orders.eval_set == 'prior'].groupby(['user_id', 'order_hour_of_day'])['order_id'].count())

In [8]:
user_days_pass_cnt = pd.DataFrame(orders.loc[orders.eval_set == 'prior'].groupby('user_id').agg({'days_since_prior_order': np.mean}))

#### Item Feature

In [22]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [23]:
prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [19]:
item_reorder_cnt = pd.DataFrame(prior.groupby('product_id').agg({'reordered': np.sum}))

In [25]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [20]:
new_products = products.set_index(products['product_id'])

In [27]:
new_products.ix[1]

product_id                                1
product_name     Chocolate Sandwich Cookies
aisle_id                                 61
department_id                            19
Name: 1, dtype: object

#### Item_User

In [21]:
prior_order = pd.merge(prior, orders, on='order_id')

In [29]:
prior_order.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [22]:
item_order_dow_cnt = pd.DataFrame(prior_order.groupby(['product_id', 'order_dow'])['order_id'].count())

In [23]:
item_hour_cnt = pd.DataFrame(prior_order.groupby(['product_id', 'order_hour_of_day'])['order_id'].count())

In [24]:
item_day_pass_cnt = pd.DataFrame(prior_order.groupby('product_id').agg({'days_since_prior_order': np.mean}))

In [25]:
item_user_reordered = pd.DataFrame(prior_order.groupby(['user_id', 'product_id']).agg({'reordered': np.sum}))

In [26]:
item_user_day_pass = pd.DataFrame(prior_order.groupby(['user_id', 'product_id']).agg({'days_since_prior_order': np.mean}))

#### Feature Matrix

In [35]:
train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [36]:
train.shape

(1384617, 4)

In [37]:
item_user_reordered.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered
user_id,product_id,Unnamed: 2_level_1
1,196,9
1,10258,8
1,10326,0
1,12427,9
1,13032,2


In [38]:
item_user_reordered.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13307953 entries, (1, 196) to (206209, 48742)
Data columns (total 1 columns):
reordered    int64
dtypes: int64(1)
memory usage: 205.0+ MB


In [39]:
import time
a = time.time()
item_user_reordered.ix[(1, 196)]
b = time.time()

#### Calculate toplist：物品被reordered次数最多的产品

In [40]:
top_200 = pd.DataFrame(prior.groupby('product_id').agg({'reordered': np.sum}).sort_values(['reordered'], ascending=False)).head(200)

In [41]:
top_item_list = list(top_200.index)

In [27]:
train_order_list = pd.merge(train, orders, on='order_id')

In [43]:
train_order_list.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,49302,1,1,112108,train,4,4,10,9.0
1,1,11109,2,1,112108,train,4,4,10,9.0
2,1,10246,3,0,112108,train,4,4,10,9.0
3,1,49683,4,0,112108,train,4,4,10,9.0
4,1,43633,5,1,112108,train,4,4,10,9.0


In [28]:
n_feature = 15 + 6 + 23 + 6 + 23

In [45]:
data = np.zeros((train.shape[0], n_feature))

In [46]:
data.shape

(1384617, 73)

In [47]:
import time
a = time.time()
xx = user_days_pass_cnt.days_since_prior_order.ix
b = time.time()
print xx[1]
c = time.time()
print b - a, c - b

19.5555555556
0.000392198562622 0.0150158405304


In [48]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [49]:
user_order_dow_cnt.order_id.ix[(1, 1)]

3

In [50]:
user_hour_cnt.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,order_id
user_id,order_hour_of_day,Unnamed: 2_level_1
1,7,3
1,8,2
1,9,1
1,12,1
1,14,1


In [29]:
train_ix = train_order_list.ix
orders_ix = orders.ix
# user_feature
user_day_pass_cnt_idx = user_days_pass_cnt.to_dict()['days_since_prior_order']
user_order_cnt_idx = user_order_cnt.to_dict()['order_id']
user_order_dow_cnt_idx = user_order_dow_cnt.to_dict()['order_id']
user_hour_cnt_idx = user_hour_cnt.to_dict()['order_id']

# item feature
new_product_idx = new_products.to_dict()    # return a list
item_reorder_cnt_idx = item_reorder_cnt.to_dict()['reordered']
item_order_dow_cnt_idx = item_order_dow_cnt.to_dict()['order_id']
item_hour_cnt_idx = item_hour_cnt.to_dict()['order_id']
item_day_pass_cnt_idx = item_day_pass_cnt.to_dict()['days_since_prior_order']

# interactive feature
item_user_reordered_idx = item_user_reordered.reordered.ix
item_user_day_pass_idx = item_user_day_pass.days_since_prior_order.ix

In [56]:
import time
a1 = time.time()
for idx in xrange(data.shape[0]):
    try:
        if (idx + 1) % 10000 == 0:
            print "finish:", 100 * float(idx) / data.shape[0]
            a2 = time.time()
            print "time_passed", int(a2 - a1)

        item = train_ix[idx]
        order_id = item['order_id']
        product_id = item['product_id']
        user_id = item['user_id']
        order_dow = item['order_dow']
        order_hour_of_day = item['order_hour_of_day']
        days_since_prior_order = item['days_since_prior_order']

        f_user_dow = [user_order_dow_cnt_idx.get((user_id, dow), 0) for dow in xrange(7)]
        f_user_hour = [user_hour_cnt_idx.get((user_id, hour), 0) for hour in xrange(24)]
        f_user_day_pass = user_day_pass_cnt_idx.get(user_id, 0)
        f_user_order_cnt = user_order_cnt_idx.get(user_id, 0)

        f_aisle = new_product_idx['aisle_id'][product_id]
        f_department = new_product_idx['department_id'][product_id]

        f_product_order_cnt = item_reorder_cnt_idx.get(product_id, 0)

        f_product_dow = [item_order_dow_cnt_idx.get((product_id, dow), 0) for dow in xrange(7)]
        f_product_hour = [item_hour_cnt_idx.get((product_id, hour), 0) for hour in xrange(24)]
        f_product_day_pass = item_day_pass_cnt_idx.get(product_id, 0)

        ax1 = time.time()
        product_id_list = item_user_reordered_idx[user_id].index
        f_item_user_reordered = item_user_reordered_idx[(user_id, product_id)] if product_id in product_id_list else 0
        product_id_list = item_user_day_pass_idx[user_id].index
        f_item_user_day_pass = item_user_day_pass_idx[(user_id, product_id)] if product_id in product_id_list else 0

        item_info = []
        item_info.append(f_user_day_pass)
        item_info.extend(f_user_dow)
        item_info.extend(f_user_hour)
        item_info.append(f_user_order_cnt)

        item_info.append(f_aisle)
        item_info.append(f_department)
        item_info.append(f_product_order_cnt)
        item_info.extend(f_product_dow)
        item_info.extend(f_product_hour)
        item_info.append(f_product_day_pass)

        item_info.append(f_item_user_reordered)
        item_info.append(f_item_user_day_pass)

        item_info.append(order_dow)
        item_info.append(order_hour_of_day)
        item_info.append(days_since_prior_order)

        data[idx] = item_info
    except:
        data[idx] = [0 for x in xrange(n_feature)]
        print "failed idx:", idx
    

failed idx: 6677
finish: 0.722149157493
time_passed 24
finish: 1.44437053712
time_passed 35
finish: 2.16659191675
time_passed 45
finish: 2.88881329638
time_passed 56
finish: 3.61103467602
time_passed 65
finish: 4.33325605565
time_passed 75
finish: 5.05547743528
time_passed 85
finish: 5.77769881491
time_passed 95
finish: 6.49992019454
time_passed 105
finish: 7.22214157417
time_passed 116
finish: 7.9443629538
time_passed 126
finish: 8.66658433343
time_passed 136
finish: 9.38880571306
time_passed 146
finish: 10.1110270927
time_passed 156
finish: 10.8332484723
time_passed 166
finish: 11.555469852
time_passed 176
finish: 12.2776912316
time_passed 186
failed idx: 171849
finish: 12.9999126112
time_passed 196
finish: 13.7221339908
time_passed 207
finish: 14.4443553705
time_passed 218
finish: 15.1665767501
time_passed 229
finish: 15.8887981297
time_passed 239
finish: 16.6110195094
time_passed 249
finish: 17.333240889
time_passed 259
finish: 18.0554622686
time_passed 269
finish: 18.7776836483
ti

In [47]:
columns = []
columns.append('USER_DAY_PASS')
columns.extend(['USER_DOW_' + str(i) for i in xrange(7)])
columns.extend(['USER_HOUR_' + str(i) for i in xrange(24)])
columns.append('USER_ORDER_CNT')
columns.append('AISLE')
columns.append('DEPARTMENT')
columns.append('PRODUCT_ORDER_CNT')
columns.extend(['PRODUCT_DOW_' + str(i) for i in xrange(7)])
columns.extend(['PRODUCT_HOUR_' + str(i) for i in xrange(24)])
columns.append('PRODUCT_DAY_PASS')
columns.append('ITEM_USER_REOREDED')
columns.append('ITEM_USER_DAY_PASS')
columns.append("ORDER_DOW")
columns.append('ORDER_HOUR')
columns.append('DAY_SINCE_PRIOR_ORDER')

In [65]:
outfile = DATA_PATH + "user_item"
np.save(outfile, data)

In [67]:
new_data = np.load(outfile + ".npy")

In [69]:
new_data.shape

(1384617, 73)

In [72]:
positive_X = pd.DataFrame(data, columns=columns)

In [73]:
positive_X.head()

Unnamed: 0,USER_DAY_PASS,USER_DOW_0,USER_DOW_1,USER_DOW_2,USER_DOW_3,USER_DOW_4,USER_DOW_5,USER_DOW_6,USER_HOUR_0,USER_HOUR_1,USER_HOUR_2,USER_HOUR_3,USER_HOUR_4,USER_HOUR_5,USER_HOUR_6,USER_HOUR_7,USER_HOUR_8,USER_HOUR_9,USER_HOUR_10,USER_HOUR_11,USER_HOUR_12,USER_HOUR_13,USER_HOUR_14,USER_HOUR_15,USER_HOUR_16,USER_HOUR_17,USER_HOUR_18,USER_HOUR_19,USER_HOUR_20,USER_HOUR_21,USER_HOUR_22,USER_HOUR_23,USER_ORDER_CNT,AISLE,DEPARTMENT,PRODUCT_ORDER_CNT,PRODUCT_DOW_0,PRODUCT_DOW_1,PRODUCT_DOW_2,PRODUCT_DOW_3,PRODUCT_DOW_4,PRODUCT_DOW_5,PRODUCT_DOW_6,PRODUCT_HOUR_0,PRODUCT_HOUR_1,PRODUCT_HOUR_2,PRODUCT_HOUR_3,PRODUCT_HOUR_4,PRODUCT_HOUR_5,PRODUCT_HOUR_6,PRODUCT_HOUR_7,PRODUCT_HOUR_8,PRODUCT_HOUR_9,PRODUCT_HOUR_10,PRODUCT_HOUR_11,PRODUCT_HOUR_12,PRODUCT_HOUR_13,PRODUCT_HOUR_14,PRODUCT_HOUR_15,PRODUCT_HOUR_16,PRODUCT_HOUR_17,PRODUCT_HOUR_18,PRODUCT_HOUR_19,PRODUCT_HOUR_20,PRODUCT_HOUR_21,PRODUCT_HOUR_22,PRODUCT_HOUR_23,PRODUCT_DAY_PASS,ITEM_USER_REOREDED,ITEM_USER_DAY_PASS,ORDER_DOW,ORDER_HOUR,DAY_SINCE_PRIOR_ORDER
0,11.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,120.0,16.0,101.0,27.0,27.0,26.0,14.0,10.0,18.0,41.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,4.0,5.0,9.0,17.0,11.0,16.0,18.0,19.0,12.0,12.0,9.0,8.0,7.0,3.0,4.0,5.0,0.0,9.96732,1.0,7.0,4.0,10.0,9.0
1,11.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,108.0,16.0,3192.0,954.0,738.0,596.0,524.0,497.0,527.0,636.0,16.0,15.0,2.0,7.0,6.0,16.0,32.0,134.0,224.0,336.0,356.0,333.0,329.0,363.0,388.0,403.0,388.0,320.0,219.0,167.0,156.0,127.0,85.0,50.0,10.571665,1.0,7.0,4.0,10.0,9.0
2,11.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,83.0,4.0,12498.0,4970.0,4195.0,3011.0,2681.0,2548.0,2909.0,3512.0,202.0,94.0,49.0,38.0,45.0,63.0,208.0,640.0,1264.0,1731.0,2161.0,2148.0,2017.0,2095.0,2033.0,1917.0,1868.0,1390.0,1009.0,805.0,659.0,605.0,497.0,288.0,10.814744,0.0,0.0,4.0,10.0,9.0
3,11.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,83.0,4.0,67313.0,22054.0,17435.0,11953.0,10416.0,10165.0,11220.0,14072.0,729.0,378.0,193.0,141.0,167.0,305.0,924.0,2594.0,4797.0,6770.0,7821.0,8044.0,7962.0,8162.0,8155.0,7903.0,7933.0,6487.0,5067.0,3954.0,3059.0,2491.0,2051.0,1228.0,12.102385,0.0,0.0,4.0,10.0,9.0
4,11.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,95.0,15.0,312.0,113.0,94.0,80.0,96.0,72.0,86.0,112.0,7.0,5.0,1.0,1.0,2.0,1.0,5.0,9.0,29.0,47.0,42.0,51.0,39.0,55.0,53.0,54.0,55.0,55.0,37.0,36.0,26.0,11.0,14.0,18.0,11.946166,1.0,15.0,4.0,10.0,9.0


In [74]:
out_csv_file = DATA_PATH + "positive_X.csv"
positive_X.to_csv(out_csv_file)

#### Add Inverse Data

In [52]:
train_order_product_list = train_order_list.groupby('order_id')['product_id'].apply(list)

In [77]:
train_order_product_list.head()

order_id
1     [49302, 11109, 10246, 49683, 43633, 13176, 472...
36    [39612, 19660, 49235, 43086, 46620, 34497, 486...
38    [11913, 18159, 4461, 21616, 23622, 32433, 2884...
96    [20574, 30391, 40706, 25610, 27966, 24489, 39275]
98    [8859, 19731, 43654, 13176, 4357, 37664, 34065...
Name: product_id, dtype: object

#### 从top_item_list中

In [53]:
positive_order_item_list = train_order_product_list.to_dict()

In [54]:
false_data = np.zeros((train.shape[0], n_feature))

In [55]:
negative_order_item_list = {}

def find_new_negative_product_id(order_id, product_id, negative_order_item_list, positive_order_item_list, top_item_list):
    product_id = positive_order_item_list[order_id][0]
    import random
    
    cnt = 10
    while cnt > 0:
        new_product_id = random.choice(top_item_list)
        if new_product_id not in negative_order_item_list[order_id] and new_product_id not in positive_order_item_list[order_id]:
            product_id = new_product_id
            break
        
        cnt -= 1
    
    return product_id

import time
a1 = time.time()
for idx in xrange(false_data.shape[0]):
    try:
        if (idx + 1) % 10000 == 0:
            print "finish:", 100 * float(idx) / false_data.shape[0]
            a2 = time.time()
            print "time_passed", int(a2 - a1)

        item = train_ix[idx]
        order_id = item['order_id']
        
        product_id = item['product_id']
        negative_order_item_list.setdefault(order_id, [])
        product_id = find_new_negative_product_id(order_id, product_id, negative_order_item_list, positive_order_item_list, top_item_list)
        negative_order_item_list[order_id].append(product_id)
        user_id = item['user_id']
        order_dow = item['order_dow']
        order_hour_of_day = item['order_hour_of_day']
        days_since_prior_order = item['days_since_prior_order']

        f_user_dow = [user_order_dow_cnt_idx.get((user_id, dow), 0) for dow in xrange(7)]
        f_user_hour = [user_hour_cnt_idx.get((user_id, hour), 0) for hour in xrange(24)]
        f_user_day_pass = user_day_pass_cnt_idx.get(user_id, 0)
        f_user_order_cnt = user_order_cnt_idx.get(user_id, 0)

        f_aisle = new_product_idx['aisle_id'][product_id]
        f_department = new_product_idx['department_id'][product_id]

        f_product_order_cnt = item_reorder_cnt_idx.get(product_id, 0)

        f_product_dow = [item_order_dow_cnt_idx.get((product_id, dow), 0) for dow in xrange(7)]
        f_product_hour = [item_hour_cnt_idx.get((product_id, hour), 0) for hour in xrange(24)]
        f_product_day_pass = item_day_pass_cnt_idx.get(product_id, 0)

        ax1 = time.time()
        product_id_list = item_user_reordered_idx[user_id].index
        f_item_user_reordered = item_user_reordered_idx[(user_id, product_id)] if product_id in product_id_list else 0
        product_id_list = item_user_day_pass_idx[user_id].index
        f_item_user_day_pass = item_user_day_pass_idx[(user_id, product_id)] if product_id in product_id_list else 0

        item_info = []
        item_info.append(f_user_day_pass)
        item_info.extend(f_user_dow)
        item_info.extend(f_user_hour)
        item_info.append(f_user_order_cnt)

        item_info.append(f_aisle)
        item_info.append(f_department)
        item_info.append(f_product_order_cnt)
        item_info.extend(f_product_dow)
        item_info.extend(f_product_hour)
        item_info.append(f_product_day_pass)

        item_info.append(f_item_user_reordered)
        item_info.append(f_item_user_day_pass)

        item_info.append(order_dow)
        item_info.append(order_hour_of_day)
        item_info.append(days_since_prior_order)

        false_data[idx] = item_info
    except:
        false_data[idx] = [0 for x in xrange(n_feature)]
        print "failed idx:", idx
        import traceback
        traceback.print_exc()

finish: 0.722149157493
time_passed 22
finish: 1.44437053712
time_passed 31
finish: 2.16659191675
time_passed 39
finish: 2.88881329638
time_passed 48
finish: 3.61103467602
time_passed 56
finish: 4.33325605565
time_passed 65
finish: 5.05547743528
time_passed 73
finish: 5.77769881491
time_passed 82
finish: 6.49992019454
time_passed 90
finish: 7.22214157417
time_passed 99
finish: 7.9443629538
time_passed 107
finish: 8.66658433343
time_passed 116
finish: 9.38880571306
time_passed 125
finish: 10.1110270927
time_passed 133
finish: 10.8332484723
time_passed 142
finish: 11.555469852
time_passed 151
finish: 12.2776912316
time_passed 160
finish: 12.9999126112
time_passed 169
finish: 13.7221339908
time_passed 179
finish: 14.4443553705
time_passed 188
finish: 15.1665767501
time_passed 198
finish: 15.8887981297
time_passed 207
finish: 16.6110195094
time_passed 216
finish: 17.333240889
time_passed 226
finish: 18.0554622686
time_passed 235
finish: 18.7776836483
time_passed 244
finish: 19.4999050279
ti

In [58]:
#TODO: 将data替换为false_data
negative_X = pd.DataFrame(false_data, columns=columns)
negtive_out_csv_file = DATA_PATH + "nega`tive_X.csv"
negative_X.to_csv(negtive_out_csv_file)

In [30]:
test_order = orders.loc[orders.eval_set == 'test']

In [31]:
list(item_user_reordered_idx[1].index)

[196,
 10258,
 10326,
 12427,
 13032,
 13176,
 14084,
 17122,
 25133,
 26088,
 26405,
 30450,
 35951,
 38928,
 39657,
 41787,
 46149,
 49235]

In [32]:
test_user_id_list = np.unique(test_order['user_id'])

In [33]:
len(test_user_id_list)

75000

In [34]:
test_product_num = sum([len(item_user_reordered_idx[i].index) for i in test_user_id_list])

In [35]:
test_data = np.zeros((test_product_num, n_feature))

In [36]:
test_order_ix = test_order.ix

In [37]:
test_order_index = list(test_order.index)

In [38]:
test_order.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


In [40]:
test_data.shape[0]

4833292

In [41]:
import time
a1 = time.time()

idx = 0
user_id_idx = 0
while idx < test_data.shape[0]:
    try:
        if (user_id_idx + 1) % 100 == 0:
            print "test", user_id_idx
            
        item = test_order_ix[test_order_index[user_id_idx]]
        user_id_idx += 1
        order_id = item['order_id']
        user_id = item['user_id']
        order_dow = item['order_dow']
        order_hour_of_day = item['order_hour_of_day']
        days_since_prior_order = item['days_since_prior_order']
        
        product_id_list = list(item_user_reordered_idx[user_id].index)
        
        for product_id in product_id_list:
            if (idx + 1) % 10000 == 0:
                print "finish:", 100 * float(idx) / test_data.shape[0], product_id
                a2 = time.time()
                print "time_passed", int(a2 - a1)
            
            idx += 1
            f_user_dow = [user_order_dow_cnt_idx.get((user_id, dow), 0) for dow in xrange(7)]
            f_user_hour = [user_hour_cnt_idx.get((user_id, hour), 0) for hour in xrange(24)]
            f_user_day_pass = user_day_pass_cnt_idx.get(user_id, 0)
            f_user_order_cnt = user_order_cnt_idx.get(user_id, 0)

            f_aisle = new_product_idx['aisle_id'][product_id]
            f_department = new_product_idx['department_id'][product_id]

            f_product_order_cnt = item_reorder_cnt_idx.get(product_id, 0)

            f_product_dow = [item_order_dow_cnt_idx.get((product_id, dow), 0) for dow in xrange(7)]
            f_product_hour = [item_hour_cnt_idx.get((product_id, hour), 0) for hour in xrange(24)]
            f_product_day_pass = item_day_pass_cnt_idx.get(product_id, 0)

            ax1 = time.time()
            product_id_list = item_user_reordered_idx[user_id].index
            f_item_user_reordered = item_user_reordered_idx[(user_id, product_id)] if product_id in product_id_list else 0
            product_id_list = item_user_day_pass_idx[user_id].index
            f_item_user_day_pass = item_user_day_pass_idx[(user_id, product_id)] if product_id in product_id_list else 0

            item_info = []
            item_info.append(f_user_day_pass)
            item_info.extend(f_user_dow)
            item_info.extend(f_user_hour)
            item_info.append(f_user_order_cnt)

            item_info.append(f_aisle)
            item_info.append(f_department)
            item_info.append(f_product_order_cnt)
            item_info.extend(f_product_dow)
            item_info.extend(f_product_hour)
            item_info.append(f_product_day_pass)

            item_info.append(f_item_user_reordered)
            item_info.append(f_item_user_day_pass)

            item_info.append(order_dow)
            item_info.append(order_hour_of_day)
            item_info.append(days_since_prior_order)

            test_data[idx] = item_info
    except:
        test_data[idx] = [0 for x in xrange(n_feature)]
        print "failed test_data idx:", idx

test 99
finish: 0.206877631229 22242
time_passed 30
test 199
test 299
finish: 0.413775952291 34134
time_passed 38
test 399
finish: 0.620674273352 22035
time_passed 46
test 499
test 599
finish: 0.827572594414 45723
time_passed 54
test 699
finish: 1.03447091548 8518
time_passed 62
test 799
test 899
finish: 1.24136923654 5097
time_passed 70
test 999
finish: 1.4482675576 23740
time_passed 79
test 1099
test 1199
finish: 1.65516587866 9558
time_passed 89
test 1299
test 1399
finish: 1.86206419972 8744
time_passed 98
test 1499
finish: 2.06896252078 20590
time_passed 108
test 1599
test 1699
finish: 2.27586084184 21938
time_passed 117
test 1799
finish: 2.48275916291 21288
time_passed 126
test 1899
finish: 2.68965748397 39108
time_passed 135
test 1999
test 2099
finish: 2.89655580503 8223
time_passed 144
test 2199
test 2299
finish: 3.10345412609 20082
time_passed 153
test 2399
finish: 3.31035244715 43987
time_passed 162
test 2499
test 2599
finish: 3.51725076821 19660
time_passed 171
test 2699
test

IndexError: index 4833292 is out of bounds for axis 0 with size 4833292

In [48]:
test_X = pd.DataFrame(test_data, columns=columns)
test_out_csv_file = DATA_PATH + "test_X.csv"
test_X.to_csv(test_out_csv_file)