In [1]:
import operator
import warnings
import pandas as pd


In [2]:
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.4f' % x)


In [24]:
# reading data
prior_orders = pd.read_csv('order_products__prior.csv')
train_orders = pd.read_csv('order_products__train.csv')
orders = pd.read_csv('orders.csv')

In [25]:
# 選取testing中，75000位顧客的全部資料。
test  = orders[orders['eval_set'] == 'test']
user_ids = test['user_id'].values
orders = orders[orders['user_id'].isin(user_ids)]
orders.head(13)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
26,1374495,3,prior,1,1,14,
27,444309,3,prior,2,3,19,9.0
28,3002854,3,prior,3,3,16,21.0
29,2037211,3,prior,4,2,18,20.0
30,2710558,3,prior,5,0,17,12.0
31,1972919,3,prior,6,0,16,7.0
32,1839752,3,prior,7,0,15,7.0
33,3225766,3,prior,8,0,17,7.0
34,3160850,3,prior,9,0,16,7.0
35,676467,3,prior,10,3,16,17.0


In [28]:
# seperate orders into prior/train sets
# turns out there are no test user_ids in the training set so train will be empty
prior = orders[orders['eval_set'] == 'prior']
train = orders[orders['eval_set'] == 'train']

# add column: find the number of the last order
prior['num_orders'] = prior.groupby(['user_id'])['order_number'].transform(max)
train['num_orders'] = train.groupby(['user_id'])['order_number'].transform(max)
prior.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,num_orders
26,1374495,3,prior,1,1,14,,12
27,444309,3,prior,2,3,19,9.0,12
28,3002854,3,prior,3,3,16,21.0,12
29,2037211,3,prior,4,2,18,20.0,12
30,2710558,3,prior,5,0,17,12.0,12


In [21]:
prior_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [14]:
# 整理prior顧客的消費記錄成submit的形式
prior_products = pd.DataFrame(prior_orders.groupby('order_id')['product_id'].apply(list))
prior_products.reset_index(level=['order_id'], inplace=True)
prior_products.columns = ['order_id','products_list']

# 整理training顧客的消費記錄成submit的形式
train_products = pd.DataFrame(train_orders.groupby('order_id')['product_id'].apply(list))
train_products.reset_index(level=['order_id'], inplace=True)
train_products.columns = ['order_id','products_list']
train_products.head()

Unnamed: 0,order_id,products_list
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [31]:
# merge order_list and order_info into one dataframe
prior = pd.merge(prior, prior_products, on='order_id', how='left')
train = pd.merge(train, train_products, on='order_id', how='left')
comb = pd.concat([prior, train], axis=0).reset_index(drop=True)
comb.head(1)

Unnamed: 0,days_since_prior_order,eval_set,num_orders,order_dow,order_hour_of_day,order_id,order_number,products_list,user_id
0,,prior,12,1,14,1374495,1,"[9387, 17668, 15143, 16797, 39190, 47766, 2190...",3


In [18]:
# test只留下order_id和user_id
# prior&train只留下第幾次消費,消費數量和消費明細
test_cols = ['order_id','user_id']
cols = ['order_id','user_id','order_number','num_orders','products_list']
comb = comb[cols]
test = test[test_cols]

print (test.head(1))
comb.head(1)

    order_id  user_id
38   2774568        3


Unnamed: 0,order_id,user_id,order_number,num_orders,products_list
0,1374495,3,1,12,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."


In [10]:
# iterate through dataframe, adding data to dictionary
# data added is in the form of a list:
    # list[0] = weight of the data: (1 + current order number / final order number), thus later data is weighted more
    # list[1] = how important the item is to the buyer: (order in the cart / number of items bought), thus items bought first are weighted more

# also used the average amount of items bought every order as a benchmark for how many items to add per user in the final submission

product_dict = {}
for i, row in comb.iterrows():
    if i % 100000 == 0:
        print('Iterated Through {} Rows...'.format(i))

    if row['user_id'] in product_dict:
        index = 1
        list.append(product_dict[row['user_id']]['len_products'], len(row['products_list']))
        for val in row['products_list']:
            if val in product_dict[row['user_id']]:
                product_dict[row['user_id']][val][0] += 1 + int(row['order_number']) / int(row['num_orders'])
                list.append(product_dict[row['user_id']][val][1], index / len(row['products_list']))
            else:
                product_dict[row['user_id']][val] = [1 + int(row['order_number']) / int(row['num_orders']),
                                              [index / len(row['products_list'])]]
            index += 1
    else:
        index = 1
        product_dict[row['user_id']] = {'len_products': [
            len(row['products_list'])]}
        for val in row['products_list']:
            product_dict[row['user_id']][val] = [1 + int(row['order_number']) / int(row['num_orders']),
                                          [index / len(row['products_list'])]]
            index += 1


Iterated Through 0 Rows...
Iterated Through 100000 Rows...
Iterated Through 200000 Rows...
Iterated Through 300000 Rows...
Iterated Through 400000 Rows...
Iterated Through 500000 Rows...
Iterated Through 600000 Rows...
Iterated Through 700000 Rows...
Iterated Through 800000 Rows...
Iterated Through 900000 Rows...
Iterated Through 1000000 Rows...
Iterated Through 1100000 Rows...


In [21]:
final_data = {}
for user_id in product_dict:
    final_data[user_id] = {}
    for product_id in product_dict[user_id]:
        if product_id == 'len_products':
            final_data[user_id][product_id] = \
                round(sum(product_dict[user_id][product_id])/
                    len(product_dict[user_id][product_id]))
        else:
            final_data[user_id][product_id] = \
                [product_dict[user_id][product_id][0],1/
                 (sum(product_dict[user_id][product_id][1])/
                len(product_dict[user_id][product_id][1]))]


1. 顧次每次購買數量
2. 1 + current order number / final order number
3. 每一次出現先後的比例
4. 最後的data是把它加起來的整理

In [23]:
# iterate through testing dataframe
# every user_id in test corresponds to a dictionary entry
# call the dictionary with every row, products by weight, combine them into a string, and append them to products

products = []
for i, row in test.iterrows():
    if i % 100000 == 0:
        print('Iterated Through {} Rows...'.format(i))

    final_products = []
    len_products = None
    total_products = final_data[row['user_id']].items()
    for product in total_products:
        if product[0] == 'len_products':
            len_products = product[1]
        else:
            list.append(final_products, product)

    output = []
    product_list = sorted(final_products, key=operator.itemgetter(1), reverse=True)
    for val in product_list[:len_products]:
        list.append(output, str(val[0]))
    final_output = ' '.join(output)
    list.append(products, final_output)


Iterated Through 1700000 Rows...


In [91]:
# create submission
submission = pd.DataFrame()
submission['order_id'] = test['order_id']
submission['products'] = products
#submission.to_csv('submission.csv', index=False)
print(submission.isnull().sum())
submission.head()

order_id    0
products    0
dtype: int64


Unnamed: 0,order_id,products
38,2774568,39190 47766 21903 17668 18599 9387 43961
44,329954,35469 26576 25623 21573
53,1528013,38293 21903 49401 25659 8424
96,1376945,27959 8309 14947 35948 28465 34658 8670 42585 ...
102,1356845,13176 14992 10863 7076 8239 5746 28134 21616 2...
