In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.sparse
pd.set_option("display.max_columns",101)
RANDOM_STATE = 42

In [2]:
DATA_PATH = "../data/instacart/"

In [3]:
def save_sparse_matrix(filename, x):
    x_coo = x.tocoo()
    row = x_coo.row
    col = x_coo.col
    data = x_coo.data
    shape = x_coo.shape
    np.savez(filename, row=row, col=col, data=data, shape=shape)

def load_sparse_matrix(filename):
    y = np.load(filename)
    z = scipy.sparse.coo_matrix((y['data'], (y['row'], y['col'])), shape=y['shape'])
    return z

save_file = DATA_PATH + "user_item.npz"

In [4]:
load_S = load_sparse_matrix(save_file)
user_item_matrix = scipy.sparse.csr_matrix(load_S)

#### inverse_matrix

#### implement simple LFM model

In [5]:
item_pool = range(0, load_S.shape[1])

def random_select_negative_sample(items):
    import random
    ret = {}
    
    result = sum(items.values())
    for item, value in items.iteritems():
        ret[item] = value / float(result)
    
    n = 0
    for i in xrange(3 * len(items)):
        item = item_pool[random.randint(0, len(item_pool) - 1)]
        
        if item in items:
            continue
        
        ret[item] = 0
        
        n += 1
        if n >= len(items):
            break
    return ret

In [6]:
ret = random_select_negative_sample({1: 2, 2: 3})

In [7]:
ret

{1: 0.4, 2: 0.6, 22158: 0, 33670: 0}

In [8]:
xx = user_item_matrix.todok()

In [9]:
print xx.shape

(206209, 49677)


In [10]:
for i in xx.iteritems():
    print i 
    break

((97555, 46216), 13)


In [11]:
len(list(xx.iteritems()))

9098126

In [12]:
dok_matrix = user_item_matrix.todok()

In [13]:
def make_user_item_dict(X):
    from collections import defaultdict
    result = defaultdict(dict)
    for user_item, value in dok_matrix.iteritems():
        user_id, item_id = user_item
        result[user_id][item_id] = value
    return result

In [14]:
user_items = make_user_item_dict(dok_matrix)

In [15]:
u_items = user_items.items()

#### tuning the parameter, because parameter is too small, C should be larger

In [None]:
def LatentFactorModel(ui_data, n_factor, n_step, alpha, C):
    num_user, num_item = user_item_matrix.shape
    P_uk = np.ones((num_user, n_factor)) / float(n_factor)
    Q_ik = np.ones((num_item, n_factor)) / float(n_factor)
    for step in xrange(n_step):
        num_error = 0
        i = 0
        for user_id, items in ui_data.iteritems():
            i += 1
            if i % 10000 == 0:
                print 'count user:', i
            samples = random_select_negative_sample(items)
            for item_id, value in samples.iteritems():
                E_ui = value - P_uk[user_id, :].dot(Q_ik[item_id, :].transpose())
                old_error = E_ui * E_ui
                P_uk[user_id, :] = P_uk[user_id, :] + alpha * (Q_ik[item_id, :] * E_ui - C * P_uk[user_id, :])
                Q_ik[item_id, :] = Q_ik[item_id, :] + alpha * (P_uk[user_id, :] * E_ui - C * Q_ik[item_id, :])
                new_error = (value - P_uk[user_id, :].dot(Q_ik[item_id, :].transpose())) ** 2
                
                num_error += new_error
        alpha *= 0.9
    
        print "step:", step, "error:", num_error
    return P_uk, Q_ik

P, Q = LatentFactorModel(user_items, 10, 1, 0.5, 0.1)

count user: 10000
count user: 20000
count user: 30000
count user: 40000
count user: 50000
count user: 60000
count user: 70000
count user: 80000
count user: 90000


In [438]:
df = pd.DataFrame(P)

In [439]:
q_df = pd.DataFrame(Q)
q_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0
mean,0.0007518801,0.0007518801,0.0007518801,0.0007518801,0.0007518801,0.0007518801,0.0007518801,0.0007518801,0.0007518801,0.0007518801
std,0.001993054,0.001993054,0.001993054,0.001993054,0.001993054,0.001993054,0.001993054,0.001993054,0.001993054,0.001993054
min,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11,3.288207e-11
25%,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05,1.347168e-05
50%,0.0001457287,0.0001457287,0.0001457287,0.0001457287,0.0001457287,0.0001457287,0.0001457287,0.0001457287,0.0001457287,0.0001457287
75%,0.0007135976,0.0007135976,0.0007135976,0.0007135976,0.0007135976,0.0007135976,0.0007135976,0.0007135976,0.0007135976,0.0007135976
max,0.0810607,0.0810607,0.0810607,0.0810607,0.0810607,0.0810607,0.0810607,0.0810607,0.0810607,0.0810607


#### Accoring to P and Q, we try to calculate result

In [440]:
orders = pd.read_csv(DATA_PATH + "orders.csv")

#### Get train use-item_list

In [441]:
train = pd.read_csv(DATA_PATH + "order_products__train.csv")

In [442]:
train_order_user = pd.merge(train, orders, on='order_id')

In [443]:
train_user_products = pd.DataFrame(train_order_user.groupby('user_id')['product_id'].apply(list))

In [444]:
train_user_products.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


In [445]:
prior = pd.read_csv(DATA_PATH + "order_products__prior.csv")
order_user = pd.merge(prior, orders, on='order_id')
prior_user_products = pd.DataFrame(order_user.groupby('user_id')['product_id'].apply(list))
def cal_dict(x):
    from collections import Counter
    a = Counter(x)
    return dict(a)

prior_user_products_dict = pd.DataFrame(prior_user_products['product_id'].apply(cal_dict))
product_id_list = np.unique(order_user.product_id)
product_id_dict = {}
for idx, product_id in enumerate(product_id_list):
    product_id_dict[product_id] = idx
    
user_id_list = np.unique(order_user.user_id)
user_id_dict = {}
for idx, user_id in enumerate(user_id_list):
    user_id_dict[user_id] = idx

In [448]:
def cal_dict(x):
    from collections import Counter
    a = Counter(x)
    return dict(a)

train_user_products_dict = pd.DataFrame(train_user_products['product_id'].apply(cal_dict))

#### Accoring to train_data, calculate the threshold for all provided data

In [450]:
def predict(train_dict, P_uk, Q_ik, user_dict, item_dict):
    from collections import defaultdict
    result = defaultdict(dict)
    for x in train_dict['product_id'].iteritems():
        user_id, product_dict = x[0], x[1]
        for product_id, num in product_dict.iteritems():
            if product_id not in item_dict:
                continue
            user_idx = user_dict[user_id]
            item_idx = item_dict[product_id]
            result[user_id][product_id] = P_uk[user_idx, :].dot(Q_ik[item_idx, :].transpose())
    return result

In [451]:
train_result = predict(prior_user_products_dict, P, Q, user_id_dict, product_id_dict)

In [452]:
def normalize(data):
    result = {}
    for user_id, product_dict in data.iteritems():
        cnt = sum(product_dict.values())
        result[user_id] = {}
        for product_id, value in product_dict.iteritems():
            result[user_id][product_id] = value / cnt
    return result

normal_data = normalize(train_result)

In [453]:
def score_f(predict_list, product_list):
    recall = 0
    precision = 0
    
    same_one = 0
    for product_id in predict_list:
        if product_id in product_list:
            same_one += 1
    
    if len(predict_list) == 0:
        return 0
    
    recall = float(same_one) / len(product_list)
    precision = float(same_one) / len(predict_list)
    
    if recall == 0 and precision == 0:
        return 0
    
    return 2 * (recall * precision) / (recall + precision)

In [None]:
from collectios import defaultdict
user_order_list = orders.groupby('user_id')['order_id'].apply(list)
user_order_dict = user_order_list.to_dict()
cart_number_dict = order_user.groupby('order_id')['add_to_cart_order'].apply(list)
unique_cart_number_list = cart_number_dict.apply(np.unique)

average_length_dict = {}
for user_id, order_list in user_order_list.iteritems():
    order_length_list = [len(cart_number_dict[order_id]) for order_id in order_list]
    average_length_dict[user_id] = max(1, sum(order_length_list) / len(order_length_list)

In [454]:
def cal_result(norm_data, train_dict, theshold):
    f_score = 0
    for x in train_dict['product_id'].iteritems():
        user_id, product_dict = x[0], x[1]
        
        product_list = list(product_dict.keys())
        predict_dict = normal_data[user_id]
        predict_list = []
        for key, value in predict_dict.iteritems():
            if value >= theshold:
                predict_list.append(key)
                
        f_score += score_f(predict_list, product_list)
    
    f_score /= float(len(train_dict))
    return f_score

In [455]:
cal_result(normal_data, train_user_products_dict, 0.0032)

0.20005023085593565

In [456]:
cal_result(normal_data, train_user_products_dict, 0.0034)

0.20005631213031477

In [457]:
cal_result(normal_data, train_user_products_dict, 0.0035)

0.2000537168876042

In [458]:
test_user_id_list = pd.DataFrame(orders.groupby('eval_set')['user_id'].apply(list))

In [459]:
unique_user_id_list = np.unique(test_user_id_list.user_id['test'])

In [460]:
len(unique_user_id_list)

75000

In [461]:
def predict_result(norm_data, user_id_list, theshold=0.0035):
    result = {}
    for user_id in user_id_list:
        predict_dict = normal_data[user_id]
        predict_list = []
        for key, value in predict_dict.iteritems():
            if value >= theshold:
                predict_list.append(key)
                
        result[user_id] = predict_list
    
    return result

In [462]:
test_result = predict_result(normal_data, unique_user_id_list)

In [463]:
test_order_data = orders.ix[orders['eval_set'] == 'test']

In [464]:
test_order_data.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


In [465]:
test_order_data.iteritems()

<generator object iteritems at 0x4771aeaa0>

In [466]:
print_data = []
for x in test_order_data.iterrows():
    order_id = x[1].order_id
    user_id = x[1].user_id
    print_data.append((order_id, test_result[user_id]))

In [467]:
sorted_print_data = sorted(print_data)

In [468]:
sorted_print_data[0]

(17,
 [26429,
  40002,
  16965,
  11494,
  13107,
  39275,
  21709,
  21903,
  18288,
  44056,
  6291,
  47766,
  21463,
  39928,
  38777,
  7035,
  31964,
  15613,
  13535])

In [471]:
model_id_list = [x[0] for x in sorted_print_data]
product_id_data = [" ".join(map(str, x[1])) for x in sorted_print_data]

In [476]:
data = pd.DataFrame({'order_id': model_id_list, "products": product_id_data})

In [477]:
data.head()

Unnamed: 0,order_id,products
0,17,26429 40002 16965 11494 13107 39275 21709 2190...
1,34,26369 36994 7559 18441 651 39180 5134 31533 43...
2,137,42625 28934 22281 7179 5134 27663 7952 47630 4...
3,182,34824 26128 27156 17948 2078 48682 14891 23645...
4,257,1025 24964 47766 24838 29837 37646 39055 21137...


In [478]:
data.to_csv(DATA_PATH + "predict.csv", index=False)