In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.sparse
pd.set_option("display.max_columns",101)
RANDOM_STATE = 42

In [2]:
DATA_PATH = "../data/instacart/"

In [3]:
def save_sparse_matrix(filename, x):
    x_coo = x.tocoo()
    row = x_coo.row
    col = x_coo.col
    data = x_coo.data
    shape = x_coo.shape
    np.savez(filename, row=row, col=col, data=data, shape=shape)

def load_sparse_matrix(filename):
    y = np.load(filename)
    z = scipy.sparse.coo_matrix((y['data'], (y['row'], y['col'])), shape=y['shape'])
    return z

save_file = DATA_PATH + "user_item.npz"

In [4]:
load_S = load_sparse_matrix(save_file)
user_item_matrix = scipy.sparse.csr_matrix(load_S)

#### inverse_matrix

#### implement simple LFM model

In [275]:
item_pool = range(0, load_S.shape[1])

def random_select_negative_sample(items):
    import random
    ret = {}
    
    result = sum(items.values())
    for item, value in items.iteritems():
        ret[item] = value / float(result)
    
    n = 0
    for i in xrange(3 * len(items)):
        item = item_pool[random.randint(0, len(item_pool) - 1)]
        
        if item in items:
            continue
        
        ret[item] = 0
        
        n += 1
        if n >= len(items):
            break
    return ret

In [249]:
ret = random_select_negative_sample({1: 2, 2: 3})

In [250]:
ret

{1: 0.4, 2: 0.6, 3939: 0, 37818: 0}

In [79]:
xx = user_item_matrix.todok()

In [82]:
print xx.shape

(206209, 49677)


In [84]:
for i in xx.iteritems():
    print i 
    break

((84520, 6101), 4)


In [85]:
len(list(xx.iteritems()))

13307953

In [92]:
dok_matrix = user_item_matrix.todok()

In [95]:
def make_user_item_dict(X):
    from collections import defaultdict
    result = defaultdict(dict)
    for user_item, value in dok_matrix.iteritems():
        user_id, item_id = user_item
        result[user_id][item_id] = value
    return result

In [None]:
user_items = make_user_item_dict(dok_matrix)

In [253]:
u_items = user_items.items()

#### tuning the parameter, because parameter is too small, C should be larger

In [288]:
def LatentFactorModel(ui_data, n_factor, n_step, alpha, C):
    num_user, num_item = user_item_matrix.shape
    P_uk = np.ones((num_user, n_factor)) / float(n_factor)
    Q_ik = np.ones((num_item, n_factor)) / float(n_factor)
    for step in xrange(n_step):
        num_error = 0
        i = 0
        for user_id, items in ui_data.iteritems():
            i += 1
            if i % 10000 == 0:
                print 'count user:', i
            samples = random_select_negative_sample(items)
            for item_id, value in samples.iteritems():
                E_ui = value - P_uk[user_id, :].dot(Q_ik[item_id, :].transpose())
                old_error = E_ui * E_ui
                P_uk[user_id, :] = P_uk[user_id, :] + alpha * (Q_ik[item_id, :] * E_ui - C * P_uk[user_id, :])
                Q_ik[item_id, :] = Q_ik[item_id, :] + alpha * (P_uk[user_id, :] * E_ui - C * Q_ik[item_id, :])
                new_error = (value - P_uk[user_id, :].dot(Q_ik[item_id, :].transpose())) ** 2
                
                num_error += new_error
        alpha *= 0.9
    
        print "step:", step, "error:", num_error
    return P_uk, Q_ik

test_item = dict(u_items[:10000])
P, Q = LatentFactorModel(user_items, 10, 1, 0.5, 0.1)

count user: 10000
count user: 20000
count user: 30000
count user: 40000
count user: 50000
count user: 60000
count user: 70000
count user: 80000
count user: 90000
count user: 100000
count user: 110000
count user: 120000
count user: 130000
count user: 140000
count user: 150000
count user: 160000
count user: 170000
count user: 180000
count user: 190000
count user: 200000
step: 0 error: 10181.4843422


In [260]:
df = pd.DataFrame(P)

In [289]:
q_df = pd.DataFrame(Q)
q_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0,49677.0
mean,0.0007514803,0.0007514803,0.0007514803,0.0007514803,0.0007514803,0.0007514803,0.0007514803,0.0007514803,0.0007514803,0.0007514803
std,0.002000601,0.002000601,0.002000601,0.002000601,0.002000601,0.002000601,0.002000601,0.002000601,0.002000601,0.002000601
min,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11,1.679865e-11
25%,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05,1.349424e-05
50%,0.0001477994,0.0001477994,0.0001477994,0.0001477994,0.0001477994,0.0001477994,0.0001477994,0.0001477994,0.0001477994,0.0001477994
75%,0.0007121548,0.0007121548,0.0007121548,0.0007121548,0.0007121548,0.0007121548,0.0007121548,0.0007121548,0.0007121548,0.0007121548
max,0.09810913,0.09810913,0.09810913,0.09810913,0.09810913,0.09810913,0.09810913,0.09810913,0.09810913,0.09810913


#### Accoring to P and Q, we try to calculate result

In [294]:
orders = pd.read_csv(DATA_PATH + "orders.csv")

#### Get train use-item_list

In [295]:
train = pd.read_csv(DATA_PATH + "order_products__train.csv")

In [296]:
train_order_user = pd.merge(train, orders, on='order_id')

In [297]:
train_user_products = pd.DataFrame(train_order_user.groupby('user_id')['product_id'].apply(list))

In [298]:
train_user_products.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


In [301]:
prior = pd.read_csv(DATA_PATH + "order_products__prior.csv")
order_user = pd.merge(prior, orders, on='order_id')
prior_user_products = pd.DataFrame(order_user.groupby('user_id')['product_id'].apply(list))
def cal_dict(x):
    from collections import Counter
    a = Counter(x)
    return dict(a)

prior_user_products_dict = pd.DataFrame(prior_user_products['product_id'].apply(cal_dict))
product_id_list = np.unique(order_user.product_id)
product_id_dict = {}
for idx, product_id in enumerate(product_id_list):
    product_id_dict[product_id] = idx
    
user_id_list = np.unique(order_user.user_id)
user_id_dict = {}
for idx, user_id in enumerate(user_id_list):
    user_id_dict[user_id] = idx

In [321]:
train_user_products_dict.product_id

user_id
1         {27845: 1, 196: 1, 26405: 1, 13032: 1, 39657: ...
2         {48523: 1, 24838: 1, 13640: 1, 11913: 1, 45066...
5         {40706: 1, 21413: 1, 20843: 1, 48204: 1, 21616...
7         {17638: 1, 29894: 1, 47272: 1, 45066: 1, 13198...
8         {15937: 1, 37803: 1, 41540: 1, 21903: 1, 10960...
9         {17600: 1, 8834: 1, 42347: 1, 27973: 1, 27596:...
10                 {48720: 1, 10177: 1, 29650: 1, 24654: 1}
13        {27086: 1, 4210: 1, 27435: 1, 47078: 1, 19934: 1}
14        {3808: 1, 11042: 1, 15172: 1, 29509: 1, 8744: ...
17        {12720: 1, 1217: 1, 4374: 1, 18534: 1, 43352: ...
18        {8518: 1, 22888: 1, 5450: 1, 25997: 1, 11022: ...
21        {32645: 1, 16615: 1, 44632: 1, 12683: 1, 25740...
23        {8224: 1, 42372: 1, 33797: 1, 32327: 1, 13544:...
24                                               {31222: 1}
27        {11777: 1, 44932: 1, 45446: 1, 32263: 1, 12745...
29        {39170: 1, 20874: 1, 37645: 1, 49615: 1, 20305...
30                              

In [320]:
prior_user_products_dict.product_id

user_id
1         {196: 10, 46149: 3, 12427: 10, 38928: 1, 10258...
2         {38656: 1, 34688: 3, 40198: 2, 45066: 3, 2573:...
3         {17668: 5, 47766: 9, 44683: 2, 21903: 8, 14992...
4         {21573: 1, 35469: 2, 37646: 1, 26576: 1, 43704...
5         {11777: 4, 40706: 2, 8518: 2, 48775: 1, 18761:...
6         {40992: 1, 27521: 1, 20323: 1, 48679: 1, 8424:...
7         {11520: 1, 35333: 2, 519: 2, 10504: 1, 45066: ...
8         {11136: 1, 8193: 1, 17794: 3, 39812: 1, 24838:...
9         {8834: 2, 12075: 1, 38277: 2, 30252: 2, 5002: ...
10        {28928: 1, 36865: 2, 40706: 3, 20995: 1, 260: ...
11        {17794: 1, 8197: 1, 30855: 1, 33037: 1, 30480:...
12        {11520: 1, 45056: 1, 17794: 1, 44422: 1, 17159...
13        {41926: 1, 41351: 2, 41480: 1, 37385: 1, 31372...
14        {8193: 1, 17923: 2, 18439: 1, 45066: 1, 34827:...
15        {11266: 10, 37059: 1, 196: 5, 10441: 8, 48142:...
16        {15872: 1, 28289: 1, 17794: 2, 43014: 2, 651: ...
17        {5128: 5, 38444: 1, 40

In [303]:
def cal_dict(x):
    from collections import Counter
    a = Counter(x)
    return dict(a)

train_user_products_dict = pd.DataFrame(train_user_products['product_id'].apply(cal_dict))

#### Accoring to train_data, calculate the threshold for all provided data

In [306]:
train_user_products_dict.product_id

user_id
1         [196, 25133, 38928, 26405, 39657, 10258, 13032...
2         [22963, 7963, 16589, 32792, 41787, 22825, 1364...
5         [15349, 19057, 16185, 21413, 20843, 20114, 482...
7         [12053, 47272, 37999, 13198, 43967, 40852, 176...
8         [15937, 5539, 10960, 23165, 22247, 4853, 27104...
9         [27555, 42347, 27596, 8834, 26604, 12075, 8467...
10                             [29650, 48720, 24654, 10177]
13                       [27435, 27086, 4210, 47078, 19934]
14        [11042, 32115, 28601, 29615, 15869, 37434, 380...
17                 [18534, 1217, 12720, 16797, 43352, 4374]
18        [36216, 47546, 21137, 5450, 8518, 22031, 22888...
21               [25740, 12683, 44632, 10957, 32645, 16615]
23        [13544, 42372, 33819, 33797, 20468, 13481, 822...
24                                                  [31222]
27        [2966, 21386, 11777, 21137, 4920, 46906, 33754...
29        [39170, 20305, 6128, 19541, 35385, 48800, 3980...
30                              

In [314]:
def predict(train_dict, P_uk, Q_ik, user_dict, item_dict):
    from collections import defaultdict
    result = defaultdict(dict)
    for x in train_dict['product_id'].iteritems():
        user_id, product_dict = x[0], x[1]
        for product_id, num in product_dict.iteritems():
            if product_id not in item_dict:
                continue
            user_idx = user_dict[user_id]
            item_idx = item_dict[product_id]
            result[user_id][product_id] = P_uk[user_idx, :].dot(Q_ik[item_idx, :].transpose())
    return result

In [352]:
train_result = predict(prior_user_products_dict, P, Q, user_id_dict, product_id_dict)

In [353]:
def normalize(data):
    result = {}
    for user_id, product_dict in data.iteritems():
        cnt = sum(product_dict.values())
        result[user_id] = {}
        for product_id, value in product_dict.iteritems():
            result[user_id][product_id] = value / cnt
    return result

normal_data = normalize(train_result)

In [354]:
def score_f(predict_list, product_list):
    recall = 0
    precision = 0
    
    same_one = 0
    for product_id in predict_list:
        if product_id in product_list:
            same_one += 1
    
    if len(predict_list) == 0:
        return 0
    
    recall = float(same_one) / len(product_list)
    precision = float(same_one) / len(predict_list)
    
    if recall == 0 and precision == 0:
        return 0
    
    return 2 * (recall * precision) / (recall + precision)

In [330]:
def cal_result(norm_data, train_dict, theshold):
    f_score = 0
    for x in train_dict['product_id'].iteritems():
        user_id, product_dict = x[0], x[1]
        
        product_list = list(product_dict.keys())
        predict_dict = normal_data[user_id]
        predict_list = []
        for key, value in predict_dict.iteritems():
            if value >= theshold:
                predict_list.append(key)
                
        f_score += score_f(predict_list, product_list)
    
    f_score /= float(len(train_dict))
    return f_score

In [375]:
cal_result(normal_data, train_user_products_dict, 0.0032)

0.19995515820848808

In [376]:
cal_result(normal_data, train_user_products_dict, 0.0034)

0.1999567086194917

In [378]:
cal_result(normal_data, train_user_products_dict, 0.0035)

0.19995918424112527

In [385]:
test_user_id_list = pd.DataFrame(orders.groupby('eval_set')['user_id'].apply(list))

In [391]:
unique_user_id_list = np.unique(test_user_id_list.user_id['test'])

In [393]:
len(unique_user_id_list)

75000

In [398]:
def predict_result(norm_data, user_id_list, theshold=0.0035):
    result = {}
    for user_id in user_id_list:
        predict_dict = normal_data[user_id]
        predict_list = []
        for key, value in predict_dict.iteritems():
            if value >= theshold:
                predict_list.append(key)
                
        result[user_id] = predict_list
    
    return result

In [399]:
test_result = predict_result(normal_data, unique_user_id_list)

In [404]:
test_order_data = orders.ix[orders['eval_set'] == 'test']

In [405]:
test_order_data.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


In [406]:
test_order_data.iteritems()

<generator object iteritems at 0x4771aef50>

In [416]:
print_data = []
for x in test_order_data.iterrows():
    order_id = x[1].order_id
    user_id = x[1].user_id
    print_data.append((order_id, test_result[user_id]))

In [419]:
sorted_print_data = sorted(print_data)

In [420]:
sorted_print_data[0]

(17,
 [26429,
  40002,
  16965,
  11494,
  13107,
  39275,
  21709,
  21903,
  18288,
  44056,
  6291,
  47766,
  21463,
  39928,
  38777,
  7035,
  31964,
  15613,
  13535])