In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
#import lightgbm as lgb

In [2]:
priors = pd.read_csv('order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

In [3]:
train = pd.read_csv('order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

In [4]:
orders = pd.read_csv('orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

In [5]:
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

In [6]:
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [7]:
orders.columns

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [8]:
orders['user_id'].unique()

array([     1,      2,      3, ..., 206207, 206208, 206209], dtype=int64)

In [9]:
orders['eval_set'].value_counts()

prior    3214874
train     131209
test       75000
Name: eval_set, dtype: int64

In [10]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [11]:
train['product_id'] = train['product_id'].astype(str)
priors['product_id'] = priors['product_id'].astype(str)

In [12]:
train_products = train.groupby("order_id").apply(lambda order: order['product_id'].tolist())
prior_products = priors.groupby("order_id").apply(lambda order: order['product_id'].tolist())

In [13]:
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)

In [14]:
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

In [15]:
prods.head()

Unnamed: 0_level_0,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1852,1136.0,0.613391
10,2572,1304.0,0.506998
100,437,245.0,0.560641
1000,2610,1065.0,0.408046
10000,11,4.0,0.363636


In [16]:
type(train_products)

pandas.core.series.Series

In [17]:
user_prods = pd.DataFrame()

In [18]:
user_prods['products'] = train_products

In [19]:
user_prods.dtypes

products    object
dtype: object

In [20]:
user_prods[2:4]

Unnamed: 0_level_0,products
order_id,Unnamed: 1_level_1
38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"


In [21]:
user_prods['order_id'] = user_prods.index

In [22]:
user_prods.head()

Unnamed: 0_level_0,products,order_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[49302, 11109, 10246, 49683, 43633, 13176, 472...",1
36,"[39612, 19660, 49235, 43086, 46620, 34497, 486...",36
38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884...",38
96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]",96
98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065...",98


In [23]:
train_prods = user_prods.merge(orders, on ='order_id')

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


In [24]:
train_prods.shape

(131209, 8)

In [25]:
user_prods = pd.DataFrame()
user_prods['products'] = prior_products
user_prods['order_id'] = user_prods.index
prior_prods = user_prods.merge(orders, on ='order_id')

Defaulting to column, but this will raise an ambiguity error in a future version
  after removing the cwd from sys.path.


In [26]:
prior_prods.shape

(3214874, 8)

In [27]:
prior_prods.head()

Unnamed: 0,products,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,"[33120, 28985, 9327, 45918, 30035, 17794, 4014...",2,202279,prior,3,5,9,8.0
1,"[33754, 24838, 17704, 21903, 17668, 46667, 174...",3,205970,prior,16,5,17,12.0
2,"[46842, 26434, 39758, 27761, 10054, 21351, 225...",4,178520,prior,36,1,9,7.0
3,"[13176, 15005, 47329, 27966, 23909, 48370, 132...",5,156122,prior,42,6,16,9.0
4,"[40462, 15873, 41897]",6,22352,prior,4,1,12,30.0


In [28]:
type(prior_prods["products"])

pandas.core.series.Series

In [29]:
no_of_prods = prior_prods['products'].apply(len)

In [30]:
all_prior_products = prior_prods.groupby("user_id").apply(lambda order: order['products'].tolist())

In [31]:
all_prior_prods = pd.DataFrame()
all_prior_prods['products'] = all_prior_products
all_prior_prods['user_id'] = all_prior_products.index

In [32]:
all_prior_prods.head()

Unnamed: 0_level_0,products,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[[196, 12427, 10258, 25133, 10326, 17122, 4178...",1
2,"[[49451, 32792, 32139, 34688, 36735, 37646, 22...",2
3,"[[38596, 21903, 248, 40604, 8021, 17668, 21137...",3
4,"[[22199, 25146], [1200, 17769, 43704, 37646, 1...",4
5,"[[27344, 24535, 43693, 40706, 16168, 21413, 13...",5


In [33]:
max_users = all_prior_prods.shape[0]

In [34]:
max_prods = products.shape[0]
print(max_prods,max_users)

49688 206209


In [37]:
user_prod = np.zeros((1000,max_prods))

In [49]:
p=0
for i in range(1000):
    x = list(all_prior_prods.iloc[i,0])
    x = np.array(x)
    for k in range(len(x)):
        z = np.array(x[k])
        for prod in z:
            prod = int(prod)
            user_prod[i,prod-1] =  user_prod[i,prod-1]+1  
           

In [50]:
user_prod

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
non_zero = np.count_nonzero(user_prod)

In [58]:
non_zero

62102

In [59]:
k = user_prod.shape[0]*user_prod.shape[1]

In [60]:
k

49688000

In [61]:
sparsity = 1 - non_zero/k

In [62]:
sparsity

0.9987501610046692

In [51]:
Norm_user_prod = np.zeros((1000,max_prods))

In [52]:
Norm_user_prod=user_prod/np.sum(user_prod,axis=1).reshape(1000,1)

In [53]:
Norm_user_prod[0,:].sum()

1.0

In [54]:
cos_lib = cosine_similarity(user_prod[0].reshape(1,-1),user_prod[1].reshape(1,-1))
print(cos_lib)

[[0.00798146]]


In [None]:
from sklearn.neighbors import LSHForest
>>> X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]]
>>> X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]]
>>> lshf = LSHForest(random_state=42)
>>> lshf.fit(X_train)  # doctest: +NORMALIZE_WHITESPACE
LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10,
          n_neighbors=5, radius=1.0, radius_cutoff_ratio=0.9,
          random_state=42)
>>> distances, indices = lshf.kneighbors(X_test, n_neighbors=2)
>>> distances                                        # doctest: +ELLIPSIS
array([[ 0.069...,  0.149...],
       [ 0.229...,  0.481...],
       [ 0.004...,  0.014...]])
>>> indices
array([[1, 2],
       [2, 0],
       [4, 0]])

In [55]:
from sklearn.neighbors import LSHForest
X_train = Norm_user_prod[0:800,:]
X_test =  Norm_user_prod[800:1000,:]
lshf = LSHForest(random_state=42)
lshf.fit(X_train)
distances, indices = lshf.kneighbors(X_test, n_neighbors=2)




In [56]:
indices

array([[128, 650],
       [322, 503],
       [ 27, 125],
       [707, 572],
       [288, 615],
       [320, 312],
       [ 72, 450],
       [ 18, 706],
       [ 63, 128],
       [691, 133],
       [310, 481],
       [321,  84],
       [284, 695],
       [117, 526],
       [277, 101],
       [523, 318],
       [450,  72],
       [450, 508],
       [288, 737],
       [ 37,  78],
       [195, 554],
       [ 64, 445],
       [653, 368],
       [544, 566],
       [195, 445],
       [423, 469],
       [ 83, 284],
       [104, 572],
       [195, 379],
       [799, 400],
       [393, 288],
       [ 94, 648],
       [501, 652],
       [153, 728],
       [381, 619],
       [429, 346],
       [456, 284],
       [309, 504],
       [582, 452],
       [650, 178],
       [400, 463],
       [652, 226],
       [134, 728],
       [414, 574],
       [ 90, 390],
       [210, 111],
       [699,  78],
       [219, 462],
       [ 78, 631],
       [573, 382],
       [705, 201],
       [ 72, 101],
       [631,