In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import copy
# Suppress warnings for the rest of the session: no category is passed, so the
# 'ignore' action applies to every warning class.
# NOTE(review): this also hides any pandas warnings raised by later cells — confirm that is intended.
warnings.simplefilter(action='ignore')

In [2]:
# Only these three leading columns are used by the notebook. Selecting them at
# parse time (instead of reading the whole file and slicing with .iloc
# afterwards) avoids materialising the unused column(s) for ~32M rows.
KEPT_COLUMNS = ['order_id', 'product_id', 'add_to_cart_order']

# The "prior" file serves as the training split, the "train" file as the test split.
data_train = pd.read_csv('order_products__prior.csv', usecols=KEPT_COLUMNS)
data_test = pd.read_csv('order_products__train.csv', usecols=KEPT_COLUMNS)

In [3]:
# Report (rows, columns) for the training frame first, then the test frame.
for frame in (data_train, data_test):
    print(frame.shape)

(32434489, 3)
(1384617, 3)


In [4]:
# Display the training frame (read from order_products__prior.csv, first three columns).
data_train

Unnamed: 0,order_id,product_id,add_to_cart_order
0,2,33120,1
1,2,28985,2
2,2,9327,3
3,2,45918,4
4,2,30035,5
...,...,...,...
32434484,3421083,39678,6
32434485,3421083,11352,7
32434486,3421083,4600,8
32434487,3421083,24852,9


In [5]:
# Display the test frame (read from order_products__train.csv, first three columns).
data_test

Unnamed: 0,order_id,product_id,add_to_cart_order
0,1,49302,1
1,1,11109,2
2,1,10246,3
3,1,49683,4
4,1,43633,5
...,...,...,...
1384612,3421063,14233,3
1384613,3421063,35548,4
1384614,3421070,35951,1
1384615,3421070,16953,2


In [6]:
# Distinct order ids per split. Deduplicate inside pandas first and only then
# build the Python set, so the set constructor walks the distinct ids rather
# than every row of the frame. `.tolist()` keeps the elements as plain ints.
unique_train_order_ids = set(data_train['order_id'].unique().tolist())
unique_test_order_ids = set(data_test['order_id'].unique().tolist())

In [7]:
n_train_orders = len(unique_train_order_ids)  # number of orders in training
n_test_orders = len(unique_test_order_ids)  # number of orders in test
print(n_train_orders)
print(n_test_orders)

3214874
131209


In [8]:
# Order ids present in both splits; an empty set means the splits are disjoint.
unique_train_order_ids & unique_test_order_ids  # no overlap

set()

### Only include items which appear >= 50000 times (counted across the training and test files combined)

In [9]:
# Stack both splits row-wise so item frequencies are counted over all orders.
frames_to_stack = [data_train, data_test]
data_all = pd.concat(frames_to_stack)

In [10]:
# Count the rows (non-null order_id values) recorded for each product, then
# turn the grouped result back into a flat two-column frame.
rows_per_product = data_all.groupby('product_id')['order_id'].count()
items_occurrence = rows_per_product.reset_index()

In [11]:
# Despite its name, the `order_id` column here holds the per-product row count
# produced by the groupby/count in the previous cell, not an order identifier.
items_occurrence #### 49685 distinct items

Unnamed: 0,product_id,order_id
0,1,1928
1,2,94
2,3,283
3,4,351
4,5,16
...,...,...
49680,49684,9
49681,49685,49
49682,49686,127
49683,49687,14


In [12]:
# Product ids whose occurrence count reaches the 50000 threshold.
is_frequent = items_occurrence['order_id'] >= 50000
items_50000 = np.array(items_occurrence.loc[is_frequent, 'product_id'])
len(items_50000) #### 63 distinct items

63

In [13]:
# Keep only the rows whose product is one of the frequent items, in each split.
train_has_frequent_item = data_train['product_id'].isin(items_50000)
test_has_frequent_item = data_test['product_id'].isin(items_50000)
data_train_subset = data_train.loc[train_has_frequent_item]
data_test_subset = data_test.loc[test_has_frequent_item]

### Only include baskets containing >= 10 of the retained (frequent) items

In [14]:
# Retained rows per training order; the grouped result is indexed by order_id.
train_basket_sizes = data_train_subset.groupby('order_id')['product_id'].count()
# Order ids whose basket still holds at least 10 rows after the item filter.
basket_10_train = np.array(train_basket_sizes[train_basket_sizes >= 10].index)
len(basket_10_train) #### 25K orders in training

24781

In [15]:
# Retained rows per test order; the grouped result is indexed by order_id.
test_basket_sizes = data_test_subset.groupby('order_id')['product_id'].count()
# Order ids whose basket still holds at least 10 rows after the item filter.
basket_10_test = np.array(test_basket_sizes[test_basket_sizes >= 10].index)
len(basket_10_test) #### 1.4K orders in test

1433

In [16]:
# Keep only rows belonging to the orders selected above.
# The explicit .copy() matters: later cells add and delete columns on these
# frames in place, and doing that on the result of a boolean-mask selection is
# the chained-assignment pattern pandas warns about (the warning is silenced
# by the global filter at the top of the notebook). Copying makes each final
# frame an independent object, so those in-place edits are well defined.
data_train_final = data_train_subset[data_train_subset['order_id'].isin(basket_10_train)].copy()
data_test_final = data_test_subset[data_test_subset['order_id'].isin(basket_10_test)].copy()

In [17]:
# Re-number each order's items 1, 2, 3, ... following add_to_cart_order.
# Dense ranking closes the gaps left by the rows removed in the item filter.
for frame in (data_train_final, data_test_final):
    cart_positions = frame.groupby('order_id')['add_to_cart_order']
    frame['rank'] = cart_positions.rank(method="dense", ascending=True)

In [18]:
# Drop the original cart position (superseded by `rank`), store the rank as an
# integer, and renumber the rows from 0.
data_train_final = (
    data_train_final
    .drop(columns='add_to_cart_order')
    .astype({'rank': int})
    .reset_index(drop=True)
)

data_test_final = (
    data_test_final
    .drop(columns='add_to_cart_order')
    .astype({'rank': int})
    .reset_index(drop=True)
)

In [19]:
# Distinct product ids in order of first appearance (training rows first).
product_ids_both_splits = pd.concat([data_train_final['product_id'], data_test_final['product_id']])
unique_product_ids = product_ids_both_splits.unique()

# Original product id -> compact 1-based id, numbered by position in the array above.
product_id_map = {prod_id: new_id for new_id, prod_id in enumerate(unique_product_ids, start=1)}

# Overwrite the product_id column of both frames with the compact ids.
for frame in (data_train_final, data_test_final):
    frame['product_id'] = frame['product_id'].map(product_id_map)

In [27]:
# Original product ids in mapping order: the id at position i of this array was
# remapped to i + 1 by product_id_map.
unique_product_ids

array([21903, 30391, 46667, 13176, 21616,  8518, 22935,  5876, 48679,
       24838, 31717, 47209, 26209, 34969, 27966, 37646, 44632, 16797,
       39275,  5077, 10749, 49235, 21137, 28204, 21938, 46979, 47626,
       44359, 34126, 28985, 24852, 41950, 30489,  9076, 24964, 45007,
       42265, 49683, 47766, 39877, 19057, 40706,  5450, 43961, 39928,
       22825, 12341, 17794,  4605, 22035, 27845, 27104, 26604,  8277,
        4920, 25890, 31506, 35951, 45066, 24184, 19660, 27086, 43352])

In [20]:
# Display the final training frame: product_id now holds the remapped 1-based id
# and rank the dense within-order position.
data_train_final

Unnamed: 0,order_id,product_id,rank
0,251,1,1
1,251,2,2
2,251,3,3
3,251,4,4
4,251,5,5
...,...,...,...
275340,3421030,36,6
275341,3421030,48,7
275342,3421030,9,8
275343,3421030,11,9


In [21]:
# Display the final test frame: product_id now holds the remapped 1-based id
# and rank the dense within-order position.
data_test_final

Unnamed: 0,order_id,product_id,rank
0,878,31,1
1,878,59,2
2,878,1,3
3,878,35,4
4,878,12,5
...,...,...,...
16061,3420286,46,6
16062,3420286,15,7
16063,3420286,36,8
16064,3420286,20,9


In [22]:
# Distinct product ids in (training, test).
tuple(len(set(frame['product_id'])) for frame in (data_train_final, data_test_final))

(63, 63)

In [23]:
# Distinct order ids in (training, test).
tuple(len(set(frame['order_id'])) for frame in (data_train_final, data_test_final))

(24781, 1433)

In [25]:
# Persist both final frames as CSV, leaving out the row index.
outputs = (
    (data_train_final, 'data_train_final.csv'),
    (data_test_final, 'data_test_final.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)