In [1]:
import gc
import pickle

import numpy as np
import pandas as pd
# Directory prefix the source CSVs (order products, orders, products, aisles,
# departments) are read from below.
IDIR = '../input/'
# Directory prefix the derived orders / order-products CSVs are written to below.
IDIR4 = '../input4/'

# Directory prefix for the feature artifacts read below (dtypes.pickle, data.h5).
FEATURES_PATH = './features3/'

In [2]:
# Run a garbage-collection pass; the value displayed under the cell is
# gc.collect()'s return value.
gc.collect()

0

In [3]:
# Read order_products__prior.csv, requesting a compact integer dtype per column.
print('loading prior')
priors = pd.read_csv(
    IDIR + 'order_products__prior.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.uint16,
        add_to_cart_order=np.int16,
        reordered=np.int8))

# Read order_products__train.csv with the same column dtypes as the prior file.
print('loading train')
trains = pd.read_csv(
    IDIR + 'order_products__train.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.uint16,
        add_to_cart_order=np.int16,
        reordered=np.int8))

# Read orders.csv. eval_set is loaded as a categorical; days_since_prior_order
# is loaded as float32 (it is missing for a user's first order, as the
# user 1 listing further down shows).
print('loading orders')
orders = pd.read_csv(
    IDIR + 'orders.csv',
    dtype=dict(
        order_id=np.int32,
        user_id=np.int32,
        eval_set='category',
        order_number=np.int16,
        order_dow=np.int8,
        order_hour_of_day=np.int8,
        days_since_prior_order=np.float32))

# Read only the id columns of products.csv (see usecols). The dtype map lists
# exactly those columns: the previous 'order_id' entry referred to a column
# that usecols never loads, so it had no effect and has been removed.
print('loading products')
products = pd.read_csv(
    IDIR + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

# Read aisles.csv; only aisle_id gets an explicit dtype, other columns are inferred.
print('loading aisle')
aisles = pd.read_csv(
    IDIR + 'aisles.csv',
    dtype=dict(aisle_id=np.int16))

# Read departments.csv; only department_id gets an explicit dtype, other columns are inferred.
print('loading department')
departments = pd.read_csv(
    IDIR + 'departments.csv',
    dtype=dict(department_id=np.int16))
        

# Print "<label> <shape>: <comma-separated column names>" for the three main frames.
for _template, _frame in (
        ('priors {}: {}', priors),
        ('orders {}: {}', orders),
        ('train {}: {}', trains)):
    print(_template.format(_frame.shape, ', '.join(_frame.columns)))
# Do not leave the loop helpers behind in the notebook namespace.
del _template, _frame

loading prior
loading train
loading orders
loading products
loading aisle
loading department
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [4]:
# Load the saved {column name -> numpy dtype} mapping and display it.
# `pickle` is imported in the notebook's top import cell.
# NOTE: pickle.load can execute arbitrary code while unpickling, so
# dtypes.pickle must come from a trusted source (e.g. produced by this project).
with open(FEATURES_PATH + 'dtypes.pickle', 'rb') as f:
    dtype_dict = pickle.load(f)
dtype_dict

{'aisle_id': numpy.int16,
 'aisle_reorder_rate': numpy.float32,
 'dep_reorder_rate': numpy.float32,
 'department_id': numpy.int16,
 'product_id': numpy.int32,
 'product_reorder_ratio': numpy.float32,
 'up_add_to_cart_order_mean': numpy.float32,
 'up_days_since_last_order': numpy.int16,
 'up_first_order': numpy.int16,
 'up_in_same_day_previous_order': numpy.int8,
 'up_last_order': numpy.int16,
 'up_order_rate': numpy.float32,
 'up_order_rate_since_first_order': numpy.float32,
 'up_orders': numpy.int16,
 'up_orders_since_last_order': numpy.float32,
 'up_reordered': numpy.float32,
 'user_aisle_reorder_rate': numpy.float32,
 'user_days_since_last_order': numpy.float32,
 'user_dep_reorder_rate': numpy.float32,
 'user_id': numpy.int32,
 'user_order_size_mean': numpy.float32,
 'user_reorder_rate': numpy.float32,
 'user_total_order': numpy.int16}

In [5]:
# Read the "user_info" table stored in the features HDF5 file.
user_info = pd.read_hdf(FEATURES_PATH + "data.h5", key="user_info")

In [6]:
# Preview the first five rows of the per-user table.
user_info.head(n=5)

Unnamed: 0,user_id,user_total_order,user_eval_set,user_days_since_last_order,user_reorder_rate,user_order_size_mean
0,1,10,train,14.0,0.694915,5.9
1,2,14,train,30.0,0.476923,13.928572
2,3,12,test,11.0,0.625,7.333333
3,4,5,test,30.0,0.055556,3.6
4,5,4,train,6.0,0.378378,9.25


## New orders

In [7]:
# user_id values of the users whose user_total_order is greater than 3.
users4 = user_info.loc[user_info['user_total_order'] > 3, 'user_id']

In [8]:
# Build the new order table: keep only the "prior" orders of the selected
# users, then relabel each user's highest-numbered remaining order as "train".
#
# .copy() gives orders4 its own data, so the label assignment below acts on an
# independent frame rather than on a filtered slice of `orders` (this avoids
# pandas' chained-assignment / SettingWithCopyWarning ambiguity).
orders4 = orders[orders.user_id.isin(users4) & (orders.eval_set == "prior")].copy()
# Per-user maximum order_number among the retained orders (indexed by user_id).
orders4_maxorder = orders4.groupby('user_id').order_number.max()
# Compare every row with its own user's maximum directly, so no temporary
# column has to be added to orders4 and dropped again.
orders4.loc[orders4.order_number == orders4.user_id.map(orders4_maxorder), 'eval_set'] = "train"

In [13]:
# Spot-check the relabelled orders of user 1.
orders4.loc[orders4['user_id'] == 1]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,train,10,4,8,30.0


In [11]:
# Write the new order table; order_id is moved into the index so that no
# separate row-index column is emitted.
orders4.set_index(keys='order_id').to_csv(path_or_buf=IDIR4 + "orders.csv")

## New priors

In [14]:
# order_id values of the relabelled order table, split by eval_set.
orders4_prior_ids = orders4.loc[orders4['eval_set'] == 'prior', 'order_id']
orders4_train_ids = orders4.loc[orders4['eval_set'] == 'train', 'order_id']

In [15]:
# Order-product rows belonging to the orders still labelled "prior".
priors4_prior = priors.loc[priors['order_id'].isin(orders4_prior_ids)]
priors4_prior.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [19]:
# Write the new prior order-products file with (order_id, product_id) as the
# leading columns. set_index is chained rather than applied in place, so
# priors4_prior keeps the shape displayed above and this cell can be re-run
# (an in-place set_index removes the key columns and fails on a second run).
priors4_prior.set_index(['order_id', 'product_id']).to_csv(IDIR4 + "order_products__prior.csv")

In [16]:
# Order-product rows belonging to the orders relabelled as "train".
priors4_train = priors.loc[priors['order_id'].isin(orders4_train_ids)]
priors4_train.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
219,25,9755,1,1
220,25,31487,2,0
221,25,37510,3,1
222,25,14576,4,1
223,25,22105,5,0


In [20]:
# Write the new train order-products file. Without set_index, to_csv would
# emit the frame's leftover row index (219, 220, ...) as an extra unnamed
# first column, so the file would not match the prior file written above or
# the order_id/product_id/add_to_cart_order/reordered schema that the train
# CSV is loaded with at the top of this notebook.
priors4_train.set_index(['order_id', 'product_id']).to_csv(IDIR4 + "order_products__train.csv")