# Preamble

In [1]:
% pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import scipy.sparse as ss

In [3]:
import itertools
import os
import pandas as pd

In [4]:
import sklearn.metrics as skm

from sklearn.model_selection import train_test_split

In [5]:
from code.constants import PROJECT_ROOT
from code.helpers import _FILES

In [6]:
from tqdm import tqdm as tqdm

# Load data

In [7]:
# Read the two order-line tables out of the same HDF5 store, one key each.
orders_prior, orders_train = (
    pd.read_hdf('../data/data.hd5', table_key)
    for table_key in ('orders_prior', 'orders_train')
)

In [8]:
# Order-level metadata; later cells read its order_id, user_id and eval_set columns.
orders_meta = pd.read_csv(filepath_or_buffer=_FILES['orders_meta'])

In [9]:
# Example submission file.
# NOTE(review): the key is spelled 'samples_submission' (plural) while the variable is
# singular -- confirm it matches the key declared in code.helpers._FILES.
sample_submission = pd.read_csv(filepath_or_buffer=_FILES['samples_submission'])

In [10]:
# Product catalogue; later cells read its product_id, aisle_id and department_id columns.
products = pd.read_csv(filepath_or_buffer=_FILES['products'])

#### Attach product info to orders

In [11]:
# Tag every order line with the aisle and department of its product.
# The merge key is left implicit (the columns both frames share), so re-running
# this cell joins on all three columns and does not create suffixed duplicates.
orders_prior, orders_train = (
    order_lines.merge(products[['product_id', 'aisle_id', 'department_id']])
    for order_lines in (orders_prior, orders_train)
)

#### Gather counts by user

In [12]:
# Map every order to its user, together with that user's non-'prior' eval_set
# label(s) as found in orders_meta. The join uses the shared column(s).
orders_by_user = pd.merge(
    orders_meta[['order_id', 'user_id']],
    orders_meta.loc[orders_meta['eval_set'].ne('prior'), ['user_id', 'eval_set']]
    .drop_duplicates(),
)

In [13]:
# Number of distinct prior orders per user.
# Only the set of order ids matters for this count, so collapse the order-line
# table to one row per order *before* the join. The join then processes one row
# per order instead of one row per (order, product) line; the resulting counts
# are identical to joining first and de-duplicating afterwards.
num_orders_by_user = (orders_prior[['order_id']]
                      .drop_duplicates()
                      .merge(orders_by_user, how='left')
                      [['order_id', 'user_id']]
                      .drop_duplicates()  # guards against repeated rows in orders_by_user
                      .groupby(['user_id']).size()
                      .to_frame('num_orders'))

In [14]:
# For each (user, eval_set, product): in how many distinct prior orders did the
# product appear?
prior_orders_by_user_product = (
    pd.merge(orders_prior[['order_id', 'product_id']], orders_by_user, how='left')
    .loc[:, ['user_id', 'eval_set', 'product_id', 'order_id']]
    .drop_duplicates()
    .groupby(['user_id', 'eval_set', 'product_id'])
    .size()
    .rename('num_times_ordered')
    .to_frame()
)

In [15]:
# For each (user, eval_set, aisle): in how many distinct prior orders did the
# aisle appear?
# An order usually contains several lines from the same aisle, so reduce the
# table to one row per (order, aisle) *before* joining on the user. This keeps
# the join small (the join-then-deduplicate version of this cell had to be
# interrupted) and produces exactly the same counts.
prior_orders_by_user_aisle = (orders_prior[['order_id', 'aisle_id']]
                              .drop_duplicates()
                              .merge(orders_by_user, how='left')
                              [['user_id', 'eval_set', 'aisle_id', 'order_id']]
                              .drop_duplicates()  # guards against repeated rows in orders_by_user
                              .groupby(['user_id', 'eval_set', 'aisle_id'])
                              .size()
                              .to_frame('num_times_ordered'))

KeyboardInterrupt: 

In [None]:
# For each (user, eval_set, department): in how many distinct prior orders did
# the department appear?
# Reduce the table to one row per (order, department) *before* joining on the
# user, so the join does not process every individual order line. The counts
# are identical to joining first and de-duplicating afterwards.
prior_orders_by_user_department = (orders_prior[['order_id', 'department_id']]
                                   .drop_duplicates()
                                   .merge(orders_by_user, how='left')
                                   [['user_id', 'eval_set', 'department_id', 'order_id']]
                                   .drop_duplicates()  # guards against repeated rows in orders_by_user
                                   .groupby(['user_id', 'eval_set', 'department_id'])
                                   .size()
                                   .to_frame('num_times_ordered'))

In [None]:
# Join each user's per-product order counts with their total number of prior
# orders, then express the count as a fraction of those orders.
prior_order_counts_product = (
    pd.merge(prior_orders_by_user_product.reset_index(),
             num_orders_by_user.reset_index())
    .assign(frac_times_ordered=lambda counts:
            counts['num_times_ordered'].div(counts['num_orders']))
)

In [None]:
# Join each user's per-aisle order counts with their total number of prior
# orders, then express the count as a fraction of those orders.
prior_order_counts_aisle = (
    pd.merge(prior_orders_by_user_aisle.reset_index(),
             num_orders_by_user.reset_index())
    .assign(frac_times_ordered=lambda counts:
            counts['num_times_ordered'].div(counts['num_orders']))
)

In [None]:
# Join each user's per-department order counts with their total number of prior
# orders, then express the count as a fraction of those orders.
prior_order_counts_department = (
    pd.merge(prior_orders_by_user_department.reset_index(),
             num_orders_by_user.reset_index())
    .assign(frac_times_ordered=lambda counts:
            counts['num_times_ordered'].div(counts['num_orders']))
)

# Baselines

**Note:** this section was abandoned halfway through, since it is not obvious what a baseline means in these cases.

## Create the test matrix

In [None]:
def create_sparse(orders, products, values, shape, order_ix_map=None):
    """Build a sparse (order x product) matrix from parallel id sequences.

    Parameters
    ----------
    orders : iterable
        Order id of every entry; each id is translated to a row through
        ``order_ix_map``.
    products : iterable of int
        Product id of every entry, parallel to ``orders``. Ids are taken to be
        contiguous and to start at 1, so column = ``product_id - 1``.
    values : scalar or array-like
        Value of every entry. Any numeric scalar (Python or numpy, int or
        float) is broadcast to all entries as a float; an array-like is used
        as given and must be parallel to ``orders``.
    shape : tuple of int
        ``(n_rows, n_cols)`` of the resulting matrix.
    order_ix_map : dict, optional
        Mapping order id -> row index. When missing (``None``) or empty, a map
        is built from ``orders``, numbering the ids in order of first
        appearance so that the row layout is reproducible.

    Returns
    -------
    (scipy.sparse.csc_matrix, dict)
        The matrix and the order id -> row index map that was used.

    Raises
    ------
    KeyError
        If a supplied ``order_ix_map`` lacks one of the ids in ``orders``.

    Notes
    -----
    Repeated (order, product) pairs are summed by the sparse constructor.
    """
    if not order_ix_map:
        # dict.fromkeys keeps first-seen order, unlike iterating over a set.
        order_ix_map = {order: ix for ix, order in enumerate(dict.fromkeys(orders))}

    sparse_rows = [order_ix_map[order] for order in orders]
    sparse_cols = [product - 1 for product in products]  # product ids start at 1

    if np.isscalar(values):
        # Size the fill from the entries themselves so plain lists (which have
        # no ``.shape``) work too; float matches the previous ``np.ones`` dtype.
        values = np.full(len(sparse_rows), values, dtype=float)

    return ss.csc_matrix((values, (sparse_rows, sparse_cols)), shape=shape), order_ix_map

In [None]:
# Keep only the train order lines flagged as reorders; these form the target.
target_train = orders_train.loc[orders_train['reordered'].eq(1)]

In [None]:
# Give every order whose eval_set is 'train' a row index in the target matrix.
order_ix_map = {
    order_id: row_ix
    for row_ix, order_id in enumerate(
        set(orders_meta.loc[orders_meta['eval_set'].eq('train'), 'order_id'])
    )
}

In [None]:
# Indicator matrix of reordered products: one row per train order (via
# order_ix_map), one column per product. The returned map is the one passed in,
# so it is discarded.
sparse_target_train, _ = create_sparse(
    orders=target_train['order_id'],
    products=target_train['product_id'],
    values=1,
    shape=(len(order_ix_map), products.shape[0]),
    order_ix_map=order_ix_map,
)

## Define some additional useful things

In [None]:
# (order_id, user_id) pairs of the orders whose eval_set is 'train'.
train_order_ids = orders_meta.loc[orders_meta['eval_set'].eq('train'), ['order_id', 'user_id']]

## Using aisle

In [None]:
# Per-aisle weight: the reciprocal of the number of products listed in the aisle.
aisle_multiplier = (
    products.groupby('aisle_id')
    .size()
    .rdiv(1.)
    .rename('aisle_multiplier')
    .reset_index()
)

In [None]:
# Per-user product ordering fractions for 'train' users, attached to each
# user's train order id.
# `prior_order_counts` is not defined in any visible cell (NameError); of the
# three count tables built above, only the per-product one has the `product_id`
# column selected here, so that is the table used.
# NOTE(review): `aisle_multiplier` is not applied in this cell yet -- confirm
# whether the aisle-based weighting was meant to be folded in here.
train_order_preds = (prior_order_counts_product[lambda _df: _df['eval_set'] == 'train'][['user_id', 'product_id', 'frac_times_ordered']]
                     .merge(train_order_ids))

In [None]:
# Preview: per-user aisle fractions joined with the aisle weight, then with the
# product table on the column(s) they share.
(prior_order_counts_aisle
 .pipe(pd.merge, aisle_multiplier)
 .pipe(pd.merge, products)
 .head())

## Using department

In [None]:
# Per-user product ordering fractions for 'train' users, attached to each
# user's train order id.
# `prior_order_counts` is not defined in any visible cell (NameError); of the
# three count tables built above, only the per-product one has the `product_id`
# column selected here, so that is the table used.
# NOTE(review): nothing department-specific is applied in this cell yet --
# confirm how `prior_order_counts_department` was meant to be used here.
train_order_preds = (prior_order_counts_product[lambda _df: _df['eval_set'] == 'train'][['user_id', 'product_id', 'frac_times_ordered']]
                     .merge(train_order_ids))