In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the six CSV tables from the current working directory.
# Files are read one after another in the order listed; each resulting
# DataFrame is bound to the name at the same position in the target tuple.
(
    aisles,
    orders,
    departments,
    products,
    order_products_prior,
    order_products_train,
) = map(pd.read_csv, (
    'aisles.csv',
    'orders.csv',
    'departments.csv',
    'products.csv',
    'order_products__prior.csv',
    'order_products__train.csv',
))

In [7]:
def _split_indices(grouped_ratings, retriever):
    """Apply ``retriever`` to every group and flatten the results.

    ``grouped_ratings`` is any object exposing ``.apply`` (a pandas GroupBy in
    this notebook); ``retriever`` receives one group at a time and returns an
    array of index values for that group. The per-group arrays are joined
    end to end into a single 1-D array, which is returned.
    """
    per_group_arrays = grouped_ratings.apply(retriever)
    return np.concatenate(per_group_arrays.values)

In [8]:
def split(orders, train_frac=0.5, validation_frac=0.25):
    """Split every user's orders chronologically into train/validation/test.

    Orders are sorted by ``order_number`` and grouped by ``user_id``. Within
    each user, the earliest ``train_frac`` share of orders goes to train, the
    next ``validation_frac`` share goes to validation, and whatever remains
    goes to test. For a user with ``n`` orders the cut points are
    ``int(n * fraction)``, i.e. they truncate toward zero; with the defaults a
    user with a single order therefore ends up entirely in the test split.

    Parameters
    ----------
    orders : pandas.DataFrame
        Must contain ``user_id`` and ``order_number`` columns.
    train_frac : float, default 0.5
        Share of each user's orders assigned to the training split.
    validation_frac : float, default 0.25
        Share of each user's orders assigned to the validation split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_indices, validation_indices, test_indices)``; each array
        holds index labels of ``orders``, concatenated across users.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1:
        raise ValueError(
            'train_frac and validation_frac must be non-negative and sum to at most 1')

    # Cumulative fraction at which the validation slice ends / test slice begins.
    validation_end = train_frac + validation_frac
    grouper = orders.sort_values('order_number').groupby('user_id')

    # All three slices use .iloc so the cut is explicitly positional,
    # independent of what the frame's index labels look like.
    train_indices = _split_indices(
        grouper,
        lambda user_orders: user_orders.iloc[
            :int(user_orders.shape[0] * train_frac)].index.values)

    validation_indices = _split_indices(
        grouper,
        lambda user_orders: user_orders.iloc[
            int(user_orders.shape[0] * train_frac):
            int(user_orders.shape[0] * validation_end)].index.values)

    test_indices = _split_indices(
        grouper,
        lambda user_orders: user_orders.iloc[
            int(user_orders.shape[0] * validation_end):].index.values)

    return train_indices, validation_indices, test_indices

In [9]:
# Build the per-user chronological split of `orders`; the three results are
# arrays of index labels for the train, validation and test portions.
(train_indices,
 validation_indices,
 test_indices) = split(orders)

In [11]:
# save results
import pickle

# Write each index array to its own pickle file, in train / validation / test order.
for _path, _indices in (
    ('train_indices.pickle', train_indices),
    ('validation_indices.pickle', validation_indices),
    ('test_indices.pickle', test_indices),
):
    with open(_path, 'wb') as out:
        pickle.dump(_indices, out)

In [12]:
# get results
# Reload the three index arrays from their pickle files.
# NOTE: pickle.load can execute arbitrary code while deserializing, so these
# files must only ever be ones produced by this notebook (or another trusted source).
# The file handle is deliberately not called `input`: binding that name at
# notebook scope would hide the builtin input() for every later cell.
with open('train_indices.pickle', 'rb') as infile:
    train_indices = pickle.load(infile)

with open('validation_indices.pickle', 'rb') as infile:
    validation_indices = pickle.load(infile)

with open('test_indices.pickle', 'rb') as infile:
    test_indices = pickle.load(infile)

In [13]:
# AP@k
def apk(actual, predicted, k=10):
    """Average precision at k for a single ranked prediction list.

    Parameters
    ----------
    actual : sized container (list, set, numpy array, ...)
        The relevant items. Must support ``len()`` and ``in``.
    predicted : sequence
        Predicted items, best first. Only the first ``k`` are scored, and an
        item that appears more than once is credited only at its first position.
    k : int, default 10
        Cut-off rank; must be positive.

    Returns
    -------
    float
        Sum of precision-at-rank over the ranks where a hit occurs, divided by
        ``min(len(actual), k)``. Returns 0.0 when ``actual`` is ``None`` or empty.

    Raises
    ------
    ValueError
        If ``k`` is not positive (a zero cut-off would divide by zero and a
        negative one would silently drop items from the end of ``predicted``).
    """
    if k <= 0:
        raise ValueError('k must be a positive integer, got %r' % (k,))

    # Use len() rather than truthiness: `not actual` raises for numpy arrays
    # with more than one element. Checking up front also skips the loop.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a relevant item only the first time it shows up in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

In [14]:
# MAP@K
def mapk(actual, predicted, k=10):
    """Mean of ``apk`` over paired entries of ``actual`` and ``predicted``.

    ``actual`` and ``predicted`` are iterated in lockstep (pairing stops at the
    shorter of the two); each pair is scored with ``apk(..., k)`` and the
    arithmetic mean of those scores is returned via ``np.mean``.
    """
    per_query_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_query_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_query_scores)