In [1]:
import pandas as pd
import numpy as np
import pickle
from most_popular_recommender import *
from metrics import mapk

In [2]:
def _split_indices(grouped_ratings, retriever):
    return np.concatenate(grouped_ratings.apply(retriever).values)

def split(orders, train_frac=0.5, validation_frac=0.25):
    """Split every user's orders chronologically into train/validation/test.

    Orders are sorted by ``order_number`` and grouped by ``user_id``. Within
    each user, the first ``train_frac`` of the rows go to train, the next
    ``validation_frac`` to validation, and whatever is left to test. Cut
    points are truncated with ``int``, so a user with a single order
    contributes only to the test split.

    Parameters
    ----------
    orders : pandas.DataFrame
        Must contain ``user_id`` and ``order_number`` columns.
    train_frac : float, default 0.5
        Share of each user's orders assigned to the train split.
    validation_frac : float, default 0.25
        Share of each user's orders assigned to the validation split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_indices, validation_indices, test_indices)``, each holding
        index labels of ``orders``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1:
        raise ValueError(
            'train_frac and validation_frac must be non-negative and sum to at most 1')

    validation_stop_frac = train_frac + validation_frac
    grouper = orders.sort_values('order_number').groupby('user_id')

    def _labels_between(start_frac, stop_frac):
        # Build a retriever returning the index labels of the rows located
        # between the two fractional positions of one user's ordered rows.
        def retriever(user_orders):
            n_orders = user_orders.shape[0]
            start = int(n_orders * start_frac)
            stop = int(n_orders * stop_frac)
            # Always slice positionally; the row labels are what gets returned.
            return user_orders.iloc[start:stop].index.values
        return retriever

    train_indices = _split_indices(grouper, _labels_between(0.0, train_frac))
    validation_indices = _split_indices(
        grouper, _labels_between(train_frac, validation_stop_frac))
    test_indices = _split_indices(grouper, _labels_between(validation_stop_frac, 1.0))

    return train_indices, validation_indices, test_indices

def save_indices(train=None, validation=None, test=None):
    """Pickle the train/validation/test index arrays into the working directory.

    Writes ``train_indices.pickle``, ``validation_indices.pickle`` and
    ``test_indices.pickle``.

    Parameters
    ----------
    train, validation, test : array-like, optional
        Index labels for each split. An argument left as ``None`` falls back
        to the notebook-level ``train_indices`` / ``validation_indices`` /
        ``test_indices`` variable, so the argument-less call keeps working.

    Raises
    ------
    NameError
        If an argument is omitted and the matching notebook variable is not
        defined.
    """
    # Resolve every payload before touching the disk, so a missing notebook
    # variable fails before any file has been written.
    payloads = (
        ('train_indices.pickle', train_indices if train is None else train),
        ('validation_indices.pickle',
         validation_indices if validation is None else validation),
        ('test_indices.pickle', test_indices if test is None else test),
    )

    for filename, indices in payloads:
        with open(filename, 'wb') as out:
            pickle.dump(indices, out)

In [3]:
def read_data(data_dir='.'):
    """Load the product, order and order-line CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory containing ``products.csv``, ``orders.csv``,
        ``order_products__prior.csv`` and ``order_products__train.csv``.
        The default reads from the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(products_df, orders_df, order_products_df)``. The last frame is
        the prior file followed by the train file; each part keeps its own
        row labels, so the combined index can contain duplicates.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    def _load(filename):
        # Read one CSV located inside data_dir.
        return pd.read_csv(os.path.join(data_dir, filename))

    products_df = _load('products.csv')
    orders_df = _load('orders.csv')
    order_products_prior_df = _load('order_products__prior.csv')
    order_products_train_df = _load('order_products__train.csv')
    order_products_df = pd.concat([order_products_prior_df, order_products_train_df])

    return products_df, orders_df, order_products_df

In [4]:
def get_split_df(orders, order_products=None):
    """Slice ``orders`` into train/validation/test frames using the saved indices.

    Reads ``train_indices.pickle``, ``validation_indices.pickle`` and
    ``test_indices.pickle`` from the working directory and selects the
    matching rows of ``orders`` by label.

    Parameters
    ----------
    orders : pandas.DataFrame
        Frame whose index labels the pickled arrays refer to.
    order_products : object, optional
        Not used by this function; accepted only so existing two-argument
        calls keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, validation_df, test_df)``.
    """
    def _load_indices(filename):
        # NOTE: unpickling can execute arbitrary code; only load index files
        # that come from a trusted source.
        with open(filename, 'rb') as fh:
            return pickle.load(fh)

    train_indices = _load_indices('train_indices.pickle')
    validation_indices = _load_indices('validation_indices.pickle')
    test_indices = _load_indices('test_indices.pickle')

    train_df = orders.loc[train_indices]
    validation_df = orders.loc[validation_indices]
    test_df = orders.loc[test_indices]

    return train_df, validation_df, test_df

In [5]:
def get_actual_results(test_df=None, order_products_df=None):
    """Build the per-user ground truth: distinct products bought in the test orders.

    Parameters
    ----------
    test_df : pandas.DataFrame, optional
        Orders of the test split; needs ``order_id`` and ``user_id``. When
        omitted, the notebook-level ``test`` variable is used.
    order_products_df : pandas.DataFrame, optional
        Order lines; needs ``order_id`` and ``product_id``. When omitted, the
        notebook-level ``order_products`` variable is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``ground_truth`` (a list of distinct product
        ids, in no particular order), sorted by ``user_id``. The default
        inner merge drops test orders that have no order lines.

    Raises
    ------
    NameError
        If an argument is omitted and the matching notebook variable is not
        defined.
    """
    # Fall back to the notebook variables so the argument-less call still works.
    test_orders = test if test_df is None else test_df
    order_lines = order_products if order_products_df is None else order_products_df

    user_products = pd.merge(test_orders, order_lines, on='order_id')[['user_id', 'product_id']]
    actual_df = (user_products.groupby('user_id')['product_id']
                 .apply(lambda x: list(set(x)))
                 .reset_index()
                 .rename(columns={'product_id': 'ground_truth'}))

    return actual_df.sort_values(by=['user_id'])

In [6]:
# Load the product, order and order-line tables from the CSV files.
products, orders, order_products = read_data()

In [7]:
# Compute the per-user chronological split, then persist it as pickles.
# save_indices() is called without arguments, so it picks up the three
# notebook-level names assigned on the line above.
train_indices, validation_indices, test_indices = split(orders)
save_indices()

In [8]:
# Reload the pickled index arrays and slice `orders` into the three partitions.
train, valid, test = get_split_df(orders, order_products)

In [9]:
# Build and fit the baseline recommender.
# NOTE(review): MostPopularRecommender presumably comes from the star import of
# most_popular_recommender — confirm.
mpr = MostPopularRecommender(train, valid, test, orders, order_products)
mpr.fit()

In [10]:
# Every distinct user id in the full orders table; top_k is passed to
# mpr.predict in the following cells.
users = orders['user_id'].unique()
top_k = 10

In [11]:
# Display the prediction frame (one row per user) as the cell output.
mpr.predict(users, top_k)

Unnamed: 0,user_id,predictions
0,1,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
1,2,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
2,3,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
3,4,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
4,5,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
...,...,...
206204,206205,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
206205,206206,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
206206,206207,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."
206207,206208,"[24852, 13176, 21137, 21903, 47209, 47766, 262..."


In [12]:
# Join predictions with the test-set ground truth on user_id. pd.merge defaults
# to an inner join, so users missing from either side are dropped.
merged = pd.merge(mpr.predict(users, top_k), get_actual_results(), on='user_id')

In [13]:
# Row-aligned lists: entry i of each list belongs to the same user.
actual = merged['ground_truth'].tolist()
predicted = merged['predictions'].tolist()

In [14]:
# Score the predictions against the ground truth.
# NOTE(review): the cutoff is hard-coded to 30 while the prediction lists were
# requested with top_k = 10 — confirm this mismatch is intended.
mapk(actual, predicted, 30)

0.0376583726437911