In [1]:
import os
import pickle

import numpy as np
import pandas as pd

In [2]:
class MostPopularRecommender():
    """Non-personalised baseline: recommend the most frequently ordered products.

    Products are ranked by how many order lines reference them among the
    training orders, and every user receives the same top-k list.

    Args:
        train_data: DataFrame of training orders; needs an ``order_id`` column.
        validation_data: DataFrame of validation orders. It is stored on the
            instance but not read by any method of this class.
        test_data: DataFrame of test orders; needs an ``order_id`` column.
        order_products: Optional DataFrame with ``order_id`` and
            ``product_id`` columns linking orders to products. When omitted,
            the module-level ``order_products`` variable is used, so existing
            three-argument call sites keep working.
    """

    def __init__(self, train_data, validation_data, test_data, order_products=None):
        self.train = train_data
        self.validation_data = validation_data
        self.test = test_data
        self.order_products = order_products

    def _order_lines(self):
        """Return the order/product table, preferring the one given at construction."""
        if self.order_products is not None:
            return self.order_products
        # Backward-compatible fallback: resolve the notebook-level global.
        return order_products

    def _products_by_popularity(self, orders_subset):
        """Return product ids appearing in ``orders_subset``, most ordered first."""
        merged = pd.merge(orders_subset, self._order_lines(), on='order_id')[['order_id', 'product_id']]
        return merged['product_id'].value_counts().index.values.tolist()

    @staticmethod
    def _repeat_for_users(user_ids, ranked_products, top_k, column):
        """Build a frame holding the same top-k list for every user, sorted by user_id."""
        # Materialise once so one-shot iterables (generators) are supported.
        users = list(user_ids)
        head = ranked_products[:top_k]
        frame = pd.DataFrame({
            'user_id': users,
            # Each row gets its own list object, as separate slices would.
            column: [list(head) for _ in users],
        })
        return frame.sort_values(by=['user_id'])

    def fit(self):
        """Rank products by order frequency in the training orders.

        Stores the ranking (most popular first) in ``self.product_ids``.
        """
        self.product_ids = self._products_by_popularity(self.train)

    def predict(self, user_ids, top_k):
        """Return the top-k most popular products for each user.

        Args:
            user_ids: Iterable of user ids to produce rows for.
            top_k: Number of products to recommend.

        Returns:
            DataFrame with columns ``user_id`` and ``predictions`` (a list of
            product ids), sorted by ``user_id``.

        Raises:
            AttributeError: If ``fit()`` has not been called yet.
        """
        ranked = getattr(self, 'product_ids', None)
        if ranked is None:
            raise AttributeError('product_ids is not set; call fit() before predict()')
        return self._repeat_for_users(user_ids, ranked, top_k, 'predictions')

    def get_test_products(self):
        """Return product ids ranked by order frequency in the test orders."""
        return self._products_by_popularity(self.test)

    def get_actual_results(self, user_ids, top_k):
        """Return the top-k most popular *test* products for each user.

        NOTE(review): the ``ground_truth`` list is the global test-set
        popularity ranking repeated for every user, not each user's own
        purchases. Metrics computed against it therefore compare the train
        and test popularity rankings rather than per-user accuracy.

        Args:
            user_ids: Iterable of user ids to produce rows for.
            top_k: Number of products to keep.

        Returns:
            DataFrame with columns ``user_id`` and ``ground_truth`` (a list of
            product ids), sorted by ``user_id``.
        """
        ground_truth = self.get_test_products()
        return self._repeat_for_users(user_ids, ground_truth, top_k, 'ground_truth')

In [3]:
def _split_indices(grouped_ratings, retriever):
    """Apply ``retriever`` to every group and concatenate the index arrays it returns.

    Args:
        grouped_ratings: A pandas groupby object (anything exposing ``apply``).
        retriever: Callable taking one group's DataFrame and returning an
            array of index labels.

    Returns:
        1-D numpy array of index labels, group after group.
    """
    # np.concatenate raises on an empty sequence, so short-circuit when there
    # are no groups at all (e.g. an empty orders table).
    if getattr(grouped_ratings, 'ngroups', None) == 0:
        return np.array([], dtype=np.int64)
    return np.concatenate(grouped_ratings.apply(retriever).values)


def split(orders, train_frac=0.5, validation_frac=0.25):
    """Split each user's orders into train / validation / test index arrays.

    Orders are sorted by ``order_number`` and grouped by ``user_id``; within
    each user the first ``train_frac`` of the rows go to train, the next
    ``validation_frac`` to validation, and the remainder to test. Cut points
    are truncated with ``int()``, so users with few orders can have an empty
    train or validation part.

    Args:
        orders: DataFrame with ``user_id`` and ``order_number`` columns.
        train_frac: Fraction of each user's orders used for training.
        validation_frac: Fraction of each user's orders used for validation.

    Returns:
        Tuple ``(train_indices, validation_indices, test_indices)`` of numpy
        arrays holding index labels of ``orders``.

    Raises:
        ValueError: If a fraction is negative or the two fractions exceed 1.
    """
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1:
        raise ValueError('train_frac and validation_frac must be >= 0 and sum to at most 1')

    grouper = orders.sort_values('order_number').groupby('user_id')
    validation_end_frac = train_frac + validation_frac

    def train_part(user_ratings):
        stop = int(user_ratings.shape[0] * train_frac)
        return user_ratings.iloc[:stop].index.values

    def validation_part(user_ratings):
        start = int(user_ratings.shape[0] * train_frac)
        stop = int(user_ratings.shape[0] * validation_end_frac)
        return user_ratings.iloc[start:stop].index.values

    def test_part(user_ratings):
        start = int(user_ratings.shape[0] * validation_end_frac)
        return user_ratings.iloc[start:].index.values

    train_indices = _split_indices(grouper, train_part)
    validation_indices = _split_indices(grouper, validation_part)
    test_indices = _split_indices(grouper, test_part)

    return train_indices, validation_indices, test_indices

def save_indices(train=None, validation=None, test=None):
    """Pickle the three split index collections into the working directory.

    Writes ``train_indices.pickle``, ``validation_indices.pickle`` and
    ``test_indices.pickle``.

    Args:
        train: Indices to store in ``train_indices.pickle``. Defaults to the
            module-level ``train_indices`` variable.
        validation: Indices to store in ``validation_indices.pickle``.
            Defaults to the module-level ``validation_indices`` variable.
        test: Indices to store in ``test_indices.pickle``. Defaults to the
            module-level ``test_indices`` variable.

    Raises:
        NameError: If an argument is omitted and the matching module-level
            variable does not exist.
    """
    # Resolve every fallback before touching the disk so a missing global
    # cannot leave a partially written set of files behind.
    if train is None:
        train = train_indices
    if validation is None:
        validation = validation_indices
    if test is None:
        test = test_indices

    targets = (
        ('train_indices.pickle', train),
        ('validation_indices.pickle', validation),
        ('test_indices.pickle', test),
    )
    for filename, indices in targets:
        with open(filename, 'wb') as out:
            pickle.dump(indices, out)

In [4]:
def read_data(data_dir='.'):
    """Load the product, order and order-line tables from CSV files.

    Reads ``products.csv``, ``orders.csv``, ``order_products__prior.csv`` and
    ``order_products__train.csv`` from ``data_dir``.

    Args:
        data_dir: Directory containing the CSV files. Defaults to the current
            working directory, which is where the files were read from before
            this parameter existed.

    Returns:
        Tuple ``(products_df, orders_df, order_products_df)`` where
        ``order_products_df`` is the prior rows followed by the train rows.
        The two source indexes are kept as-is, so index labels can repeat.
    """
    def load(filename):
        return pd.read_csv(os.path.join(data_dir, filename))

    products_df = load('products.csv')
    orders_df = load('orders.csv')
    order_products_prior_df = load('order_products__prior.csv')
    order_products_train_df = load('order_products__train.csv')
    order_products_df = pd.concat([order_products_prior_df, order_products_train_df])

    return products_df, orders_df, order_products_df

In [5]:
def get_split_df(orders, order_products):
    """Rebuild the train / validation / test order frames from the pickled indices.

    Loads ``train_indices.pickle``, ``validation_indices.pickle`` and
    ``test_indices.pickle`` from the working directory and selects the
    matching rows of ``orders`` by label.

    Args:
        orders: DataFrame whose index labels the pickled indices refer to.
        order_products: Accepted for interface compatibility; not used.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.
    """
    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Load all three index sets first, then slice.
    train_labels = _unpickle('train_indices.pickle')
    validation_labels = _unpickle('validation_indices.pickle')
    test_labels = _unpickle('test_indices.pickle')

    return (
        orders.loc[train_labels],
        orders.loc[validation_labels],
        orders.loc[test_labels],
    )

In [6]:
# AP@k
def apk(actual, predicted, k=10):
    """Compute average precision at k for a single ranked prediction list.

    Args:
        actual: Collection of relevant items. ``None`` or an empty collection
            yields a score of 0.0.
        predicted: Ranked sequence of predicted items, best first. Only the
            first ``k`` entries are scored; repeated predictions are counted
            once, at their first position.
        k: Maximum number of predictions to consider; must be at least 1.

    Returns:
        Average precision at k as a float in [0, 1].

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')

    # Check emptiness up front and by length, so None and numpy arrays
    # (whose truth value is ambiguous) are handled without reaching the loop.
    if actual is None or len(actual) == 0:
        return 0.0

    predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Credit a hit only the first time an item appears in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

# MAP@K
def mapk(actual, predicted, k=10):
    """Compute mean average precision at k over paired actual/predicted lists.

    ``actual`` and ``predicted`` are walked in lockstep with ``zip``, so any
    surplus entries in the longer one are ignored.

    Args:
        actual: Sequence of relevant-item collections, one per query.
        predicted: Sequence of ranked prediction lists, one per query.
        k: Cutoff passed to ``apk`` for every pair.

    Returns:
        The numpy mean of the per-pair ``apk`` scores.
    """
    per_query_scores = []
    for relevant, ranking in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranking, k))
    return np.mean(per_query_scores)

In [7]:
# Load the raw tables from the CSV files read by read_data().
products, orders, order_products = read_data()

# Compute the per-user split (ordered by order_number) and persist it.
# save_indices() reads these three module-level names, so they must be
# assigned before it is called.
train_indices, validation_indices, test_indices = split(orders)
save_indices()

In [8]:
# Rebuild the three order subsets from the pickled index files.
# Note: get_split_df() accepts order_products but does not use it.
train, valid, test = get_split_df(orders, order_products)

In [9]:
# Fit the popularity baseline. fit() looks up the module-level
# `order_products` table, so the read_data() cell must have run first.
mpr = MostPopularRecommender(train, valid, test)
mpr.fit()

In [10]:
# Evaluate for every distinct user id present in the orders table.
users = orders['user_id'].unique()
# Number of products to recommend per user.
top_k = 30

In [11]:
# Line up predictions and ground truth per user_id.
# NOTE(review): get_actual_results() returns the same list (the most ordered
# test products) for every user, so the score computed from these lists
# compares the train and test popularity rankings rather than per-user hits.
merged = mpr.predict(users, top_k).merge(mpr.get_actual_results(users, top_k), on='user_id')

actual = merged['ground_truth'].tolist()
predicted = merged['predictions'].tolist()

In [12]:
# Score with the same cutoff that was used to build the lists, so changing
# top_k in one place keeps the metric and the recommendations in sync.
mapk(actual, predicted, top_k)

0.7588804432393893