In [1]:
import pandas as pd
import scipy.sparse
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_distances
from sklearn.decomposition import TruncatedSVD

In [2]:
def load_data():
    with open('train_indices.pickle', 'rb') as input:
        train_indices = pickle.load(input)
    with open('test_indices.pickle', 'rb') as input:
        test_indices = pickle.load(input)
    with open('validation_indices.pickle', 'rb') as input:
        validation_indices = pickle.load(input)
        
    orders = pd.read_csv('orders.csv')[['user_id', 'order_id']]
    train_ids = orders.loc[orders.index.isin(train_indices)]['order_id']
    test_ids = orders.loc[orders.index.isin(test_indices)]['order_id']    
    validation_ids = orders.loc[orders.index.isin(validation_indices)]['order_id']
    
    order_products_prior = pd.read_csv('order_products__prior.csv')[['product_id', 'order_id']]
    orders = pd.merge(order_products_prior, orders, on = 'order_id')
    
    orders_train = orders[orders['order_id'].isin(train_ids)].drop('order_id', 1).drop_duplicates()
    users = orders_train['user_id'].values
    items = orders_train['product_id'].values
    user_item = scipy.sparse.csc_matrix((np.ones(orders_train.shape[0]), (users - 1, items - 1)))
    
    train_data = {key: [] for key in orders['user_id'].unique() - 1}
    test_data = {key: [] for key in orders['user_id'].unique() - 1}
    validation_data = {key: [] for key in orders['user_id'].unique() - 1}
    
    for data, indices in zip([train_data, test_data, validation_data], [train_ids, test_ids, validation_ids]):
        orders_tmp = orders[orders['order_id'].isin(indices)].drop('order_id', 1).drop_duplicates()
        for user, item in zip(orders_tmp['user_id'] - 1, orders_tmp['product_id'] - 1):
            data[user].append(item)
    
    return user_item, train_data, test_data, validation_data

In [3]:
def get_svd(user_item, k):
    svd = TruncatedSVD(n_components=k)
    US = svd.fit_transform(user_item)
    Vt = svd.components_
    return US, Vt, svd.explained_variance_ratio_.sum()

In [4]:
def get_bought_items(user_item):
    bought_items = {key: [] for key in range(user_item.shape[0])}
    for user, item in zip(*user_item.nonzero()):
        bought_items[user].append(item)
    return bought_items

In [5]:
def get_top_k_recommendations_for_user(user_id, k, item_vector, bought_items):
    item_mean = np.mean(item_vector[:, bought_items[user_id]], 1)
    distance = cosine_distances([item_mean], item_vector.T)
    recommendation = np.argsort(distance)[0]
#     recommendation = recommendation[~np.in1d(recommendation, bought_items[user_id])]
    return recommendation[:k]

In [6]:
def recommend(rec_number, components_number, user_number=None):
    user_item, train_data, test_data, validation_data = load_data()
    print(str(user_item.shape[0]) + ' users, ' + str(user_item.shape[1]) + ' items')
    bought_items = get_bought_items(user_item)
    
    user_vector, item_vector, explained_variance = get_svd(user_item, components_number)
    print('SVD explained variance = ' + str(explained_variance))
    
    user_num = user_number or user_item.shape[0]
    recommendations = {key: [] for key in range(user_num)}
    for user_id in range(user_num):
        recommendations[user_id] = get_top_k_recommendations_for_user(user_id, rec_number, item_vector, bought_items)

    mapk_value = mapk(list(train_data.values())[:user_num], recommendations.values(), rec_number)
    print('mapk on train data ' + str(mapk_value))
    mapk_value = mapk(list(validation_data.values())[:user_num], recommendations.values(), rec_number)
    print('mapk on validation data ' + str(mapk_value))
    mapk_value = mapk(list(test_data.values())[:user_num], recommendations.values(), rec_number)
    print('mapk on test data ' + str(mapk_value))
    
    return recommendations

In [34]:
predicted = recommend(10, 200, 20000)

206209 users, 49688 items
SVD explained variance = 0.25253426977667354
mapk on train data 0.01779862819664903
mapk on validation data 0.012928296619110609
mapk on test data 0.013361950688145628


In [11]:
predicted = recommend(10, 2000, 100)

206209 users, 49688 items
SVD explained variance = 0.614775192017869
mapk on train data 0.018007142857142857
mapk on validation data 0.010964285714285715
mapk on test data 0.008951190476190475


In [7]:
user_item, train_data, test_data, validation_data = load_data()
svd = TruncatedSVD(n_components=2000)
user_vector = svd.fit_transform(user_item)
item_vector = svd.components_

print('SVD explained variance = ' + str(svd.explained_variance_ratio_.sum()))

with open('user_vectors_2000.pickle', 'wb') as out:
    pickle.dump(user_vector, out)
with open('item_vector_2000.pickle', 'wb') as out:
    pickle.dump(item_vector, out)

SVD explained variance = 0.6147787835100156
