# Data pre-processing

In [None]:
import numpy as np
import pandas as pd
# Read datasets: play_ds, down_ds
df_play = pd.read_csv('../data/play_ds.csv', encoding = "latin1")
df_down = pd.read_csv('../data/down_ds.csv')

# Select relevant features
df_play = df_play[['uid','song_id','play_time']]
df_down = df_down[['uid','song_id','date']]

# Feature types conversion: play data
df_play.play_time = pd.to_numeric(df_play.play_time, errors='coerce')
df_play.song_id = pd.to_numeric(df_play.song_id, errors='coerce')

# Dropna
df_play.dropna(inplace=True)

# Add up play freqs
df_play_2 = df_play.groupby(['uid','song_id']).sum()

# Drop infrequently played songs: bucket level=5
dropped = df_play_2[df_play_2.play_time<5]
df_play_2 = df_play_2.drop(dropped.index)
df_play_2.reset_index(inplace=True)

# Feature types conversion: download data
df_down.uid = pd.to_numeric(df_down.uid, errors='coerce')
df_down.song_id = pd.to_numeric(df_down.song_id, errors='coerce')
df_down.date = pd.to_datetime(df_down.date, errors='coerce')

# dropna
df_down.dropna(inplace=True)

# Add up download freqs
df_down_2 = df_down.groupby(['uid','song_id']).count()

# Drop infrequently downloaded songs: bucket level=2
df_down_2 = df_down_2[df_down_2.date>1]
df_down_2.reset_index(inplace=True)

# Rename columns
df_play_2.rename(columns={'play_time':'freq'},inplace=True)
df_down_2.rename(columns={'date':'freq'},inplace=True)

# Concat the two dataframes
df_result = pd.concat([df_play_2,df_down_2])

# Add up play and download freqs
df_result = df_result.groupby(['uid','song_id']).sum().reset_index()

# Set the implicit ratings
df_result.rename(columns={'freq':'im_rating'},inplace=True)
df_result.im_rating = 1

# Re-encode uid, song_id
df_result.uid = df_result.uid.astype(str)
df_result.song_id = df_result.song_id.astype(str)
df_result.uid = pd.Categorical(df_result.uid).codes
df_result.song_id = pd.Categorical(df_result.song_id).codes

# Build the ratings matrix

In [10]:
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time
highest_user_id = df_result.uid.max()
highest_song_id = df_result.song_id.max()
ratings_mat = sparse.lil_matrix((highest_user_id, highest_song_id))
ratings_mat

<61574x374635 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [11]:
for _, row in df_result.iterrows():
    ratings_mat[row.uid-1, row.song_id-1] = 1

In [12]:
utility_mat = ratings_mat

## popularity-based recommender

In [7]:
df_pr = df_result.sort_values(by=['im_rating'], ascending=False)

In [9]:
# Top 10 songs
recommend_songs = df_pr.song_id.values[:10]
recommend_songs

array([301934,      1, 277882, 143431, 296477,      0, 244773, 240054,
       163227,  71599], dtype=int64)

## item-item based recommender

In [None]:
# Item-Item Similarity Matrix
item_sim_mat = cosine_similarity(utility_mat.T)
least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)

# Neighborhoods
neighborhood_size = 75
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [None]:
# Let's pick a lucky user
user_id = 100
n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]

start_time = time()
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    out[item_to_rate] = ratings_mat[user_id, relevant_items] * \
        item_sim_mat[item_to_rate, relevant_items] / \
        item_sim_mat[item_to_rate, relevant_items].sum()


pred_ratings = np.nan_to_num(out)
print(pred_ratings)

In [None]:
# Recommend n songs
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

# Find items that have been rated by user
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

## matrix factorization-based recommender

In [39]:
from sklearn.decomposition import TruncatedSVD

def fit_uvd(M,k):
    svd = TruncatedSVD(n_components=k, n_iter=10, random_state=0)
    svd.fit(M)

    V = svd.components_
    U = svd.transform(M)
    return U,V, svd

# decompose
U,V,svd = fit_uvd(ratings_mat,200)
print(U.shape,V.shape)

(61574, 200) (200, 374635)


In [None]:
# reconstruct
ratings_mat_fitted = U.dot(V) # U*V
errs = np.array((ratings_mat-ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat.todense()).flatten()).squeeze()>0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()
print(mse)
print(average_abs_err)

In [41]:
# get recommendations for one user
user_id = 100
n = 10

pred_ratings = np.squeeze(np.asarray(ratings_mat[user_id,:].todense()))
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[124874,
 124869,
 124863,
 124864,
 124865,
 124866,
 124867,
 124868,
 124870,
 124873]