# Recommedation System with Neighborhood Model


Load the module neighborhood_model. See neighborhood_model.py
1. Read in the user-song pair data. 
2. Tune the hyperparameter for the neighborhood model. 
3. Make recommendation based on the model. The recommendation can be made for both user in the read-in data and out.
4. Showcase for the recommnedation system. Randomly select 10 users. Show the songs they have listened before and the songs we recommend to them.

## Read in data

In [1]:
from neighborhood_model import *
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import load_npz
import scipy.sparse as sp
%load_ext autoreload
%autoreload 2
def load_data(filename):
    df = pd.read_table(filename, sep='\t', names = ['user_id','song_id','playcount'])
    n_users = df.user_id.unique().shape[0] 
    n_songs = df.song_id.unique().shape[0]
    ratings = np.zeros((n_users, n_songs))
    df['user_id'] = df['user_id'].astype('category')
    df['song_id'] = df['song_id'].astype('category')
    parsed_matrix = coo_matrix((df['playcount'].astype(float),(df['user_id'].cat.codes, df['song_id'].cat.codes))).tolil()
    return parsed_matrix

#obtain a subset of the data
def data_sampling(data, num_users,num_songs):
    num_instances, num_features = data.shape[0], data.shape[1]
    #sample users from the data
    sample_user_index = np.random.choice(num_instances, num_users, replace=False)
    #sample songs from the data
    sample_song_index = np.random.choice(num_features, num_songs, replace=False)
    samples = data[sample_user_index,:]
    samples = samples[:, sample_song_index,]
    return samples

#compute the inverse user frequency of the data reduce weights for commonly occurring songs
def ivf(x):
    num_user = x.shape[0]
    binary_data = x.copy()
    binary_data[x != 0] = 1
    nj = np.asarray(binary_data.sum(axis = 0)).squeeze()

    fj =np.log(num_user/nj)
    for i in range(num_user):
        x[i,:] = x[i,:].multiply(fj)
    return x

#produce the inverse user frequency feature if needed, and normalize the data
def prep2(delete_user_song, if_ivf):
    if if_ivf == 1:
    #compute ivf of delete_user_song_ivf
        delete_user_song = ivf(delete_user_song)
    user_song_normalized = lil_matrix(normalize(delete_user_song, axis=1),dtype = np.float64)
    return user_song_normalized

def train_test_split(data, size):
#data should be ndarray format
    test = np.zeros(data.shape)
    train = data.copy()
    for user in range(data.shape[0]):
        test_index = np.random.choice(data[user, :].nonzero()[0], 
                                        size=size, 
                                        replace=False)
        train[user, test_index] = 0.
        test[user, test_index] = data[user, test_index]
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    train = lil_matrix(train,dtype = np.float64)
    test = lil_matrix(test,dtype = np.float64)
    return train, test

def main_2(num_users, num_songs, thres, rho, user_id, num_recommend, if_ivf = 0):
    user_song_matrix = load_npz('./sparse_matrix.npz')
    raw_data = data_sampling(user_song_matrix, num_users, num_songs)
    data  = prep(raw_data,thres)
    model = NeighborhoodModel(rho)
    model.fit(data)
    #print(model.recommend(user_id,num_recommend))
    return model.evaluate()

def main(data, train, test, thres, rho, user_id, num_recommend, if_ivf = 0):
    model = NeighborhoodModel(data, train, test)
    model.fit(rho)
    print(model.recommend(user_id,num_recommend))
    print(model.evaluate())
    user_pref = np.zeros((1, data.shape[1]))
    user_pref[0,0], user_pref[0,22], user_pref[0,222], user_pref[0,2222] = 2, 3, 5, 6
    print(model.recommend_ind(user_pref, 3))
    return 1

def run(train, test, rho):
    model = NeighborhoodModel()
    model.fit(train, test, rho)
    return model.evaluate()



In [2]:
f = './sample_matrix.npz'
#read in the data produced by preprocessing.py
raw_data = sp.load_npz(f)

## Parameter Tuning

In [3]:
import operator

def parameter_tuning(raw_data, test_size, rho_range = [1,1.5, 1.999, 2.5, 3]):
    '''
    Tune two parameter: 
    if_ivf: whether to use Inverse User Frequency or original data feature
    rho: case amplificatio parameter
    Inputs:
        - raw_data: a sparse matrix of size(#users, #songs)
        - test_size: scalar, percent of obs used for testing
        - rho_range: list of rhos used for parameter tuning
    Output:
        - best_para: tuple of best parameter
    '''
    result = {}
    for if_ivf in [0,1]:
        data = prep2(raw_data.copy(), if_ivf = if_ivf)
        train, test = train_test_split(data.toarray(), test_size)
        for rho in [1,1.5, 1.999, 2.5, 3]:
            result[(rho,if_ivf)] = run(train, test, rho)

    for key in result:
        print(str(key) + ': ' + str(result[key]))
    best_para = min(result.items(), key=operator.itemgetter(1))[0]
    print('best parameter is rho = ' + str(best_para[0]) + ' if_ivf = ' + str(best_para[1]))
    return best_para

In [4]:
best_para = parameter_tuning(raw_data, test_size = 20,rho_range = [2.5, 3,4,5,10])

  return self._with_data(data ** n)


(2.5, 0): 16.3655613746
(3, 1): 15.7523197962
(2.5, 1): 24.4495429034
(1.5, 0): 16.8063741018
(1.999, 1): 24.3087870156
(1.999, 0): 16.4543514092
(1.5, 1): 24.3728337994
(1, 0): 14.865758076
(3, 0): 13.3545845664
(1, 1): 16.5995817516
best parameter is rho = 3 if_ivf = 0


## Recommendation system

In [95]:
def recommend(raw_data, best_para, num_rec, if_outside_user = 0, user_id = None, user_pref = None):
    '''
    Recommend songs using neighborhood models, produce song_ids for given user_id.
    Inputs:
        - raw_data: a sparse matrix of size(#users, #songs)
        - best_para: tuple of best parameter
        - if_outside_user: logical parameter that indicates whether the user is in the read-in data
        - id of the user
        - user_pref: if the user is not in the read-in data, this parameter is a list 
        of # of times of each song that the user has listened to (the same format in the read-in data)
    Output:
        - rec_song: a list of song_ids generated by the neighborhood model.
    '''
    rho, if_ivf= 1.5, best_para[1]
    data = prep2(raw_data, if_ivf = if_ivf)
    model = NeighborhoodModel()
    if if_outside_user == 0:
        rec_song = model.recommend(data,user_id = user_id, num_rec = num_rec)
    else:
        
        rec_song = model.recommend_out(data, user_pref, num_rec = num_rec)
    return rec_song

In [96]:
recommend(raw_data, best_para, if_outside_user = 0, num_rec = 1, user_id = 23)

[6962]

## Recommendation system showcase

In [98]:
def top_k_song(raw_data, user_id, k):
    # get the most listened k song ids for the specific user 
    song_arr = np.asarray(raw_data[user_id, :].todense()).squeeze()
    topksong = np.argsort(song_arr)[-k:]
    return topksong

In [99]:
#select 10 random users
user_ids = np.random.choice(num_instances, 10)


In [102]:
num_user_rec = 10
rec_dict = {}
read_dict = np.load('song_index_dictionary.npy').item()

for user_id in user_ids:
    # the list of song ids generated by the recommendation system
    rec_song = np.array([read_dict[x] for x in recommend(raw_data, best_para, if_outside_user = 0, num_rec = 3, user_id = user_id)])
    #the list of  10 song ids that the user listened to most frequently.
    topksong =  np.array([read_dict[x] for x in top_k_song(raw_data, user_id, 10)])
    rec_song_name = []
    topksong_name = []
    # convert song ids to the name of the songs using unique_tracks.txt for both rec_song and topksong
    # store the song name for each user in dictionary rec_dict, key is the user_id, 
    # value is a list of song names and artists
    for i in range(rec_song.shape[0]):
        song_id = rec_song[i]
        song_name = None
        searchfile = open("unique_tracks.txt", "r")
        for line in searchfile:
            if song_id in line:
                song_arr = line.rsplit('<SEP>', 2)
                song_name = song_arr[2].rstrip()
                artist_name = song_arr[1].rstrip()
        rec_song_name.append((song_name,artist_name))
        searchfile.close()
    for i in range(topksong.shape[0]):
        song_id = topksong[i]
        song_name = None
        searchfile = open("unique_tracks.txt", "r")
        for line in searchfile:
            if song_id in line: 
                song_arr = line.rsplit('<SEP>', 2)
                song_name = song_arr[2].rstrip()
                artist_name = song_arr[1].rstrip()
        topksong_name.append((song_name,artist_name))
        searchfile.close()
    rec_dict[user_id] = (topksong_name, rec_song_name)


In [103]:
# key is the user_id, 
# value is a list of song names and artists
# first list of the values are the song names of the past listening hitory
# second list of the values are the song name of the 
rec_dict

{279: ([('Prête A Porter', 'Paris Combo'),
   ('Invocation: Attica Blues', 'Archie Shepp / William Kunstler'),
   ("Where You'll Find Me Now", 'Neutral Milk Hotel'),
   ('Under The Gun', 'The Killers'),
   ('Anthems For a Seventeen Year-Old Girl', 'Broken Social Scene'),
   ('Angry Chair', 'Alice In Chains'),
   ('Would You Go With Me', 'Josh Turner'),
   ('Spilt Needles (Album)', 'The Shins'),
   ('Comet Course', 'Flying Lotus'),
   ('Jeane', 'The Smiths')],
  [('Scream', 'Michael Jackson'),
   ('Window Blues', 'Lykke Li'),
   ('I Believe In A Thing Called Love', 'The Darkness')]),
 854: ([("It's My Party", 'Lesley Gore'),
   ('People', 'Journey'),
   ('When A Man Loves A Woman', 'Percy Sledge'),
   ('Window Blues', 'Lykke Li'),
   ('Ride For You (Album Version)', 'Danity Kane'),
   ("Un-thinkable (I'm Ready)", 'Alicia Keys'),
   ('The Wild Boys', 'Duran Duran'),
   ('Secret Hell', 'dEUS'),
   ("Things I Don't Understand", 'Coldplay'),
   ('So Glad To See You', 'Hot Chip')],
  [('Jerr