# Collaborative Filtering Approach

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse as sps
import sklearn as sk
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

# All outputs are visible
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#Retina resolution for the plots
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
# All plots contained in the notebook
%matplotlib inline

In [5]:
# Load the raw CSV inputs (paths are relative to the notebook's directory).
# train.csv: playlist/track pairs (columns playlist_id, track_id in the preview below).
trainData = pd.read_csv("../data/raw/train.csv")
# tracks.csv: per-track metadata (track_id, album_id, artist_id, duration_sec).
trackData = pd.read_csv("../data/raw/tracks.csv")
# presumably the playlists that need recommendations; not used in the cells shown here -- TODO confirm
tP = pd.read_csv("../data/raw/target_playlists.csv")
# presumably the expected submission layout; not used in the cells shown here -- TODO confirm
sS = pd.read_csv("../data/raw/sample_submission.csv")



In [6]:
# Preview the first rows of the playlist/track pairs
trainData.head()

Unnamed: 0,playlist_id,track_id
0,0,14301
1,0,8360
2,0,12844
3,0,18397
4,0,1220


In [7]:
# Build one count table per entity: tracks per album, tracks per artist,
# and tracks per playlist.
# rename_axis(...).reset_index(name=...) names both output columns explicitly.
# The previous rename({"index": ...}) relied on the column names produced by
# value_counts().reset_index() in pandas < 2.0; on newer pandas it would rename
# the id column to "*_counts" and leave the real counts in a column called "count".
test = trackData['album_id'].value_counts().rename_axis('album_id').reset_index(name='album_counts')
test2 = trackData['artist_id'].value_counts().rename_axis('artist_id').reset_index(name='artist_counts')
test3 = trainData['playlist_id'].value_counts().rename_axis('playlist_id').reset_index(name='playlist_counts')


# Largest counts first
albums_songs = test.sort_values('album_counts',ascending=False)
artists_songs = test2.sort_values('artist_counts',ascending=False)
playlists_songs = test3.sort_values('playlist_counts',ascending=False)
albums_songs.head() # how many songs each album has
albums_songs.shape
artists_songs.head() # how many songs each artist performed
artists_songs.shape
playlists_songs.head() # how many songs each playlist has
playlists_songs.shape
trackData.head()
trackData.shape

Unnamed: 0,album_id,album_counts
0,6622,28
1,4853,26
2,11633,24
3,4774,21
8,1490,16


(12744, 2)

Unnamed: 0,artist_id,artist_counts
0,2175,84
1,5367,67
2,3829,64
3,4782,61
4,6591,57


(6668, 2)

Unnamed: 0,playlist_id,playlist_counts
0,38239,100
2,44196,93
1,48541,93
3,16146,92
4,26714,92


(50446, 2)

Unnamed: 0,track_id,album_id,artist_id,duration_sec
0,0,6306,449,167
1,1,12085,4903,185
2,2,1885,6358,201
3,3,3989,1150,263
4,4,11633,4447,96


(20635, 4)

In [8]:
# Attach the track metadata (album, artist, duration) to every playlist/track pair
trainData_merged = pd.merge(trainData, trackData, on='track_id')

In [9]:
# Compare the raw pairs with the merged frame that also carries track metadata
trainData.head()
trainData_merged.head()

Unnamed: 0,playlist_id,track_id
0,0,14301
1,0,8360
2,0,12844
3,0,18397
4,0,1220


Unnamed: 0,playlist_id,track_id,album_id,artist_id,duration_sec
0,0,14301,1160,5229,221
1,58,14301,1160,5229,221
2,249,14301,1160,5229,221
3,277,14301,1160,5229,221
4,1029,14301,1160,5229,221


Create the playlist × track interaction matrix (one row per playlist, one column per track)

In [10]:
# Dense binary interaction matrix: data_matrix[p, t] == 1 when playlist p contains track t.
num_playlists = trainData['playlist_id'].unique().shape[0]
num_tracks = trackData['track_id'].unique().shape[0]

playlist_idx = trainData['playlist_id'].to_numpy()
track_idx = trainData['track_id'].to_numpy()

# Ids are used directly as row/column positions, so they must be 0-based and
# fit inside the matrix. The previous "-1" / "-2" offsets made id 0 wrap around
# to the last row/column and shifted every other entry away from its own id.
assert playlist_idx.min() >= 0 and playlist_idx.max() < num_playlists, "playlist_id outside matrix rows"
assert track_idx.min() >= 0 and track_idx.max() < num_tracks, "track_id outside matrix columns"

# NOTE: this is a dense float64 array of num_playlists x num_tracks cells,
# which is several GB for the shapes shown in this notebook.
data_matrix = np.zeros((num_playlists,num_tracks))
# Vectorised assignment replaces the per-row Python loop
data_matrix[playlist_idx, track_idx] = 1
pd_data = pd.DataFrame(data=data_matrix)

In [None]:
# from sklearn.metrics.pairwise import pairwise_distances 
# user_similarity = pairwise_distances(data_matrix, metric='cosine')
# item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [11]:
# Every (playlist, track) row is one implicit interaction with weight 1.
# A 1-D array is assigned so the column does not depend on pandas squeezing
# an (n, 1) array into a single column.
trainData['interactions'] = np.ones(len(trainData), dtype=int)

In [13]:
# Sparse playlist x track interaction matrix in COO format.
# scipy.sparse is already imported as `sps` in the imports cell at the top of
# the notebook, so the duplicate import that used to sit here is removed.
interaction = trainData['interactions'].to_numpy()
rows = trainData['playlist_id'].to_numpy()
cols = trainData['track_id'].to_numpy()
# No explicit shape is passed: the matrix dimensions are inferred from the
# largest playlist_id / track_id present in trainData.
sparse_interaction = sps.coo_matrix((interaction,(rows,cols)))
sparse_interaction

<50446x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 1211791 stored elements in COOrdinate format>

Create the user (playlist-to-playlist) similarity matrix

In [None]:
# Cosine similarity between playlists (rows of the interaction matrix).
# pairwise_distances(metric='cosine') returns the cosine *distance* (1 - similarity),
# so using its output as similarity weights favours the least similar playlists.
# dense_output=False keeps the result sparse instead of materialising a dense
# playlists x playlists array.
user_similarity = sps.coo_matrix(cosine_similarity(sparse_interaction.tocsr(), dense_output=False))

Create the item (track-to-track) similarity matrix

In [None]:
# Cosine similarity between tracks (columns of the interaction matrix, hence the transpose).
# pairwise_distances(metric='cosine') returns the cosine *distance* (1 - similarity),
# so using its output as similarity weights favours the least similar tracks.
# dense_output=False keeps the result sparse instead of materialising a dense
# tracks x tracks array.
item_similarity = sps.coo_matrix(cosine_similarity(sparse_interaction.T.tocsr(), dense_output=False))

Save the matrices as sparse matrices

In [None]:
# Write both similarity matrices to disk in scipy's sparse .npz format
for npz_name, sparse_matrix in (
    ("user_similarity", user_similarity),
    ("item_similarity", item_similarity),
):
    sps.save_npz(npz_name, sparse_matrix)

In [1]:
def predict(ratings, similarity, type='user'):
    """Predict a score for every (user, item) pair with memory-based collaborative filtering.

    Parameters
    ----------
    ratings : array-like of shape (n_users, n_items)
        Dense user x item rating / interaction matrix.
    similarity : numpy.ndarray or scipy.sparse matrix
        Square similarity matrix: (n_users, n_users) when ``type='user'``,
        (n_items, n_items) when ``type='item'``.
    type : {'user', 'item'}, default 'user'
        ``'user'``: each user's mean rating plus the similarity-weighted average
        of the other users' deviations from their own means.
        ``'item'``: similarity-weighted average of the user's ratings over items.

    Returns
    -------
    numpy.ndarray of shape (n_users, n_items)
        Predicted scores.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type not in ('user', 'item'):
        # Previously an unknown value fell through and failed with an unbound `pred`.
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))

    ratings = np.asarray(ratings, dtype=float)
    # Built-in abs() works for both ndarrays and scipy sparse matrices.
    abs_similarity = abs(similarity)

    if type == 'user':
        # Total absolute similarity per user (row sums). np.asarray + ravel
        # flattens the np.matrix that sparse .sum() returns.
        weight_sums = np.asarray(abs_similarity.sum(axis=1), dtype=float).ravel()
        # A user with no similarity mass has a zero numerator as well;
        # dividing by 1 keeps the prediction at the user's mean instead of NaN.
        weight_sums = np.where(weight_sums == 0, 1.0, weight_sums)
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-user means into a column so they broadcast over items
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weighted_diff = np.asarray(similarity.dot(ratings_diff))
        return mean_user_rating[:, np.newaxis] + weighted_diff / weight_sums[:, np.newaxis]

    # Item-based: pred[u, j] = sum_i r[u, i] * s[i, j] / sum_i |s[i, j]|.
    # The normaliser therefore sums over the rows of `similarity` (axis=0);
    # for a symmetric similarity matrix this equals the row sums used before.
    weight_sums = np.asarray(abs_similarity.sum(axis=0), dtype=float).ravel()
    weight_sums = np.where(weight_sums == 0, 1.0, weight_sums)
    # (S^T R^T)^T == R S, written this way so a sparse `similarity` drives the product.
    weighted = np.asarray(similarity.T.dot(ratings.T)).T
    return weighted / weight_sums[np.newaxis, :]

Make predictions based on the users

In [None]:
# Score every playlist/track pair from the playlist-to-playlist similarities
user_prediction = predict(
    data_matrix,
    user_similarity,
    type='user',
)

Make predictions based on the items

In [2]:
# Score every playlist/track pair from the track-to-track similarities
item_prediction = predict(
    data_matrix,
    item_similarity,
    type='item',
)

NameError: name 'data_matrix' is not defined