In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [24]:
# Location of the Last.fm CSV this notebook works on (a pinned gist revision).
LASTFM_CSV_URL = 'https://gist.githubusercontent.com/victorkohler/0931d181ef126e0740d8aac6933f13f4/raw/07f87d79d2ad34ddc9d24e4ba5287ba3503725a6/lastfm.csv'

# Download and parse the CSV into the frame every later cell starts from.
data_items = pd.read_csv(LASTFM_CSV_URL)

In [25]:
# Preview the first rows of the frame to check its layout before transforming it.
data_items.head()

Unnamed: 0,user,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,51,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Work on a copy of the frame without the 'user' column, so only the
# per-artist columns take part in the calculations that follow.
# The axis is passed by keyword: the positional form `drop('user', 1)` was
# deprecated in pandas 1.3 and is rejected from pandas 2.0 onwards.
last_df = data_items.drop('user', axis=1)

In [8]:
# Step one of turning every user row into a unit vector: compute the
# Euclidean length of each row.

# magnitude = sqrt(x^2 + y^2 + z^2 + ...)
magnitude = np.sqrt(
    (last_df ** 2).sum(axis=1)
)

In [10]:
# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
# A row whose entries are all zero has magnitude 0, and 0 / 0 would turn the
# whole row into NaN. Such rows are divided by 1 instead, which leaves them as
# all zeros; every other row is scaled exactly as before.
# NOTE: this rebinds last_df, so running this cell again without first
# re-running the cell that creates last_df scales the rows a second time.
last_df = last_df.divide(magnitude.where(magnitude != 0, 1.0), axis='index')

In [20]:
def calculate_similarity(data_items):
    """Compute the pairwise cosine similarity between the columns of a frame.

    The frame is converted to a SciPy CSR matrix and transposed, so that each
    original column becomes one row handed to ``cosine_similarity``.

    Returns a square DataFrame whose index and columns are both
    ``data_items.columns``.
    """
    labels = data_items.columns
    column_vectors = scipy.sparse.csr_matrix(data_items).transpose()
    return pd.DataFrame(
        data=cosine_similarity(column_vectors),
        index=labels,
        columns=labels,
    )

In [19]:
# Compute the column-by-column similarity matrix for last_df.
data_matrix = calculate_similarity(last_df)

# Show the 11 highest scores in the 'beyonce' row. The row also contains the
# 'beyonce' column itself, so expect the artist's own entry among the results.
print(data_matrix.loc['beyonce', :].nlargest(n=11))

  (0, 82)	0.30151134457776363
  (0, 87)	0.30151134457776363
  (0, 109)	0.30151134457776363
  (0, 114)	0.30151134457776363
  (0, 129)	0.30151134457776363
  (0, 140)	0.30151134457776363
  (0, 216)	0.30151134457776363
  (0, 220)	0.30151134457776363
  (0, 222)	0.30151134457776363
  (0, 253)	0.30151134457776363
  (0, 262)	0.30151134457776363
  (1, 3)	0.20412414523193154
  (1, 31)	0.20412414523193154
  (1, 58)	0.20412414523193154
  (1, 60)	0.20412414523193154
  (1, 68)	0.20412414523193154
  (1, 71)	0.20412414523193154
  (1, 75)	0.20412414523193154
  (1, 94)	0.20412414523193154
  (1, 97)	0.20412414523193154
  (1, 102)	0.20412414523193154
  (1, 126)	0.20412414523193154
  (1, 129)	0.20412414523193154
  (1, 148)	0.20412414523193154
  (1, 162)	0.20412414523193154
  :	:
  (1223, 174)	0.21320071635561041
  (1223, 188)	0.21320071635561041
  (1223, 238)	0.21320071635561041
  (1224, 34)	0.4472135954999579
  (1224, 62)	0.4472135954999579
  (1224, 207)	0.4472135954999579
  (1224, 241)	0.4472135954999579

In [28]:
#------------------------
# USER-ITEM CALCULATIONS
#------------------------

user = 5985 # The id of the user for whom we want to generate recommendations

# Index label of that user's row in data_items. last_df was derived from
# data_items without changing the index, so the same label addresses the
# user's row there as well.
matching_rows = data_items[data_items.user == user].index.tolist()
if not matching_rows:
    # Fail with a clear message instead of a bare IndexError on [0].
    raise ValueError('user id {} not found in data_items'.format(user))
user_index = matching_rows[0] # Get the frame index

# The user's values for all items, as one row of last_df.
# `.loc` replaces `.ix`, which pandas deprecated in 0.20 and removed in 1.0;
# user_index is an index label, so label-based lookup is the matching accessor.
user_rating_vector = last_df.loc[user_index]

# Get the artists the user has liked (the columns with a positive value).
known_user_likes = user_rating_vector[user_rating_vector > 0].index.values

# Calculate the score: for every artist, the similarity-weighted sum of the
# user's values, divided by the sum of that artist's row in data_matrix.
score = data_matrix.dot(user_rating_vector).div(data_matrix.sum(axis=1))

# Remove the known likes from the recommendation.
score = score.drop(known_user_likes)

# Print the known likes and the top 20 recommendations.
print(known_user_likes)
print(score.nlargest(20))

['bob dylan' 'the cure']
joy division           0.038003
the smiths             0.031639
david bowie            0.031589
yann tiersen           0.031574
the rolling stones     0.030289
tom waits              0.027140
eric clapton           0.025789
misfits                0.025592
led zeppelin           0.023374
belle and sebastian    0.022591
elliott smith          0.022134
jimi hendrix           0.021589
the national           0.018477
ramones                0.017242
the beatles            0.017122
mogwai                 0.016272
the doors              0.016109
sufjan stevens         0.015978
bruce springsteen      0.015488
the clash              0.015378
dtype: float64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]
