In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

from pyspark.mllib.recommendation import ALS, Rating

import itertools, gc, math

from pyechonest import config, song
config.ECHO_NEST_API_KEY="secret"

In [2]:
def userStrToIntMap(data):
    user = data.map(lambda x:x[0]).distinct().zipWithUniqueId()
    return user

def songStrToIntMap(data):
    songs = data.map(lambda x:x[1]).distinct().zipWithUniqueId()
    return songs

def converUserAndSongToInt(data, user, songs):
    data = data.map(lambda x: (x[0], (x[1], x[2]))).join(user)   # (user, ((songs, listenCnt), userInt))
    data = data.map(lambda x: (x[1][1], x[1][0][0], x[1][0][1]))
    data = data.map(lambda x: (x[1], (x[0], x[2]))).join(songs)
    data = data.map(lambda x: (x[1][0][0], x[1][1], int(x[1][0][1])))
    return data

def userReverseMap(user):
    userReverse = user.map(lambda x: (x[1], x[0]))
    return userReverse

def songReverseMap(songs):
    songReverse = songs.map(lambda x: (x[1], x[0]))
    return songReverse

def songStrToName(songStr):
    try:
        return song.Song(songStr)
    except IndexError:    # Echo nest API error sometimes happens
        return

def songIdToStr(songId):
    try:
        songStr = songReverse.filter(lambda x: x[0] == songId).collect()[0][1]
        return songStr
    except:
        return

In [3]:
def computeRMSE(model, data):
    dataToUserSong = data.map(lambda r: (r[0], r[1]))
    predictions = model.predictAll(dataToUserSong).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = data.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    RMSE = math.sqrt(ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    return RMSE

def findBestModel(train, validation, ranks, lambdas, alphas, iteration):
    bestModel = None
    bestRMSE = 10000000
    bestRank = None
    bestLambda = None
    bestAlpha = None
    
    for rank, lmbda, alpha in itertools.product(ranks, lambdas, alphas):
        try:
            model = ALS.trainImplicit(train, rank=rank, iterations=iteration, lambda_=lmbda, alpha=alpha)
            RMSE = computeRMSE(model, validation)
            print rank, lmbda, alpha, RMSE
            if RMSE < bestRmse:
                bestModel = model
                bestRMSE = RMSE
                bestRank = rank
                bestLambda = lmbda
                bestAlpha = alpha
        except:
            pass
        gc.collect()
    return bestModel, bestRMSE

In [None]:
data = sc.textFile('train_triplets.txt')
data = data.map(lambda x: x.strip().split())

user = userStrToIntMap(data)
songs = songStrToIntMap(data)

userReverse = userReverseMap(user)
songReverse = songReverseMap(songs)

data = converUserAndSongToInt(data, user, songs)
train6, validation2,test2 = data.randomSplit([6,2,2])

ranks = [5, 10]
lambdas = [0.01, 0.1]
alphas = [0.01, 0.1]
iteration = 5

rank=10
#bestModel, bestRMSE = findBestModel(train6, validation2, ranks, lambdas, alphas, iteration)
model = ALS.trainImplicit(train6, rank, iteration, alpha=0.1)
#RMSE = computeRMSE(model, validation2)

In [34]:
RMSE = computeRMSE(model, test2)
print RMSE

7.56231115875


In [26]:
def recommendSongs(userId, numRec):
    listenedSongs = data.filter(lambda x:x[0] == userId).map(lambda x: x[0]).collect()[:20]
    print listenedSongs
    listenedSongs = map(songStrToName, listenedSongs)
    print listenedSongs
    results = []
    res = model.recommendProducts(userId, numRec)
    for r in res:
        songId = r.product
        songStr = songIdToStr(songId)
        print songId, songStr
        try:
            songName = songStrToName(songStr)
            results.append(songName)
        except IndexError:
            pass
    return results

In [27]:
for u in user.take(1):
    userStr, userId = u[0], u[1]
    res = recommendSongs(userId, 10)
    print 'res', res

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


AttributeError: 'PipelinedRDD' object has no attribute 'Song'