## Audio Recommender

### Wenting Su, ws2512
### Kewei Liu, kl2987 
### Tianhui Shen, ts3088

In [1]:
from pyspark import SparkContext
from pyspark.context import SparkConf

# Assemble the whole configuration in one chained expression: application
# name, a local master using all available cores ('local[*]'), and the memory
# limits for this session.
conf = (
    SparkConf()
    .setAppName("App")
    .setMaster('local[*]')
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
)

# Every RDD in this notebook is created through this context.
sc = SparkContext(conf=conf)

In [2]:
# Echo the SparkContext created above so the notebook shows its repr.
sc

### Load Data

In [3]:
# RDD of raw text lines from the user/artist file. Later cells split each line
# on a single space into user ID, artist ID and count.
rawUserArtistData = sc.textFile("audio_data/user_artist_data.txt")

In [4]:
# Summary statistics of the user-ID column, i.e. the first space-separated
# field of every line, parsed as a float.
(rawUserArtistData
    .map(lambda line: float(line.split(' ')[0]))
    .stats())

(count: 24296858, mean: 1947573.26535, stdev: 496000.5449748051, max: 2443548.0, min: 90.0)

In [5]:
# Summary statistics of the artist-ID column, i.e. the second space-separated
# field of every line, parsed as a float.
(rawUserArtistData
    .map(lambda line: float(line.split(' ')[1]))
    .stats())

(count: 24296858, mean: 1718704.09376, stdev: 2539389.0401707785, max: 10794401.0, min: 1.0)

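The maximum user and artist IDs are 2443548 and 10794401, respectively.
Both are smaller than 2147483647, the largest 32-bit signed integer, which is the upper limit for the integer user and product IDs that MLlib's ALS works with. Thus, no additional transformation will be necessary to use these IDs.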

In [6]:
# RDD of raw text lines from the artist file; each line is parsed into an
# (artist ID, name) pair by artistID() in the next cell.
rawArtistData = sc.textFile("audio_data/artist_data.txt") 

In [7]:
def artistID (line):
    """Parse one line of the artist file into ``[(artist_id, name)]``.

    The line is expected to hold an integer ID and a name separated by a tab.
    Only the first tab is used as the separator, so the name may itself
    contain tabs. The result is wrapped in a list so the function can be
    passed to ``flatMap``: a malformed line contributes no record.

    Args:
        line: one raw text line.

    Returns:
        A one-element list ``[(int_id, name)]``, or ``[]`` when the line has
        no tab or the ID field is not an integer.
    """
    fields = line.split('\t', 1)
    if len(fields) != 2:
        return []
    try:
        return [(int(fields[0]), fields[1])]
    except ValueError:
        # Non-numeric ID: skip the line. Only ValueError is caught so that
        # KeyboardInterrupt / SystemExit and genuine programming errors are
        # not silently swallowed the way a bare ``except:`` would.
        return []

# artistID already returns a list per line, so it can be handed to flatMap
# directly; the collected (id, name) pairs are turned into a driver-side dict.
artistByID = dict(rawArtistData.flatMap(artistID).collect())

In [8]:
# RDD of raw text lines from the alias file; each line is parsed into a pair
# of integer artist IDs by artistalias() in the next cell.
rawArtistAlias = sc.textFile("audio_data/artist_alias.txt") 

In [9]:
def artistalias (line):
    """Parse one line of the alias file into ``[(alias_id, canonical_id)]``.

    The line is expected to hold two integer artist IDs separated by a tab.
    The result is wrapped in a list so the function can be passed to
    ``flatMap``: a malformed line contributes no record instead of failing
    the whole job.

    Args:
        line: one raw text line.

    Returns:
        A one-element list ``[(int, int)]`` built from the first two fields,
        or ``[]`` when the first field is empty, the second field is missing,
        or either field is not an integer.
    """
    fields = line.split('\t')
    # A line without a tab has no second field; an empty first field means
    # the alias ID is missing. Both are skipped.
    if len(fields) < 2 or not fields[0]:
        return []
    try:
        return [(int(fields[0]), int(fields[1]))]
    except ValueError:
        # Empty or non-numeric ID: skip, mirroring how artistID() treats
        # unparseable lines.
        return []

# artistalias already returns a list per line, so flatMap can take it as-is;
# the resulting (alias ID, canonical ID) pairs are collected into a dict.
artistAlias = rawArtistAlias.flatMap(artistalias).collectAsMap()

In [10]:
# Name stored for artist ID 6803336.
artistByID[6803336]

u'Aerosmith (unplugged)'

In [11]:
# Name stored for artist ID 1000010.
artistByID[1000010]

u'Aerosmith'

The alias file maps ID 6803336 to ID 1000010; this entry evidently maps “Aerosmith (unplugged)” to “Aerosmith.”

### Build model

In [12]:
from pyspark.mllib.recommendation import ALS, Rating
# Wrap the alias dict in a broadcast variable; functions that run inside RDD
# operations read it through bArtistAlias.value.
bArtistAlias = sc.broadcast(artistAlias)

In [13]:
def processtrain (line):
    """Turn one ``"<user> <artist> <count>"`` line into a ``Rating``.

    The artist ID is replaced by its entry in the broadcast alias map when
    one exists, so that plays recorded under an alias are attributed to the
    ID the alias points to.

    Args:
        line: one raw text line holding three space-separated integers.

    Returns:
        ``Rating(userID, finalArtistID, count)``.

    Raises:
        ValueError: if the line does not hold exactly three integer fields.
    """
    userID, rawArtistID, count = (int(field) for field in line.split(' '))
    # Fall back to the raw ID only when the alias map has no entry for it.
    # A truthiness test (``if not alias``) would also discard a legitimate
    # alias whose value is 0.
    finalArtistID = bArtistAlias.value.get(rawArtistID, rawArtistID)
    return Rating(userID, finalArtistID, count)

# Parse every line into a Rating and cache the result, since the RDD is
# reused by several later cells.
trainData = rawUserArtistData.map(processtrain).cache()

In [14]:
# Peek at the first five parsed Rating records.
trainData.take(5)

[Rating(user=1000002, product=1, rating=55.0),
 Rating(user=1000002, product=1000006, rating=33.0),
 Rating(user=1000002, product=1000007, rating=8.0),
 Rating(user=1000002, product=1000009, rating=144.0),
 Rating(user=1000002, product=1000010, rating=314.0)]

In [15]:
# Implicit-feedback ALS with 10 latent factors, 5 iterations and a
# regularisation parameter of 0.01. PySpark accepts `alpha` and `seed` as
# keyword arguments; neither is passed here, so both keep their library
# defaults.
model = ALS.trainImplicit(trainData, rank=10, iterations=5, lambda_=0.01)

In [16]:
# RDD of (user ID, feature array) pairs taken from the trained model.
userFeatures = model.userFeatures()

In [17]:
# Inspect the feature arrays of the first five users.
userFeatures.take(5)

[(90,
  array('d', [-0.02411564067006111, 0.03970932960510254, -0.004237746819853783, -0.06278897821903229, 0.07268514484167099, -0.001797014963813126, -0.0375199131667614, -0.04872237890958786, -0.057996898889541626, -0.050966233015060425])),
 (120,
  array('d', [-0.04114802926778793, -0.016246354207396507, -0.028688466176390648, 0.024308418855071068, -0.014551378786563873, 0.027085544541478157, -0.0024207697715610266, -0.018930630758404732, -0.014201169833540916, 0.03605864197015762])),
 (384,
  array('d', [-0.1724633276462555, -0.20634429156780243, 0.012565928511321545, -0.10770470649003983, 0.22335156798362732, 0.417214572429657, -0.08936502039432526, 0.2841363251209259, -0.27385929226875305, -0.014617789536714554])),
 (828,
  array('d', [0.00902756117284298, 0.01565699465572834, -0.013599229976534843, 0.00865993183106184, 0.005420266184955835, 0.025692369788885117, -0.001091812620870769, -0.006139861419796944, -0.009116570465266705, 0.01676923781633377])),
 (1014,
  array('d', [0.

In [18]:
# Broadcast the ID -> name dict so the lookup can run inside the map step.
bartistByID = sc.broadcast(artistByID)

# Names of the artists that user 2093760 has played; an ID with no entry in
# the name dict yields None.
ArtistsForUser = (
    trainData
    .filter(lambda rating: rating.user == 2093760)
    .map(lambda rating: bartistByID.value.get(rating.product))
    .collect()
)

In [19]:
# One artist name per line. The call form of print emits the same text under
# Python 2 for a single argument and, unlike the print statement, also parses
# under Python 3.
for artist in ArtistsForUser:
    print(artist)

David Gray
The Saw Doctors
Blackalicious
Jurassic 5
Xzibit


In [20]:
# Ask the model for 5 recommended products (artists) for user 2093760.
recommendations = model.recommendProducts(2093760, 5)

In [21]:
# Show each recommended Rating (user, product, score) on its own line. The
# call form of print behaves the same under Python 2 for a single argument
# and also parses under Python 3.
for recom in recommendations:
    print(recom)

Rating(user=2093760, product=1007614, rating=0.034918000860505456)
Rating(user=2093760, product=4605, rating=0.03335815401725925)
Rating(user=2093760, product=2814, rating=0.032287476799391535)
Rating(user=2093760, product=1037970, rating=0.03156481228181917)
Rating(user=2093760, product=829, rating=0.031449306009725606)


Find top 10 recommendations for user 2093760

In [22]:
# Resolve the ten recommended artist IDs to names (None when an ID has no
# entry in artistByID). Despite the variable's name, the list holds names.
# A list comprehension yields a concrete list on both Python 2 and 3, whereas
# map() on Python 3 returns an iterator that is empty after one pass.
recommendedProductIDs = [
    artistByID.get(rec.product)
    for rec in model.recommendProducts(2093760, 10)
]

In [23]:
# Header followed by one recommended artist name per line. The call form of
# print produces the same output under Python 2 for a single argument and
# also parses under Python 3.
print("top 10 recommendations for user 2093760\n")
for recom in recommendedProductIDs:
    print(recom)

top 10 recommendations for user 2093760

Jay-Z
Snoop Dogg
50 Cent
Kanye West
Nas
2Pac
Dr. Dre
Outkast
Ludacris
The Roots
