In [1]:
// Read user_artist_data.txt as an RDD with one element per line of text.
val rawUserArtistData =
    sc.textFile("file:///var/lib/myspark/dataset/profiledata_06-May-2005/user_artist_data.txt")
// Total number of lines in the file.
rawUserArtistData.count()

24296858

In [2]:
// Summary statistics over the first space-separated field of every record (the user ID).
rawUserArtistData.map { line =>
    val fields = line.split(' ')
    fields(0).toDouble
}.stats()

(count: 24296858, mean: 1947573.265353, stdev: 496000.544975, max: 2443548.000000, min: 90.000000)

In [3]:
// Summary statistics over the second space-separated field of every record (the artist ID).
rawUserArtistData.map { line =>
    val fields = line.split(' ')
    fields(1).toDouble
}.stats()

(count: 24296858, mean: 1718704.093757, stdev: 2539389.040171, max: 10794401.000000, min: 1.000000)

In [4]:
// Read artist_data.txt as an RDD with one element per line of text.
val rawArtistData =
    sc.textFile("file:///var/lib/myspark/dataset/profiledata_06-May-2005/artist_data.txt")
// Total number of lines in the file.
rawArtistData.count()

1848707

In [5]:
// First attempt at parsing artist_data.txt into (artist ID, artist name) pairs.
// Each line is cut at its first tab: the part before it is the ID, and the
// remainder (which still starts with the tab) is trimmed to give the name.
// Nothing guards against lines whose ID part is not an integer.
val artistByID = rawArtistData.map { line =>
    val parts = line.span(ch => ch != '\t')
    (parts._1.toInt, parts._2.trim)
}
// Print a small sample of the parsed pairs.
for (entry <- artistByID.take(5)) println(entry)

(1134999,06Crazy Life)
(6821360,Pang Nakarin)
(10113088,Terfel, Bartoli- Mozart: Don)
(10151459,The Flaming Sidebur)
(6826647,Bodenstandig 3000)


In [6]:
// Parse artist_data.txt defensively: every line yields Some((id, name)) when it
// can be parsed and None otherwise, so malformed lines no longer abort the job.
val artistByID = rawArtistData.map { line =>
    // Cut at the first tab; `name` keeps the leading tab until it is trimmed.
    val (id, name) = line.span(_ != '\t')
    if (name.isEmpty) {
        // No tab on this line, so there is no name part at all.
        None
    } else {
        try {
            Some((id.toInt, name.trim))
        } catch {
            // The ID part is not a valid integer.
            case _: NumberFormatException => None
        }
    }
}
// Keep only the successfully parsed pairs. A single `collect` with a partial
// function replaces the former `filter(_ != None)` followed by a `map` whose
// `case Some(...)` pattern was non-exhaustive and relied on that filter.
val newArtistByID = artistByID.collect { case Some(entry) => entry }
newArtistByID.take(10).foreach(println)

(1134999,06Crazy Life)
(6821360,Pang Nakarin)
(10113088,Terfel, Bartoli- Mozart: Don)
(10151459,The Flaming Sidebur)
(6826647,Bodenstandig 3000)
(10186265,Jota Quest e Ivete Sangalo)
(6828986,Toto_XX (1977)
(10236364,U.S Bombs -)
(1135000,artist formaly know as Mat)
(10299728,Kassierer - Musik für beide Ohren)


In [7]:
// Read artist_alias.txt and build a local map from one artist ID to another
// (the two tab-separated integer fields of each line).
val rawArtistAlias = sc.textFile("file:///var/lib/myspark/dataset/profiledata_06-May-2005/artist_alias.txt")
val artistAlias = rawArtistAlias.flatMap { line =>
    val tokens = line.split('\t')
    // Skip lines missing either ID. `split` drops trailing empty strings, so a
    // line such as "123" or "123\t" produces a single token and would otherwise
    // fail on tokens(1).
    if (tokens.length < 2 || tokens(0).isEmpty) {
        None
    } else {
        Some((tokens(0).toInt, tokens(1).toInt))
    }
}.collectAsMap()
// Print a small sample of the alias pairs.
artistAlias.take(10).foreach(println)

(2094504,1012167)
(1186393,78)
(2139121,1011083)
(10208201,4605)
(2024757,1001941)
(9969191,1320354)
(10412283,1010353)
(2124273,2814)
(6663187,1992)
(6803336,1000010)


In [8]:
// Name stored for artist ID 6663187 (first match returned by the lookup).
{
    val matchingNames = newArtistByID.lookup(6663187)
    matchingNames.head
}

Ja Rule ft. lil mo & vita

In [9]:
// Name stored for artist ID 1992 (first match returned by the lookup).
{
    val matchingNames = newArtistByID.lookup(1992)
    matchingNames.head
}

Ja Rule

In [10]:
import org.apache.spark.mllib.recommendation._

// Broadcast the alias map; tasks read it through bArtistAlias.value.
val bArtistAlias = sc.broadcast(artistAlias)

// Turn every "user artist count" line into a Rating, replacing the artist ID
// with its alias target when the alias map has an entry for it.
val trainData = rawUserArtistData.map { line =>
    line.split(' ').map(_.toInt) match {
        case Array(user, artist, plays) =>
            val aliases = bArtistAlias.value
            val canonicalArtist = aliases.getOrElse(artist, artist)
            Rating(user, canonicalArtist, plays)
    }
}.cache()
// Number of ratings produced.
trainData.count()

24296858

In [11]:
// Print the first 20 ratings.
for (rating <- trainData.take(20)) println(rating)

Rating(1000002,1,55.0)
Rating(1000002,1000006,33.0)
Rating(1000002,1000007,8.0)
Rating(1000002,1000009,144.0)
Rating(1000002,1000010,314.0)
Rating(1000002,1000013,8.0)
Rating(1000002,1000014,42.0)
Rating(1000002,1000017,69.0)
Rating(1000002,1000024,329.0)
Rating(1000002,1000025,1.0)
Rating(1000002,1000028,17.0)
Rating(1000002,1000031,47.0)
Rating(1000002,1000033,15.0)
Rating(1000002,1000042,1.0)
Rating(1000002,1000045,1.0)
Rating(1000002,1000054,2.0)
Rating(1000002,1000055,25.0)
Rating(1000002,1000056,4.0)
Rating(1000002,1000059,2.0)
Rating(1000002,1000062,71.0)


In [12]:
// Train an ALS model with ALS.trainImplicit; arguments are named to make each
// hyperparameter explicit.
val model = ALS.trainImplicit(
    ratings = trainData,
    rank = 10,
    iterations = 5,
    lambda = 0.01,
    alpha = 1.0
)

In [13]:
// Show the first (user ID, feature vector) pair, with the vector rendered as a
// comma-separated string.
model.userFeatures.mapValues(features => features.mkString(", ")).first()

(90,-0.6872329115867615, -0.6961866021156311, 0.6544296741485596, -0.021609988063573837, 0.01312947366386652, -0.22800526022911072, -0.33769890666007996, 0.3442131578922272, -0.5649751424789429, -0.7838118076324463)

In [23]:
// All raw "user artist count" records (as split string fields) for user 2093760.
val rawArtist4User = rawUserArtistData.map(line => line.split(' ')).filter {
    case Array(user, _, _) => user.toInt == 2093760
}
// Print each record as "user : artist : count".
rawArtist4User.collect.foreach { fields =>
    println(s"${fields(0)} : ${fields(1)} : ${fields(2)}")
}

                                                                                2093760 : 1180 : 1
2093760 : 1255340 : 3
2093760 : 378 : 1
2093760 : 813 : 2
2093760 : 942 : 7


In [25]:
// Records for user 2093760 whose count field is at least 2.
val rawArtist4UserGT2 = rawUserArtistData.map(_.split(' ')).filter {
    case Array(user, _, count) => user.toInt == 2093760 && count.toInt >= 2
}
// Print the filtered records as "user : artist : count". This previously
// printed rawArtist4User, the unfiltered RDD, instead of the RDD defined above.
rawArtist4UserGT2.collect.foreach(x => println(x(0) + " : " + x(1) + " : " + x(2)))

                                                                                2093760 : 1255340 : 3
2093760 : 942 : 7


In [21]:
// Set of artist IDs appearing in the records of rawArtist4User.
val existingProducts = {
    val artistIDs = rawArtist4User.map { case Array(_, artist, _) => artist.toInt }
    artistIDs.collect().toSet
}
// Print every artist ID in the set.
for (artistID <- existingProducts) println(artistID)

1255340
942
1180
813
378


In [20]:
// Print the names of the artists whose IDs are in existingProducts.
newArtistByID.filter { case (id, _) =>
    existingProducts.contains(id)
}.values.collect().foreach(name => println(name))

                                                                                David Gray
Blackalicious
Jurassic 5
The Saw Doctors
Xzibit


In [26]:
// Ask the model for 10 product (artist) recommendations for user 2093760.
val recommendations = model.recommendProducts(user = 2093760, num = 10)
// Print each recommended Rating.
for (rec <- recommendations) println(rec)

Rating(2093760,2814,0.03293693435758711)
Rating(2093760,1001819,0.03229471003837945)
Rating(2093760,1300642,0.03206460631566366)
Rating(2093760,1007614,0.03136908302499334)
Rating(2093760,4605,0.03134113371120238)
Rating(2093760,1003249,0.030874872787023278)
Rating(2093760,1811,0.030867540635197574)
Rating(2093760,1037970,0.030770580428378828)
Rating(2093760,829,0.029971973588721963)
Rating(2093760,1004028,0.0299674146954826)


In [29]:
// Artist IDs of the recommendations, gathered into a set for membership tests.
val recommendedProductIDs = recommendations.map(rating => rating.product).toSet
// Print the names of the recommended artists.
newArtistByID.filter { case (id, _) =>
    recommendedProductIDs.contains(id)
}.values.collect().foreach(name => println(name))

                                                                                Notorious B.I.G.
50 Cent
Snoop Dogg
Nas
Jay-Z
Kanye West
Dr. Dre
Ludacris
2Pac
The Game


In [30]:
// Train a second ALS model with different hyperparameters; arguments are named
// to make each one explicit.
val newModel = ALS.trainImplicit(
    ratings = trainData,
    rank = 50,
    iterations = 5,
    lambda = 1.0,
    alpha = 40.0
)

In [31]:
// Ask the second model for 10 recommendations for the same user.
val recommendationsNew = newModel.recommendProducts(user = 2093760, num = 10)
// Artist IDs of those recommendations, gathered into a set for membership tests.
val recommendedProductIDsNew = recommendationsNew.map(rating => rating.product).toSet
// Print the names of the recommended artists.
newArtistByID.filter { case (id, _) =>
    recommendedProductIDsNew.contains(id)
}.values.collect().foreach(name => println(name))

                                                                                50 Cent
Snoop Dogg
Jay-Z
Black Eyed Peas
Kanye West
D12
Dr. Dre
2Pac
Eminem
Outkast
