In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, min, max 
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import col
from pyspark.sql.functions import broadcast, when
import pandas as pd

In [3]:
spark = SparkSession.builder.appName("rec").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
raw_user_artist_data = spark.read.text("datasets/Audio-Scrobbler-Datasets/user_artist_data.txt")

In [5]:
raw_user_artist_data.show()

                                                                                

+-------------------+
|              value|
+-------------------+
|       1000002 1 55|
| 1000002 1000006 33|
|  1000002 1000007 8|
|1000002 1000009 144|
|1000002 1000010 314|
|  1000002 1000013 8|
| 1000002 1000014 42|
| 1000002 1000017 69|
|1000002 1000024 329|
|  1000002 1000025 1|
| 1000002 1000028 17|
| 1000002 1000031 47|
| 1000002 1000033 15|
|  1000002 1000042 1|
|  1000002 1000045 1|
|  1000002 1000054 2|
| 1000002 1000055 25|
|  1000002 1000056 4|
|  1000002 1000059 2|
| 1000002 1000062 71|
+-------------------+
only showing top 20 rows



In [6]:
raw_artist_data = spark.read.text("datasets/Audio-Scrobbler-Datasets/artist_data.txt")
raw_artist_data.show(5)

+--------------------+
|               value|
+--------------------+
|1134999	06Crazy Life|
|6821360	Pang Nakarin|
|10113088	Terfel, ...|
|10151459	The Flam...|
|6826647	Bodenstan...|
+--------------------+
only showing top 5 rows



In [7]:
raw_artist_alias = spark.read.text("datasets/Audio-Scrobbler-Datasets/artist_alias.txt")
raw_artist_alias.show(5)

+----------------+
|           value|
+----------------+
| 1092764	1000311|
| 1095122	1000557|
| 6708070	1007267|
|10088054	1042317|
| 1195917	1042317|
+----------------+
only showing top 5 rows



In [8]:
user_artist_df = raw_user_artist_data.withColumn("user", split(raw_user_artist_data['value'], ' ').getItem(0).cast(IntegerType())) \
                .withColumn("artist", split(raw_user_artist_data['value'], ' ').getItem(1).cast(IntegerType())) \
                .withColumn("count", split(raw_user_artist_data['value'], ' ').getItem(2).cast(IntegerType())) \
                .drop('value')

In [9]:
user_artist_df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------+-------+-----+
|   user| artist|count|
+-------+-------+-----+
|1000002|      1|   55|
|1000002|1000006|   33|
|1000002|1000007|    8|
|1000002|1000009|  144|
|1000002|1000010|  314|
|1000002|1000013|    8|
|1000002|1000014|   42|
|1000002|1000017|   69|
|1000002|1000024|  329|
|1000002|1000025|    1|
|1000002|1000028|   17|
|1000002|1000031|   47|
|1000002|1000033|   15|
|1000002|1000042|    1|
|1000002|1000045|    1|
|1000002|1000054|    2|
|1000002|1000055|   25|
|1000002|1000056|    4|
|1000002|1000059|    2|
|1000002|1000062|   71|
+-------+-------+-----+
only showing top 20 rows



                                                                                

In [10]:
user_artist_df.select([min("user"), max("user"), min("artist"), max("artist")]).show()



+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



                                                                                

In [11]:
artist_by_id = raw_artist_data.withColumn("id", split("value", "\s+", 2).getItem(0).cast(IntegerType())) \
                              .withColumn("name", split("value", "\s+", 2).getItem(1).cast(StringType())) \
                              .drop("value")

In [12]:
artist_by_id.show()

+--------+--------------------+
|      id|                name|
+--------+--------------------+
| 1134999|        06Crazy Life|
| 6821360|        Pang Nakarin|
|10113088|Terfel, Bartoli- ...|
|10151459| The Flaming Sidebur|
| 6826647|   Bodenstandig 3000|
|10186265|Jota Quest e Ivet...|
| 6828986|       Toto_XX (1977|
|10236364|         U.S Bombs -|
| 1135000|artist formaly kn...|
|10299728|Kassierer - Musik...|
|10299744|         Rahzel, RZA|
| 6864258|      Jon Richardson|
| 6878791|Young Fresh Fello...|
|10299751|          Ki-ya-Kiss|
| 6909716|Underminded - The...|
|10435121|             Kox-Box|
| 6918061|  alexisonfire [wo!]|
| 1135001|         dj salinger|
| 6940391|The B52's - Chann...|
|10475396|             44 Hoes|
+--------+--------------------+
only showing top 20 rows



In [13]:
artist_alias = raw_artist_alias.withColumn('artist' , split("Value", "\s+").getItem(0).cast(IntegerType())) \
                                .withColumn('alias' , split("Value", "\s+").getItem(1).cast(IntegerType())) \
                                .drop("value")

In [14]:
artist_alias.show()

+--------+-------+
|  artist|  alias|
+--------+-------+
| 1092764|1000311|
| 1095122|1000557|
| 6708070|1007267|
|10088054|1042317|
| 1195917|1042317|
| 1112006|1000557|
| 1187350|1294511|
| 1116694|1327092|
| 6793225|1042317|
| 1079959|1000557|
| 6789612|1000591|
| 1262241|1000591|
| 6791455|1000591|
| 6694867|1000591|
|10141141|1113738|
| 1295140|1000591|
| 1027859|1252408|
| 2127019|1000591|
| 2153974|1000591|
| 1232342|1000591|
+--------+-------+
only showing top 20 rows



In [15]:
artist_by_id.filter(col("id").isin(1092764, 1000311)).show()



+-------+--------------+
|     id|          name|
+-------+--------------+
|1000311| Steve Winwood|
|1092764|Winwood, Steve|
+-------+--------------+



                                                                                

In [16]:
broadcast(artist_alias)

DataFrame[artist: int, alias: int]

In [17]:
train_data = user_artist_df.join(artist_alias, 'artist', how="left") \
                            .withColumn('artist', when(col("alias").isNull(), col('artist')).otherwise(col('alias'))) \
                            .withColumn('artist', col('artist').cast(IntegerType())) \
                            .drop("alias")

In [18]:
train_data.cache()

DataFrame[artist: int, user: int, count: int]

In [19]:
from pyspark.ml.recommendation import ALS
model = ALS(rank=10, seed=0, maxIter=5, regParam=0.1, implicitPrefs=True, alpha=1.0, 
            userCol='user', itemCol='artist', ratingCol='count').fit(train_data)

22/02/14 15:16:45 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/02/14 15:16:45 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [20]:
model.userFactors.show(1, truncate = False)

+---+---------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                   |
+---+---------------------------------------------------------------------------------------------------------------------------+
|90 |[0.16020626, 0.20717518, -0.1719469, 0.06038466, 0.06272771, 0.54658705, -0.4048189, 0.43657345, -0.10396772, -0.042728323]|
+---+---------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [21]:
user_id = 2093760

In [22]:
existing_artist_ids = train_data.filter(col("user") == user_id).collect()

In [23]:
existing_artist_ids

[Row(artist=1180, user=2093760, count=1),
 Row(artist=1255340, user=2093760, count=3),
 Row(artist=378, user=2093760, count=1),
 Row(artist=813, user=2093760, count=2),
 Row(artist=942, user=2093760, count=7)]

In [24]:
existing_artist_ids = [i[0] for i in existing_artist_ids]

In [25]:
existing_artist_ids

[1180, 1255340, 378, 813, 942]

In [26]:
artist_by_id.filter(col('id').isin(existing_artist_ids)).show()



+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



                                                                                

In [27]:
user_subset = train_data.select('user').where(col('user') == user_id).distinct()

top_predictions = model.recommendForUserSubset(user_subset, 5)


In [28]:
top_predictions.show(truncate=False)



+-------+-----------------------------------------------------------------------------------------------------------------+
|user   |recommendations                                                                                                  |
+-------+-----------------------------------------------------------------------------------------------------------------+
|2093760|[{2814, 0.029410675}, {1300642, 0.028404653}, {1001819, 0.028333459}, {1007614, 0.027992856}, {4605, 0.02789098}]|
+-------+-----------------------------------------------------------------------------------------------------------------+



                                                                                

In [29]:
user_subset.show()

+-------+
|   user|
+-------+
|2093760|
+-------+



In [30]:
top_predictions_pandas = top_predictions.toPandas() 
print(top_predictions_pandas)

      user                                    recommendations
0  2093760  [(2814, 0.029410675168037415), (1300642, 0.028...


In [31]:
recommended_artist_ids = [i[0] for i in top_predictions_pandas.recommendations[0]]

In [32]:
recommended_artist_ids

[2814, 1300642, 1001819, 1007614, 4605]

In [33]:
artist_by_id.filter(col('id').isin(recommended_artist_ids)).show()

                                                                                

+-------+----------+
|     id|      name|
+-------+----------+
|   2814|   50 Cent|
|   4605|Snoop Dogg|
|1007614|     Jay-Z|
|1001819|      2Pac|
|1300642|  The Game|
+-------+----------+



                                                                                

In [34]:
def area_under_curve( positive_data,b_all_artist_IDs,predict_function):
    positiveUserProducts = positive_data.map(lambda r : (r.user, r.product))
    positivePredictions = predict_function(positive_data)

In [35]:
all_data = user_artist_df.join(broadcast(artist_alias), 'artist',how='left') \
            .withColumn('artist', when(col('alias').isNull(), col('artist')).otherwise(col('alias'))) \
            .withColumn('artist',col('artist').cast(IntegerType())) \
            .drop('alias')

In [36]:
train_data, cv_data = all_data.randomSplit([0.9, 0.1],seed=54321)

In [37]:
train_data.cache()

DataFrame[artist: int, user: int, count: int]

In [38]:
cv_data.cache()

DataFrame[artist: int, user: int, count: int]

In [39]:
all_artist_ids = all_data.select("artist").distinct()

In [40]:
all_artist_ids

DataFrame[artist: int]

In [41]:
b_all_artist_ids = broadcast(all_artist_ids)
model = ALS(rank=10, seed=0, maxIter=5, regParam=0.1,
            implicitPrefs=True, alpha=1.0, userCol='user', itemCol='artist',
              ratingCol='count') \
.fit(train_data)

                                                                                

In [42]:
cv_data.show()

[Stage 390:>                                                        (0 + 1) / 1]

+------+-------+-----+
|artist|   user|count|
+------+-------+-----+
|     1|  15434|   14|
|     1|1000020|   90|
|     1|1000056|   28|
|     1|1000185|    2|
|     1|1000202|    1|
|     1|1000275|   37|
|     1|1000533|    7|
|     1|1000611|  346|
|     1|1000708|    2|
|     1|1000772|   31|
|     1|1000786|    8|
|     1|1000855|    2|
|     1|1000903|   12|
|     1|1000943|   13|
|     1|1001095|    2|
|     1|1001126|   12|
|     1|1001215|   23|
|     1|1001251|    2|
|     1|1001359|    6|
|     1|1001534|    3|
+------+-------+-----+
only showing top 20 rows



                                                                                

In [43]:
area_under_curve(cv_data, b_all_artist_ids, model.transform)

AttributeError: 'DataFrame' object has no attribute 'map'

In [70]:
def area_under_curve( positive_data,b_all_artist_IDs,predict_function):
    columns = ['user', 'artist']
    positiveUserProducts = positive_data.rdd.map(lambda r: (r.user, r.artist)).toDF(columns)
    positiveUserProducts.show()
    positivePredictions = predict_function(positiveUserProducts).groupBy('user')
    
    
    

In [60]:
 positiveUserProducts = cv_data.rdd.map(lambda r: (r.user, r.artist)).toDF()

In [71]:
area_under_curve(cv_data, b_all_artist_ids, model.transform)

+-------+------+
|   user|artist|
+-------+------+
|  15434|     1|
|1000020|     1|
|1000056|     1|
|1000185|     1|
|1000202|     1|
|1000275|     1|
|1000533|     1|
|1000611|     1|
|1000708|     1|
|1000772|     1|
|1000786|     1|
|1000855|     1|
|1000903|     1|
|1000943|     1|
|1001095|     1|
|1001126|     1|
|1001215|     1|
|1001251|     1|
|1001359|     1|
|1001534|     1|
+-------+------+
only showing top 20 rows



AttributeError: 'GroupedData' object has no attribute 'collect'

In [51]:
cv_data.show()

+------+-------+-----+
|artist|   user|count|
+------+-------+-----+
|     1|  15434|   14|
|     1|1000020|   90|
|     1|1000056|   28|
|     1|1000185|    2|
|     1|1000202|    1|
|     1|1000275|   37|
|     1|1000533|    7|
|     1|1000611|  346|
|     1|1000708|    2|
|     1|1000772|   31|
|     1|1000786|    8|
|     1|1000855|    2|
|     1|1000903|   12|
|     1|1000943|   13|
|     1|1001095|    2|
|     1|1001126|   12|
|     1|1001215|   23|
|     1|1001251|    2|
|     1|1001359|    6|
|     1|1001534|    3|
+------+-------+-----+
only showing top 20 rows

