In [1]:
# Import Libraries
import os, sys, findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, Window as W

In [2]:
# Java 17 location on the original author's machine (Homebrew on macOS).
# The path is version-pinned, so it will not exist on other machines or after a JDK upgrade.
JAVA17_HOME = "/opt/homebrew/Cellar/openjdk@17/17.0.17/libexec/openjdk.jdk/Contents/Home"
# Override JAVA_HOME only when that JDK is really installed; otherwise keep whatever
# JAVA_HOME the environment already provides instead of pointing Spark at a missing directory.
if os.path.isdir(JAVA17_HOME):
    os.environ["JAVA_HOME"] = JAVA17_HOME

# Run the Spark driver and its Python workers on the same interpreter as this kernel
# (Python 3.11 in the original setup).
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Locate the Spark installation and make pyspark importable/configured for this process
findspark.init()

# Build (or reuse) a local PySpark session: 4 local worker threads, 10 GB driver memory,
# adaptive query execution enabled.
spark = (
    SparkSession.builder
    .appName("SpotifyRec")
    .master("local[4]")
    .config("spark.driver.memory", "10g")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Log only ERROR and above for cleaner cell output (hides INFO/WARN messages).
# Startup warnings emitted before this call are still printed.
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/06 02:10:20 WARN Utils: Your hostname, Ethans-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 100.64.14.129 instead (on interface en0)
25/11/06 02:10:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/06 02:10:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Directory that holds every parquet dataset, relative to the notebook's working directory
DATA_DIR = "parquet_data"


def load_table(name, data_dir=DATA_DIR):
    """Read a parquet dataset and register it as a temporary SQL view.

    Args:
        name: Dataset folder name inside ``data_dir``; also used as the temp-view name.
        data_dir: Directory containing the parquet dataset folders.

    Returns:
        The Spark DataFrame read from ``<data_dir>/<name>``.
    """
    df = spark.read.parquet(f"{data_dir}/{name}")
    # createOrReplaceTempView overwrites a same-named view, so the cell can be re-run
    df.createOrReplaceTempView(name)
    return df


# Interactions DataFrame
playlist_tracks = load_table("playlist_tracks")

# User DataFrame
playlists = load_table("playlists")

# Items DataFrame
tracks = load_table("tracks")

# Item Features - Track (used for model training)
track_features = load_table("track_features")

# Item Features - Artist (used for model training)
artist_features = load_table("artist_features")

# User Features (used for model training)
playlist_features = load_table("playlist_features")

# Item-User Interactions (used for model training)
edges = load_table("edges")

In [6]:
# Preview the first 5 rows of the artist features; long strings are truncated for display
artist_features.show(n=5, truncate=True)

+---+--------------------+--------------------+-------------------+----------+
|aid|          artist_uri|         artist_name|artist_playlist_cnt|artist_cnt|
+---+--------------------+--------------------+-------------------+----------+
|  1|spotify:artist:00...|        Jordan Colle|                  1|         2|
|  3|spotify:artist:00...|"Faron Young, Nat...|                  5|         5|
|  5|spotify:artist:00...|       Thug Brothers|                 14|        15|
|  6|spotify:artist:00...|       Darren Gibson|                  2|         3|
| 15|spotify:artist:00...|          Cuzzo Shay|                  1|         1|
+---+--------------------+--------------------+-------------------+----------+
only showing top 5 rows


In [7]:
# Preview the first 5 rows of the playlist features; long strings are truncated for display
playlist_features.show(n=5, truncate=True)

+------+---------+----------+-----------+----------+-----------------+-------------------+-------------+
|   pid|     name|num_tracks|num_artists|num_albums|playlist_duration|days_since_modified|collaborative|
+------+---------+----------+-----------+----------+-----------------+-------------------+-------------+
|298014|     frat|        67|         50|        61|         15303907|              533.0|        false|
|298015|Christian|        86|         25|        66|         23202573|               23.0|        false|
|298016|    Party|       178|        117|       152|         43705022|              158.0|        false|
|298017|   Chilll|       154|         92|       111|         37417246|               78.0|        false|
|298018|      RAP|        19|         18|        19|          4386013|              235.0|        false|
+------+---------+----------+-----------+----------+-----------------+-------------------+-------------+
only showing top 5 rows


In [8]:
# Preview the first 5 rows of the track features; long strings are truncated for display
track_features.show(n=5, truncate=True)

+---+--------------------+------------+--------------------+--------------------+--------------+---------+
|tid|          artist_uri| artist_name|           album_uri|          album_name|track_duration|track_cnt|
+---+--------------------+------------+--------------------+--------------------+--------------+---------+
|  0|spotify:artist:5k...|Cherryholmes|spotify:album:3SP...|        Cherryholmes|        161186|        1|
|  2|spotify:artist:2j...| Zach Farlow|spotify:album:0UH...|  The Great Escape 2|        222727|        2|
| 11|spotify:artist:1K...|        Kaïn|spotify:album:0u6...| Nulle part ailleurs|        189106|        1|
| 14|spotify:artist:2s...|   Mike Love|spotify:album:4jC...|The Change I'm Se...|        357573|        9|
| 18|spotify:artist:4K...|      R.E.M.|spotify:album:4eo...|               Green|        193547|        1|
+---+--------------------+------------+--------------------+--------------------+--------------+---------+
only showing top 5 rows
