In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [2]:
ls -l # list the working directory to confirm the Spark .tgz archive is there

total 391120
drwxr-xr-x 1 root root      4096 Jan 11 17:02 sample_data/
-rw-r--r-- 1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz
-rw-r--r-- 1 root root    106270 Jan 16 07:51 spotify-2023.csv


In [3]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [4]:
!pip install -q findspark

In [5]:
import os

# Export the locations of the JVM and of the unpacked Spark distribution, and
# make PySpark launch against a local master using all available cores.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
        "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
    }
)

In [6]:
import findspark

# Called without arguments, findspark locates Spark through the SPARK_HOME
# environment variable exported earlier. The previous relative directory name
# only resolved when the notebook's working directory happened to be /content
# and duplicated the path already stored in SPARK_HOME.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a local session that uses every available core
spark = (
    SparkSession.builder
    .appName("Window Partitioning")
    .master("local[*]")
    .getOrCreate()
)

# Show the Spark version as the cell output to confirm the session is up
spark.version

'3.5.0'

In [7]:
# Display the active SparkSession as the cell output
spark

In [8]:
# Enable Arrow-based columnar transfers to speed up Spark <-> pandas conversion.
# The generic "spark.sql.execution.arrow.enabled" key has been deprecated since
# Spark 3.0; the PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [9]:
# Import every Spark SQL function into the notebook namespace.
# NOTE: the star import shadows Python builtins of the same name; the bare
# `max` / `min` used in later cells are the Spark column aggregates, not the
# Python builtins. Other cells rely on these unqualified names (col, max, min,
# countDistinct, row_number, monotonically_increasing_id).
from pyspark.sql.functions import *

In [57]:
# Read the raw Spotify 2023 CSV: the first row is the header and fields are
# comma-separated. No schema is supplied here; numeric columns are cast later.
dataset_df = (
    spark.read
    .options(header="true", delimiter=",")
    .csv("/content/spotify-2023.csv")
)

# Add a generated `id` column used as the join key between derived frames.
# monotonically_increasing_id() yields unique, increasing values; they are
# not guaranteed to be consecutive.
spotify_df = dataset_df.withColumn("id", monotonically_increasing_id())
spotify_df.show(n=10)

+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+---+
|          track_name|      artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|  streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm|key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%| id|
+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+-----

In [None]:
# Print the column names and data types of the loaded frame
spotify_df.printSchema()

In [56]:
# Split the raw frame into three narrower frames that all keep the generated
# `id` column so they can be joined back together.

def _to_long(name):
    """Return column `name` cast to long, tolerating thousands separators.

    A plain cast turns a string such as "1,234" into null. Stripping commas
    first keeps such values and is a no-op for values without separators.
    NOTE(review): confirm whether the CSV actually contains separated values;
    the previewed rows do not show any.
    """
    return regexp_replace(col(name), ",", "").cast("long").alias(name)


def _to_int(name):
    """Return column `name` cast to int, keeping its original name."""
    return col(name).cast("int").alias(name)


# Song identity: title, artist string, number of artists and release year
songs_df = spotify_df.select(
    'id',
    'track_name',
    'artist(s)_name',
    _to_int('artist_count'),
    col('released_year').cast('long'),
)

# Stream counts and playlist / chart presence per platform
platforms_df = spotify_df.select(
    'id',
    *[
        _to_long(name)
        for name in (
            'streams',
            'in_spotify_playlists',
            'in_spotify_charts',
            'in_apple_playlists',
            'in_apple_charts',
            'in_deezer_playlists',
            'in_deezer_charts',
            'in_shazam_charts',
        )
    ],
)

# Audio characteristics: tempo, key, mode and the percentage-valued features
characteristics_df = spotify_df.select(
    'id',
    _to_int('bpm'),
    'key',
    'mode',
    *[
        _to_int(name)
        for name in (
            'danceability_%',
            'valence_%',
            'energy_%',
            'acousticness_%',
            'instrumentalness_%',
            'liveness_%',
            'speechiness_%',
        )
    ],
)

In [58]:
# Preview the first 10 rows of the song-identity frame
songs_df.show(n=10)

+---+--------------------+--------------------+------------+-------------+
| id|          track_name|      artist(s)_name|artist_count|released_year|
+---+--------------------+--------------------+------------+-------------+
|  0|Seven (feat. Latt...|    Latto, Jung Kook|           2|         2023|
|  1|                LALA|         Myke Towers|           1|         2023|
|  2|             vampire|      Olivia Rodrigo|           1|         2023|
|  3|        Cruel Summer|        Taylor Swift|           1|         2019|
|  4|      WHERE SHE GOES|           Bad Bunny|           1|         2023|
|  5|            Sprinter|   Dave, Central Cee|           2|         2023|
|  6|     Ella Baila Sola|Eslabon Armado, P...|           2|         2023|
|  7|            Columbia|             Quevedo|           1|         2023|
|  8|            fukumean|               Gunna|           1|         2023|
|  9|     La Bebe - Remix|Peso Pluma, Yng L...|           2|         2023|
+---+--------------------

In [59]:
# Preview the first 10 rows of the per-platform counts frame
platforms_df.show(n=10)

+---+---------+--------------------+-----------------+------------------+---------------+-------------------+----------------+----------------+
| id|  streams|in_spotify_playlists|in_spotify_charts|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|
+---+---------+--------------------+-----------------+------------------+---------------+-------------------+----------------+----------------+
|  0|141381703|                 553|              147|                43|            263|                 45|              10|             826|
|  1|133716286|                1474|               48|                48|            126|                 58|              14|             382|
|  2|140003974|                1397|              113|                94|            207|                 91|              14|             949|
|  3|800840817|                7858|              100|               116|            207|                125|              12|          

In [60]:
# Preview the first 10 rows of the audio-characteristics frame
characteristics_df.show(n=10)

+---+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+
| id|bpm|key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|
+---+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+
|  0|125|  B|Major|            80|       89|      83|            31|                 0|         8|            4|
|  1| 92| C#|Major|            71|       61|      74|             7|                 0|        10|            4|
|  2|138|  F|Major|            51|       32|      53|            17|                 0|        31|            6|
|  3|170|  A|Major|            55|       58|      72|            11|                 0|        11|           15|
|  4|144|  A|Minor|            65|       23|      80|            14|                63|        11|            6|
|  5|141| C#|Major|            92|       66|      58|            19|                 0|         

In [None]:
# Songs released in 2023: print how many there are, then preview them.
# (The cell previously only previewed rows although it is meant to report the
# number of songs, as the following cell does for the other years.)
songs_2023 = songs_df.filter(col('released_year') == 2023)
print(songs_2023.count())
songs_2023.show()

In [None]:
# Count the songs whose release year is anything other than 2023
songs_Not2023 = (
    songs_df
    .where(col('released_year') != 2023)
    .count()
)
print(songs_Not2023)

In [None]:
# Top 20 most-streamed songs: attach the stream counts to each song through
# the shared `id` key and sort from the highest stream count downwards.
highest_streams_df = (
    songs_df
    .join(platforms_df, on='id', how='left')
    .select('id', 'track_name', 'artist(s)_name', 'released_year', 'streams')
    .orderBy(col('streams').desc())
)

highest_streams_df.limit(20).show()

In [None]:
# Top 20 most-streamed songs among those released in 2023, taken from the
# stream-ordered frame built in the previous cell
highest_streams2023_df = highest_streams_df.where(col('released_year') == 2023)
TOP20_2023 = highest_streams2023_df.limit(20)
TOP20_2023.show()

In [None]:
# Highest and lowest stream counts in the dataset.
# `max` / `min` here are the Spark aggregates brought in by the star import.
max_min_df = platforms_df.agg(
    max('streams').alias('Max_streams'),
    min('streams').alias('Min_streams'),
)
max_min_df.show()

In [None]:
# Span of release years covered by the dataset (newest and oldest year).
# `max` / `min` here are the Spark aggregates brought in by the star import.
range_year_df = songs_df.agg(
    max('released_year').alias('Year_newest'),
    min('released_year').alias('Year_oldest'),
)
range_year_df.show()

In [None]:
# Number of distinct track names for each value of `artist(s)_name`.
# The grouping key is the full artist string, so a collaboration such as
# "Latto, Jung Kook" forms its own group.
distinct_tracks = countDistinct(col('track_name')).alias("songs_published")
songs_artist_df = songs_df.groupBy('artist(s)_name').agg(distinct_tracks)
songs_artist_df.show()

In [None]:
# Five artist entries with the most published songs ...
max_songs = (
    songs_artist_df
    .orderBy(col('songs_published').desc())
    .limit(5)
    .withColumnRenamed('artist(s)_name', 'more_songs')
)

# ... and five with the fewest
min_songs = (
    songs_artist_df
    .orderBy(col('songs_published').asc())
    .limit(5)
    .withColumnRenamed('artist(s)_name', 'fewer_songs')
)

max_songs.show()
min_songs.show()


In [54]:
# import library
from pyspark.sql.window import Window

In [None]:
# Number each artist's songs from the most recently released to the oldest.
# row_number() assigns 1, 2, 3, ... inside every artist partition, so
# `rank_years` is a per-song position within the artist, not a count of years.
byArtist = Window.partitionBy('artist(s)_name').orderBy(col('released_year').desc())
historic_year_df = songs_df.withColumn('rank_years', row_number().over(byArtist))
historic_year_df.show()

# Number of distinct years in which each artist has published a song
years_per_artist_df = (
    songs_df
    .groupBy('artist(s)_name')
    .agg(countDistinct(col('released_year')).alias('years_published'))
)
years_per_artist_df.show()
