In [91]:
# Importing necessary libraries and settings
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format ='retina'
import random
from functools import reduce
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

from kafka import KafkaProducer
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
    ArrayType,
)
from pyspark.ml.classification import RandomForestClassifier

In [92]:
packages = {
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0,org.neo4j:neo4j-connector-apache-spark_2.12:5.0.2_for_spark_3"
}

In [93]:
cid = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
redirect_uri='http://localhost:7777/callback'
username = 'vass.zora'

In [94]:
# Once the Authorisation is complete, we just need to `sp` to call the APIs
scope = 'user-top-read user-read-private playlist-modify-private playlist-modify-public user-read-currently-playing'
token = util.prompt_for_user_token(username, scope, client_id=cid, client_secret=secret, redirect_uri=redirect_uri)

if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

In [5]:
# Creating a function to get tracks IDs from a playlist
def get_playlist_tracks(username,playlist_id):
    results = sp.user_playlist_tracks(username,playlist_id)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

In [6]:
# Getting playlist IDs from each of Spotify's playlists #TODO add to method
playlists = sp.user_playlists(username)
spotify_playlist_ids = []
while playlists:
    for i, playlist in enumerate(playlists['items']):
        spotify_playlist_ids.append(playlist['uri'][-22:])
    if playlists['next']:
        playlists = sp.next(playlists)
    else:
        playlists = None

In [97]:
current_tracks = sp.current_user_playing_track()

In [106]:
playlist_id_mood_dict = {
    'sad': '37i9dQZF1DX3rxVfibe1L0',
    'happy': '37i9dQZF1DX4uPi2roRUwU',
    'chill': '37i9dQZF1DWWQRwui0ExPn',
    'angry': '37i9dQZF1DX3ND264N08pv',
    'romantic': '37i9dQZF1DX7rOY2tZUw1k',
}

In [112]:
moods = ['sad', 'happy', 'chill', 'angry', 'romantic']

# Getting tracks from each playlist
tracks = []
audio_features = []
for mood in moods:
    current_tracks = get_playlist_tracks(username, playlist_id_mood_dict[mood])
    tracks.append(current_tracks)
    for track in current_tracks:
        current_audio = sp.audio_features(track['track']['id'])[0]
        current_audio['mood'] = moods.index(mood)
        audio_features.append(current_audio)

In [9]:
spark = (
    SparkSession.builder.appName("Final assignment")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

23/05/31 09:47:45 WARN Utils: Your hostname, HP-Elite830 resolves to a loopback address: 127.0.1.1; using 192.168.1.18 instead (on interface wlp1s0)
23/05/31 09:47:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/vaszo/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/vaszo/.ivy2/cache
The jars for the packages stored in: /home/vaszo/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-165d07cb-ede7-4f92-9e99-58a295385e55;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in local-m

23/05/31 09:47:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [10]:
producer = KafkaProducer(bootstrap_servers="localhost:9092")
spark.sparkContext.setLogLevel("ERROR")

In [63]:
producer.send("tracks_topic", b"")
producer.send("number_of_clusters", b"")
producer.send("audio_features_topic", b"")

<kafka.producer.future.FutureRecordMetadata at 0x7fdad1366080>

In [12]:
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "tracks_topic, number_of_clusters, audio_features_topic")
    .option("startingOffsets", "latest")
    .load()
)

In [101]:
tracks[0]

{'added_at': '2019-09-08T19:03:51Z',
 'added_by': {'external_urls': {'spotify': 'https://open.spotify.com/user/vass.zora'},
  'href': 'https://api.spotify.com/v1/users/vass.zora',
  'id': 'vass.zora',
  'type': 'user',
  'uri': 'spotify:user:vass.zora'},
 'is_local': False,
 'primary_color': None,
 'track': {'album': {'album_type': 'album',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6sHCvZe1PHrOAuYlwTLNH4'},
     'href': 'https://api.spotify.com/v1/artists/6sHCvZe1PHrOAuYlwTLNH4',
     'id': '6sHCvZe1PHrOAuYlwTLNH4',
     'name': 'Gus Dapperton',
     'type': 'artist',
     'uri': 'spotify:artist:6sHCvZe1PHrOAuYlwTLNH4'}],
   'available_markets': [],
   'external_urls': {'spotify': 'https://open.spotify.com/album/4qSYIc2q02zeCN2fB2hBBR'},
   'href': 'https://api.spotify.com/v1/albums/4qSYIc2q02zeCN2fB2hBBR',
   'id': '4qSYIc2q02zeCN2fB2hBBR',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab67616d0000b27361d8be1be46d21609f68d476',


In [82]:
tracks_schema = StructType(
    [
        StructField("id", StringType()),
        StructField("name", StringType()),
        StructField("artists", ArrayType(
            StructType([
                StructField("id", StringType()),
                StructField("name", StringType())
            ])
        )),
        StructField("duration_ms", StringType())
    ]
)       


number_of_clusters_schema = StructType(
    [
        StructField("K", IntegerType())
    ]
)

audio_features_schema = StructType(
    [
        StructField("id", StringType(), True),
        StructField("danceability", FloatType(), True),
        StructField("energy", FloatType(), True),
        StructField("key", IntegerType(), True),
        StructField("loudness", FloatType(), True),
        StructField("mode", IntegerType(), True),
        StructField("speechiness", FloatType(), True),
        StructField("acousticness", FloatType(), True),
        StructField("instrumentalness", FloatType(), True),
        StructField("liveness", FloatType(), True),
        StructField("valence", FloatType(), True),
        StructField("tempo", FloatType(), True),
    ]
)


In [102]:
37i9dQZF1DX7qK8ma5wgG1

{'danceability': 0.663,
 'energy': 0.623,
 'key': 11,
 'loudness': -5.283,
 'mode': 1,
 'speechiness': 0.0239,
 'acousticness': 0.14,
 'instrumentalness': 0,
 'liveness': 0.0918,
 'valence': 0.773,
 'tempo': 81.513,
 'type': 'audio_features',
 'id': '227bXIqHWP8Z7gycUGO1sY',
 'uri': 'spotify:track:227bXIqHWP8Z7gycUGO1sY',
 'track_href': 'https://api.spotify.com/v1/tracks/227bXIqHWP8Z7gycUGO1sY',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/227bXIqHWP8Z7gycUGO1sY',
 'duration_ms': 173742,
 'time_signature': 4}

In [83]:
tracks_stream = (
    df.selectExpr("CAST(value AS STRING)")
    .filter("topic = 'tracks_topic'")
    .select(from_json("value", tracks_schema).alias("data"))
    .select("data.*")
)

number_of_clusters_stream = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json("value", number_of_clusters_schema).alias("data"))
    .select("data.*")
)

audio_features_stream = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json("value", audio_features_schema).alias("data"))
    .select("data.*")
)

In [84]:
tracks_query = (
    tracks_stream.writeStream.format("memory")
    .queryName("tracks")
    .start()
)

number_of_clusters_query = (
    number_of_clusters_stream.writeStream.format("memory")
    .queryName("number_of_clusters")
    .start()
)

audio_features_query = (
    audio_features_stream.writeStream.format("memory")
    .queryName("audio_features")
    .start()
)

In [81]:
# tracks_query.stop()
# number_of_clusters_query.stop()
# audio_features_query.stop()

In [85]:
for track in tracks:
    producer.send("tracks_topic", json.dumps(track['track']).encode("utf-8"))

for audio in audio_features:
    producer.send("audio_features_topic", json.dumps(audio).encode("utf-8"))

                                                                                

In [45]:
tracks_spark = spark.sql("select * from tracks")

In [90]:
tracks_spark.show(15, False)

+----------------------+-----------------------------+-------------------------------------------+-----------+
|id                    |name                         |artists                                    |duration_ms|
+----------------------+-----------------------------+-------------------------------------------+-----------+
|227bXIqHWP8Z7gycUGO1sY|Coax & Botany                |[{6sHCvZe1PHrOAuYlwTLNH4, Gus Dapperton}]  |173742     |
|3mJcEwAZ7GgPmppR6LvAdp|Sockboy                      |[{6sHCvZe1PHrOAuYlwTLNH4, Gus Dapperton}]  |247029     |
|2CNM0Q27eYpDRo5GTVTNYM|Matador                      |[{1CMml5seBEaxQzlmaGxMPx, The Buttertones}]|199360     |
|6X3FZtz4cKU2MKSQlGG9ZG|Bags                         |[{3l0CmX0FuQjFxr8SK7Vqag, Clairo}]         |260519     |
|7zI1PDpYJ1yEyAINS604Zs|Humongous                    |[{2D4FOOOtWycb3Aw9nY5n3c, Declan McKenna}] |321680     |
|6BqWhxll86CGGE6WxgdRqG|Golden Skans                 |[{2qlAMLpUyBjZgnzuFXXZXI, Klaxons}]        |165120     |
|

In [86]:
audio_features_spark = spark.sql("select * from audio_features")

In [89]:
audio_features_spark.show(15, False)

+----------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+
|id                    |danceability|energy|key |loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |
+----------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+
|227bXIqHWP8Z7gycUGO1sY|null        |null  |null|null    |null|null       |null        |null            |null    |null   |null   |
|227bXIqHWP8Z7gycUGO1sY|0.663       |0.623 |11  |-5.283  |1   |0.0239     |0.14        |0.0             |0.0918  |0.773  |81.513 |
|3mJcEwAZ7GgPmppR6LvAdp|0.651       |0.719 |6   |-3.38   |0   |0.0334     |0.109       |0.0             |0.129   |0.4    |117.01 |
|2CNM0Q27eYpDRo5GTVTNYM|0.462       |0.837 |4   |-4.513  |0   |0.0365     |0.00169     |0.0182          |0.103   |0.582  |146.943|
|6X3FZtz4cKU2MKSQlGG9ZG|0.742       |0.546 |1   |-7.694  |1   |0.0315     |0.172   