In [None]:
import findspark
findspark.init()
import random
import pyspark

In [None]:
import os

# Spark configuration for this notebook.
# The original settings were elided from the notebook and left a bare
# `config = ` assignment, which is a SyntaxError. Start from an empty SparkConf
# so the cell runs on a fresh kernel.
# TODO: re-add any cluster-specific settings (master, memory, ...) with config.set(key, value).
config = pyspark.SparkConf()

# Reuse an already-running session if there is one, otherwise start a new one.
ss = pyspark.sql.SparkSession.builder.config(conf=config).getOrCreate()
# Low-level SparkContext, used below for sc.parallelize(...).
sc = ss.sparkContext

# Data preprocessing - one file (playlist and audio)

In [None]:
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql.types import *



In [None]:
# Input files for this run.
# audio_file: JSON that is later loaded and indexed by 'uri' plus audio-feature keys.
audio_file = '/scratch/ISE495/2020_project_03/team-3/spotify.json'
# file_path: one playlist slice; its 'playlists' key is read below.
file_path = '/scratch/ISE495/2020_project_03/team-3/mpd.slice.332000-332999.json'

In [None]:
# Load the playlist slice. The context manager closes the file handle as soon
# as parsing is done (the bare open() used before was never closed), and the
# encoding is explicit because JSON text is UTF-8 regardless of the locale.
with open(file_path, encoding='utf-8') as playlist_file:
    data = json.load(playlist_file)

# One row per playlist; columns are the keys of each playlist record.
DF = pd.DataFrame.from_dict(data['playlists'])

In [None]:
# Column name / Spark type pairs for the playlist table, in column order.
# 'tracks' is an array of string->string maps, so every per-track value
# (including 'pos' and 'duration_ms') is carried as a string.
_playlist_columns = [
    ('name', StringType()),
    ('collaborative', StringType()),
    ('pid', StringType()),
    ('modified_at', IntegerType()),
    ('num_tracks', IntegerType()),
    ('num_albums', IntegerType()),
    ('num_followers', IntegerType()),
    ('tracks', ArrayType(MapType(StringType(), StringType()))),
    ('num_edits', IntegerType()),
    ('duration_ms', IntegerType()),
    ('num_artists', IntegerType()),
    ('description', StringType()),
]
schema = StructType(
    [StructField(column, dtype) for column, dtype in _playlist_columns]
)

# Convert the pandas frame into a Spark DataFrame with the explicit schema.
DF2 = ss.createDataFrame(DF, schema)

In [None]:
# Work with plain dictionaries instead of Row objects: one dict per playlist,
# keyed by the schema's column names.
RDD = DF2.rdd.map(lambda row: row.asDict())
# Peek at the first playlist to sanity-check the structure.
RDD.take(1)

[{'name': 'cg',
  'collaborative': 'false',
  'pid': '332000',
  'modified_at': 1508284800,
  'num_tracks': 116,
  'num_albums': 84,
  'num_followers': 1,
  'tracks': [{'duration_ms': '194893',
    'artist_uri': 'spotify:artist:1dID9zgn0OV0Y8ud7Mh2tS',
    'artist_name': 'Dustin Lynch',
    'pos': '0',
    'album_name': 'Current Mood',
    'track_uri': 'spotify:track:7pxhKtuTwofDIdgHx2DcVK',
    'album_uri': 'spotify:album:23cuZhPWDfX1uKD4qwuv7t',
    'track_name': "Seein' Red"},
   {'duration_ms': '199746',
    'artist_uri': 'spotify:artist:1n2pb9Tsfe4SwAjmUac6YT',
    'artist_name': 'Jake Owen',
    'pos': '1',
    'album_name': 'American Love',
    'track_uri': 'spotify:track:0O1x2tRm8ZpfDbcpOWZp7z',
    'album_uri': 'spotify:album:5gsWgFeHRxRkIXGXWPiOIW',
    'track_name': 'American Country Love Song'},
   {'duration_ms': '202346',
    'artist_uri': 'spotify:artist:4MoAOfV4ROWofLG3a3hhBN',
    'artist_name': 'Jon Pardi',
    'pos': '2',
    'album_name': 'California Sunrise',
    '

In [None]:
# Load the audio-feature file. The context manager guarantees the handle is
# closed (the bare open() used before was never closed), and the encoding is
# explicit because JSON text is UTF-8 regardless of the locale.
with open(audio_file, encoding='utf-8') as audio_fh:
    audio_data = json.load(audio_fh)

# Only the values of the top-level mapping are kept. As the output of this cell
# shows, the frame has a single column whose cells are the per-track feature
# dicts; the next cell relies on that one-cell-per-row layout.
audioDF = pd.DataFrame.from_dict(audio_data.values())
audioDF.head()

Unnamed: 0,0
0,"{'danceability': 0.651, 'energy': 0.808, 'key'..."
1,"{'danceability': 0.529, 'energy': 0.891, 'key'..."
2,"{'danceability': 0.563, 'energy': 0.688, 'key'..."
3,"{'danceability': 0.583, 'energy': 0.862, 'key'..."
4,"{'danceability': 0.652, 'energy': 0.719, 'key'..."


In [None]:
# Build (track_uri, (danceability, energy, key, loudness, mode, speechiness,
# acousticness, instrumentalness, liveness, valence, tempo)) pairs and drop
# tracks whose feature record is missing.

# Feature keys, in the order they appear in every feature tuple downstream.
AUDIO_FEATURES = (
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
)

audioRDD = (
    sc.parallelize(audioDF.values)
      # Each row of audioDF.values is a one-element array; unwrap it first so
      # the missing-value test runs on the record itself rather than relying on
      # element-wise `!= None` and the truthiness of a one-element array.
      .map(lambda cell: cell.item())
      .filter(lambda record: record is not None)
      .map(lambda record: (record['uri'],
                           tuple(record[name] for name in AUDIO_FEATURES)))
)
audioRDD.take(5)

[('spotify:track:7pxhKtuTwofDIdgHx2DcVK',
  (0.651, 0.808, 1, -4.731, 0, 0.0435, 0.054, 0, 0.0482, 0.858, 100.993)),
 ('spotify:track:0O1x2tRm8ZpfDbcpOWZp7z',
  (0.529, 0.891, 8, -4.36, 1, 0.043, 0.0457, 0, 0.0942, 0.329, 104.031)),
 ('spotify:track:4ly1QBXEwYoDmje9rmEgC4',
  (0.563, 0.688, 7, -4.474, 1, 0.0262, 0.0763, 0, 0.0863, 0.622, 108.008)),
 ('spotify:track:7DYswEnZFL6SKaNLTxBf5b',
  (0.583, 0.862, 0, -5.804, 1, 0.0337, 0.00492, 0, 0.12, 0.671, 112.953)),
 ('spotify:track:6p5GuyEqQH5pv02ouEyogQ',
  (0.652, 0.719, 1, -7.251, 1, 0.033, 0.0353, 0, 0.108, 0.24, 94.969))]

# k-means clustering

In [None]:
# Divide tempo by 100, then scale each track's feature vector to unit L2 norm.
def _scale_and_normalize(features):
    """Return the feature vector with tempo/100, divided by its Euclidean norm.

    `features` is the 11-element tuple built for audioRDD, with tempo as the
    last element. The norm is computed once per track instead of once per
    element as before.
    """
    scaled = list(features[:-1]) + [features[-1] / 100]
    norm = np.linalg.norm(scaled)
    return [value / norm for value in scaled]


# mapValues keeps the track URI key and transforms only the feature vector.
audio_normal = audioRDD.mapValues(_scale_and_normalize)
audio_normal.take(1)

[('spotify:track:7pxhKtuTwofDIdgHx2DcVK',
  [0.12712933128506923,
   0.15778878598822726,
   0.19528315097552876,
   -0.9238845872652266,
   0.0,
   0.008494817067435502,
   0.010545290152678553,
   0.0,
   0.009412647877020487,
   0.16755294353700367,
   0.19722231266471577])]

In [None]:
# Train k-means on the normalized feature vectors (keys dropped).
# The seed is fixed because the cluster-index -> genre table defined further
# down is hard-coded: without a seed the initialisation is random, so cluster
# numbering can change between runs and silently mislabel every track.
# NOTE(review): the existing genre table was derived from an unseeded run;
# re-inspect model.centers once and adjust the table to match this seed.
model = KMeans.train(audio_normal.values(), k=6, maxIterations=10, seed=42)

In [None]:
# Show the six centroids; the genre labels below were chosen by reading them.
model.clusterCenters

[array([ 0.050935  ,  0.04717649,  0.39211685, -0.89819687,  0.0542622 ,
         0.00700995,  0.0358603 ,  0.01055233,  0.01709011,  0.03934639,
         0.10571398]),
 array([ 0.06023995,  0.08459159,  0.89296171, -0.38999671,  0.05690263,
         0.01013917,  0.01092545,  0.00481058,  0.02239812,  0.05459326,
         0.1275338 ]),
 array([ 0.06964367,  0.07085994,  0.07801829, -0.96426545,  0.0873731 ,
         0.01063293,  0.03298074,  0.01054345,  0.02278953,  0.0539134 ,
         0.1403669 ]),
 array([ 0.06202286,  0.07563195,  0.78021323, -0.58672384,  0.06282637,
         0.00949643,  0.0166931 ,  0.0054191 ,  0.02080222,  0.05287167,
         0.12735163]),
 array([ 0.12479396,  0.17485432,  0.25770133, -0.84472944,  0.17483078,
         0.02195923,  0.01953128,  0.00943669,  0.04618833,  0.11333947,
         0.27480351]),
 array([ 0.06115399,  0.06618157,  0.62185815, -0.75268852,  0.0606882 ,
         0.00909758,  0.02559963,  0.00761901,  0.02039547,  0.05047618,
         

In [None]:
# Genre label per cluster index, chosen by inspecting the centroids:
#   chill: acoustic and sad, rock: loud, classical: instrumental and live,
#   pop: everything else, hip-hop: speechy, r-n-b: quieter than pop
_genre_labels = ['chill', 'rock', 'classical', 'pop', 'hip-hop', 'r-n-b']
genre = dict(enumerate(_genre_labels))

# track_genre[track_uri] = genre label of the cluster the track is assigned to.
# The result is collected to the driver as a plain dict.
track_genre = audio_normal.mapValues(
    lambda features: genre[model.predict(features)]
).collectAsMap()

In [None]:
# Define a playlist's genre as the most frequent genre among its tracks.
# Playlists with any track lacking a genre (missing audio features) are removed,
# as are playlists with no tracks at all.
# Result: pid_genre[pid] = genre
def _most_common_genre(track_genres):
    """Return the most frequent label in a non-empty list of genre labels.

    Ties go to the label that appears first in the list, so the result is
    deterministic (the earlier max(set(...)) form broke ties by set order).
    """
    return Counter(track_genres).most_common(1)[0][0]


pid_genre = (
    RDD.map(lambda playlist: (playlist['pid'],
                              [track_genre.get(track['track_uri'])
                               for track in playlist['tracks']]))
       # Keep only playlists that have tracks and whose tracks all have a genre.
       .filter(lambda pair: pair[1] and None not in pair[1])
       .mapValues(_most_common_genre)
       .collectAsMap()
)

# Recommend songs of the same genre with sp.recommendations

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
# Spotify API credentials are read from the environment rather than committed
# to the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel; a missing variable fails fast with a KeyError.
# NOTE: the id/secret pair previously hard-coded here has been exposed in
# source control and should be rotated in the Spotify developer dashboard.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']

# Client-credentials flow: app-level access, no user login.
auth = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(auth_manager=auth)

In [None]:
# define function to get recommended tracks in (song, performers) format
def rec_track(pid, limit=10):
    """Recommend tracks for a playlist based on its assigned genre.

    Parameters
    ----------
    pid : str
        Playlist id; must be a key of the module-level ``pid_genre`` mapping.
    limit : int, optional
        Number of recommendations to request (default 10).

    Returns
    -------
    list of (str, list of str)
        One ``(song_name, [performer_name, ...])`` tuple per recommended track.

    Raises
    ------
    KeyError
        If no genre was assigned to ``pid`` (unknown playlist, or one dropped
        because some of its tracks had no genre).
    """
    if pid not in pid_genre:
        raise KeyError(
            'no genre assigned to playlist {!r}; it is unknown or was dropped '
            'because some of its tracks lack audio features'.format(pid)
        )

    # The playlist's genre is the only seed passed to the recommendation call.
    rec = sp.recommendations(seed_genres=[pid_genre[pid]], limit=limit)

    return [
        (track['name'], [performer['name'] for performer in track['artists']])
        for track in rec['tracks']
    ]

In [None]:
# Example: fetch ten genre-based recommendations for playlist 332000.
rec_track(pid='332000')

[('MIA (feat. Drake)', ['Bad Bunny', 'Drake']),
 ('The Cure', ['Lady Gaga']),
 ('I Don’t Wanna Live Forever (Fifty Shades Darker)',
  ['ZAYN', 'Taylor Swift']),
 ('Fake Love', ['Drake']),
 ('Here', ['Alessia Cara']),
 ('Faded', ['Alan Walker']),
 ('Electric (feat. Khalid)', ['Alina Baraz', 'Khalid']),
 ('Say You Love Me', ['Jessie Ware']),
 ('Trumpets', ['Jason Derulo']),
 ('Candy Paint', ['Post Malone'])]

In [None]:
# Shut Spark down at the end of the notebook: context first, then session.
for _spark_handle in (sc, ss):
    _spark_handle.stop()