In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

import spotipy
from spotipy import SpotifyOAuth

from sklearn.feature_extraction.text import CountVectorizer

import settings


In [3]:
# Client ID and Client Secret is called and used
spotify_client_id = settings.SPOTIFY_CLIENT_ID
spotify_client_secret = settings.SPOTIFY_CLIENT_SECRET
spotify_redirect_uri = "http://localhost:5000/callback"

In [4]:
# provide scope of access for Python which requests access from the current user
scope =  'user-library-read user-top-read user-read-private playlist-read-private playlist-read-collaborative playlist-modify-public playlist-modify-private'

# authentication manager ensures secured point of access from spotify to python to client (me)
auth_manager = SpotifyOAuth(
                            scope = scope, 
                            client_id = spotify_client_id,
                            client_secret = spotify_client_secret,
                            redirect_uri = spotify_redirect_uri
                            )

# initial call of spotify using authentication manager
sp = spotipy.Spotify(auth_manager = auth_manager)

In [5]:
liked_songs = []
offset = 0
users_liked = sp.current_user_saved_tracks(limit = 50)

while users_liked["next"] != None:
    liked_songs.extend(users_liked["items"])
    offset+=50
    users_liked = sp.current_user_saved_tracks(offset=offset,limit = 50)

liked_songs.extend(users_liked["items"])

In [6]:
len(liked_songs)

1830

In [7]:
df = pd.DataFrame(list(map(lambda x: x["track"],liked_songs)))

In [8]:
unique_artists = df['artists'].apply(lambda x: x[0]['id']).drop_duplicates().reset_index(drop = True).tolist()

In [9]:
# sp.artists(unique_artists[:50])

In [10]:
len(unique_artists)

719

In [11]:
unique_artists_copy = unique_artists
last = len(unique_artists)%50

artists = [sp.artists(unique_artists[x:x+50])['artists'] if len(unique_artists)-x >= 50 else sp.artists(unique_artists[-last:]) for x in range(0,len(unique_artists),50)]



In [12]:
len(artists)

15

In [13]:
collapse_artist = [item for sublist in artists for item in sublist]
collapse_artist.extend(sp.artists(unique_artists[-last:])['artists'])

In [14]:
collapse_artist_notnull = list(map(lambda x : x if type(x) is dict else None, collapse_artist))
artists = [i for i in collapse_artist_notnull if i is not None]

In [15]:
artists_df = pd.DataFrame(artists)

In [16]:
artists_df.columns

Index(['external_urls', 'followers', 'genres', 'href', 'id', 'images', 'name',
       'popularity', 'type', 'uri'],
      dtype='object')

In [17]:
unique_genres = artists_df[artists_df['genres'].apply(lambda x: len(x) > 0)]['genres'].explode().drop_duplicates().tolist()

In [18]:
# CountVectorizer(input =  vocabulary=unique_genres)

artists_df['total_followers'] = artists_df['followers'].apply(lambda x: x['total'])

In [19]:
artists_df['cnt_vectorizer'] = artists_df['genres'].apply(lambda x: CountVectorizer(input=x, vocabulary=unique_genres))

In [20]:
unique_genres

['k-rap',
 'bedroom pop',
 'rochester mn indie',
 'lo-fi rap',
 'pop',
 'brostep',
 'canadian electronic',
 'edm',
 'electra',
 'electro house',
 'east coast hip hop',
 'hip hop',
 'rap',
 'afrofuturism',
 'indie soul',
 'k-pop',
 'k-pop girl group',
 'k-pop boy group',
 'alt z',
 'talent show',
 'kansas indie',
 'pixel',
 'pov: indie',
 'alternative r&b',
 'modern indie pop',
 'alternative hip hop',
 'hyperpop',
 'contemporary r&b',
 'alternative metal',
 'funk metal',
 'hard rock',
 'nu metal',
 'post-grunge',
 'rap metal',
 'rock',
 'idol rock',
 'j-metal',
 'kawaii metal',
 'bedroom r&b',
 'chill r&b',
 'la pop',
 'oakland indie',
 'indie hip hop',
 'boston electronic',
 'electronic trap',
 'future bass',
 'vapor twitch',
 'la indie',
 'alternative rock',
 'blues rock',
 'detroit rock',
 'garage rock',
 'modern blues rock',
 'permanent wave',
 'punk blues',
 'house',
 'stutter house',
 'british soul',
 'sad lo-fi',
 'canadian hip hop',
 'dark trap',
 'meme rap',
 'art pop',
 'instr

In [30]:
artists_df['genres'].apply(lambda x: CountVectorizer(vocabulary=unique_genres).fit_transform(x).toarray())

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [55]:
remainder = len(df['id'].tolist())%100

round(len(df['id'].tolist())/100)

audio_features = [sp.audio_features(x) for x in np.array_split(df['id'].tolist(),round(len(df['id'].tolist())/100)+1)]

In [57]:
audio_features = [item for row in audio_features for item in row]

In [59]:
features = pd.DataFrame(audio_features)

In [63]:
combined_df = df.merge(features, on = 'id')

In [65]:
combined_df.columns

Index(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms_x',
       'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local',
       'is_playable', 'name', 'popularity', 'preview_url', 'track_number',
       'type_x', 'uri_x', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type_y', 'uri_y', 'track_href', 'analysis_url',
       'duration_ms_y', 'time_signature'],
      dtype='object')

In [73]:
combined_df['num_artists'] = combined_df['artists'].apply(lambda x: len(x))

In [79]:
round(combined_df['num_artists']/combined_df['popularity'], 7)

0       0.017241
1       0.017544
2       0.012500
3       0.018868
4       0.016949
          ...   
1825    0.016949
1826         inf
1827    0.266667
1828         inf
1829    0.068965
Length: 1830, dtype: float64

In [103]:
set([item for row in list(map(str.split, unique_genres)) for item in row])

{'2-step',
 '5th',
 'abstract',
 'acid',
 'acoustic',
 'adult',
 'aesthetic',
 'afrofuturism',
 'age',
 'aggressive',
 'ai',
 'alabama',
 'album',
 'alt',
 'alternative',
 'ambient',
 'americana',
 'and',
 'anime',
 'anti-folk',
 'area',
 'art',
 'atl',
 'aussietronica',
 'australian',
 'avant-prog',
 'baile',
 'baltimore',
 'band',
 'bap',
 'baroque',
 'bass',
 'basshall',
 'bassline',
 'bay',
 'beats',
 'bedroom',
 'big',
 'bitpop',
 'black',
 'blues',
 'boom',
 'boston',
 'bounce',
 'bow',
 'boy',
 'brasileiro',
 'brazilian',
 'brighton',
 'british',
 'brooklyn',
 'brostep',
 'brutal',
 'buffalo',
 'cali',
 'canadian',
 'carolina',
 'chamber',
 'channel',
 'chicago',
 'chill',
 'chillhop',
 'chillsynth',
 'chillwave',
 'christian',
 'city',
 'classic',
 'classical',
 'club',
 'clubbing',
 'coast',
 'collage',
 'colombian',
 'comedy',
 'comic',
 'complextro',
 'conscious',
 'contemporary',
 'country',
 'crank',
 'cyberpunk',
 'dance',
 'dance-punk',
 'dancefloor',
 'danish',
 'dark',

In [85]:
artists_df['genres'].apply(lambda x: [y.split() for y in x])

0                                              [[k-rap]]
1               [[bedroom, pop], [rochester, mn, indie]]
2                                         [[lo-fi, rap]]
3                                                [[pop]]
4      [[brostep], [canadian, electronic], [edm], [el...
                             ...                        
714                                                   []
715                                                   []
716                                                   []
717                                                   []
718                               [[deep, talent, show]]
Name: genres, Length: 719, dtype: object