In [1]:
import pandas as pd
import numpy as np

import os
from urllib.request import urlopen, Request
import requests as r
from sys import argv, exit
from base64 import b64encode
import json
import re

from dotenv import load_dotenv, find_dotenv
dotenv_path=find_dotenv()
load_dotenv(dotenv_path)
client_id= os.environ.get('SPOTIFY_CLIENT_ID')
client_secret= os.environ.get('SPOTIFY_CLIENT_SECRET')

import spotipy
import sys
from sys import argv, exit
from spotipy.oauth2 import SpotifyClientCredentials

#Authentication without user
client_credentials_manager= SpotifyClientCredentials(client_id= client_id, client_secret=client_secret)
sp= spotipy.Spotify(client_credentials_manager=client_credentials_manager)

We are going to be building models based 

In [2]:
df_collab= pd.read_csv('/Users/josephlim/Desktop/Data Science/Capstone Projects/Music Recommendation System- Capstone 3/Data/Raw data/spotify_dataset.csv', on_bad_lines='skip')
df_content=pd.read_csv('/Users/josephlim/Desktop/Data Science/Capstone Projects/Capstone project- Spotify/Data/Raw Data/1921-2020_tracks.csv')

In [3]:
df_collab.shape

(12891680, 4)

This is a sizeable data set! Let's get it!

In [4]:
df_collab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12891680 entries, 0 to 12891679
Data columns (total 4 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   user_id          object
 1    "artistname"    object
 2    "trackname"     object
 3    "playlistname"  object
dtypes: object(4)
memory usage: 393.4+ MB


It makes sense that our features are all object type data.

In [5]:
df_collab.columns

Index(['user_id', ' "artistname"', ' "trackname"', ' "playlistname"'], dtype='object')

We have quotes and white spaces as part of our column names. Let's clean them up.

In [6]:
with_quote= df_collab.columns.to_list()
columns =[x.replace('"','') for x in with_quote]
columns_nowhite= [x.strip() for x in columns]

col_replace= dict(zip(with_quote, columns_nowhite))
df_collab= df_collab.rename(columns=col_replace)

In [7]:
df_collab.columns

Index(['user_id', 'artistname', 'trackname', 'playlistname'], dtype='object')

Let's also make sure our data doesn't have white spaces.

In [8]:
for i in columns_nowhite:
    df_collab[i].str.strip()

Now, that's what I like to see! Let's continue.

In [9]:
df_collab.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [10]:
df_collab.isna().any()

user_id         False
artistname       True
trackname        True
playlistname     True
dtype: bool

We have a bit of missing data value. Let's explore a bit and decide what to do with them.

In [11]:
df_collab.isna().sum()

user_id             0
artistname      33568
trackname          85
playlistname     1246
dtype: int64

There isn't any way to replace playlist information for those tracks without playlist names. Let's drop them.

In [12]:
df_collab_w_playlist= df_collab[df_collab.playlistname.notna()]

In [13]:
df_collab_w_playlist.isna().sum()

user_id             0
artistname      33568
trackname          85
playlistname        0
dtype: int64

In [14]:
df_notrack= df_collab_w_playlist[df_collab_w_playlist.trackname.isna()]

In [15]:
df_notrack.head(50)

Unnamed: 0,user_id,artistname,trackname,playlistname
224037,42f5289bfa83726edd652392ea09984a,,,Starred
268384,48388a944d86ca079dac6e5d825a2b57,Silversun Pickups,,No One Sleeps When I’m Awake
276878,317a0f3ff15ff0cd8b12fe06f390a24f,Toufic Farroukh,,Lounge
459375,db0d3d755f35fa0ed3985a0be1df0e49,Tamia,,2
459754,db0d3d755f35fa0ed3985a0be1df0e49,Krezip,,moi
459927,db0d3d755f35fa0ed3985a0be1df0e49,Tamia,,relax
569216,c0cf65e23e3df6f75d60f26af75c7162,,,Beat Rush
570028,c0cf65e23e3df6f75d60f26af75c7162,,,Groovin'
846965,798ddeb5ce830765d64b1ff2de51660d,蘇永康,,7- Shanghai
1181985,a21cb7091c1bd79f7b21414b1d07ebdf,,,Fuckin' electro-pop-rockin' stuff


We need both track names and artist names to be able to recommend songs, because there are many songs with same titles, and even the same artists don't always make the same types of music. Let's drop data that miss them.

In [16]:
df_collab_track= df_collab_w_playlist[df_collab_w_playlist.trackname.notna()] 
df_collab_name=df_collab_track[df_collab_track.artistname.notna()]

In [17]:
df_collab.isna().sum()

user_id             0
artistname      33568
trackname          85
playlistname     1246
dtype: int64

In [18]:
df_collab_final=df_collab_name

The data we'll use to build collaborative filtering model is clean. Let's move onto data we'll use for content-based filtering model. 

In [19]:
df_content.shape

(586672, 20)

In [20]:
df_content.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


We see that some data in release_date feature is in yyyy-mm-dd format, while others are in years. Let's put all of them into years.

In [21]:
df_content.release_date= pd.DatetimeIndex(df_content['release_date']).year

In [22]:
df_content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586672 entries, 0 to 586671
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                586672 non-null  object 
 1   name              586601 non-null  object 
 2   popularity        586672 non-null  int64  
 3   duration_ms       586672 non-null  int64  
 4   explicit          586672 non-null  int64  
 5   artists           586672 non-null  object 
 6   id_artists        586672 non-null  object 
 7   release_date      586672 non-null  int64  
 8   danceability      586672 non-null  float64
 9   energy            586672 non-null  float64
 10  key               586672 non-null  int64  
 11  loudness          586672 non-null  float64
 12  mode              586672 non-null  int64  
 13  speechiness       586672 non-null  float64
 14  acousticness      586672 non-null  float64
 15  instrumentalness  586672 non-null  float64
 16  liveness          58

All the datatypes look good. Let's check for missing values.

In [23]:
df_content.isna().any()

id                  False
name                 True
popularity          False
duration_ms         False
explicit            False
artists             False
id_artists          False
release_date        False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
time_signature      False
dtype: bool

We have some missing values in name feature. Let's dive in.

In [24]:
df_cont_noname= df_content[df_content.name.isna()]

In [25]:
len(df_cont_noname)

71

We have 71 missing values.

In [26]:
df_cont_noname.sort_values('release_date')

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
511005,6iYs4Z3f6bf8i4AhSSF9pm,,0,177842,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1922,0.633,0.405,4,-8.762,1,0.0460,0.9810,0.000000,0.5140,0.657,76.162,4
511010,7r26LzTNdjmD3InNNnfKHY,,0,198360,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1922,0.300,0.267,6,-11.885,1,0.0438,0.9940,0.300000,0.4260,0.282,128.322,3
511002,5vdTZkEtEc7MIeG5n2deZD,,0,196120,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1922,0.336,0.256,10,-10.744,0,0.0645,0.9950,0.000000,0.2370,0.249,78.685,3
511001,5Wjlz5WSaTuiDo2VoncxnO,,0,186093,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1922,0.365,0.196,8,-12.543,1,0.0557,0.9950,0.000015,0.3650,0.314,68.255,4
510998,5D7HzxyOeGt46T0VfKwaeT,,0,186973,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1922,0.491,0.135,0,-13.963,1,0.0787,0.9960,0.131000,0.1220,0.531,74.894,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517176,5QQHqUZqhbepZwwhie2BUH,,3,235253,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1962,0.611,0.567,7,-5.642,0,0.0473,0.8180,0.000003,0.4460,0.778,105.355,4
520127,0hKA9A2JPtFdg0fiMhyjQD,,6,194081,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1974,0.471,0.369,4,-12.927,0,0.1460,0.9680,0.001100,0.1410,0.766,94.063,4
226336,4iH7negBYMfj2z0wDNmgdx,,28,264973,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1994,0.512,0.578,0,-12.280,0,0.0299,0.0433,0.000064,0.5160,0.692,156.465,1
525238,1kR4gIb7nGxHPI3D2ifs59,,26,289440,0,[''],['0LyfQWJT6nXafLPZqxe9Of'],1998,0.501,0.583,7,-9.460,0,0.0605,0.6900,0.003960,0.0747,0.734,138.391,4


This seems pretty alarming at first, not because the names are missing, but because or artists are. Let's try removing [' '] from artists column and check again for missing artist names. Let's try to query it from Spotify API.

In [27]:
def get_auth_key():
    headers = {}
    client_id = ""
    client_secret = ""
    with open("client_id.txt", "r") as infile:
        client_id = infile.read()
        client_id = client_id[ 0 : len(client_id) - 1]
    with open("client_secret.txt", "r") as infile:
        client_secret = infile.read()
        client_secret = client_secret[ 0 : len(client_secret) - 1]
    client_str = f"{client_id}:{client_secret}"
    client_str_bytes = client_str.encode('ascii')
    client_str = b64encode( client_str_bytes ) 
    client_str = client_str.decode('ascii')
    auth_header = f"Basic {client_str}"
    headers['Authorization'] = auth_header
    data = {
        "grant_type" : "client_credentials"
    }
    url = "https://accounts.spotify.com/api/token"
    myreq = r.post(url, headers=headers, data=data)
    status_code = myreq.status_code 
    content = myreq.content.decode('ascii')
    json_data = json.loads(content)
    access_token = json_data['access_token']
    return access_token

In [28]:
# The function to actually perform the request.

def do_request(id):

    url=f"https://api.spotify.com/v1/tracks/{id}"
    
    headers = {
        "Accept"        : "application/json",
        "Content-Type"  : "application/json",
    }
    auth_key = get_auth_key()
    headers['Authorization'] = f"Bearer {auth_key}"
    myreq = r.get(url, headers=headers)
    content = myreq.content
    status_code = myreq.status_code 
    if status_code != 200:
        print("Error: status code:", status_code)
        exit(-1)
    json_data = json.loads(content)
  
    return json_data

In [29]:
id_list=df_cont_noname['id'].tolist()

In [30]:
track_info= do_request(id_list[70])
track_info.keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [31]:
for i in id_list:
    track_info= do_request(i)
    print(track_info['album'])

{'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}], 'available_markets': ['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BN', 'BO', 'BR', 'BS', 'BT', 'BW', 'BY', 'BZ', 'CA', 'CD', 'CG', 'CH', 'CI', 'CL', 'CM', 'CO', 'CR', 'CV', 'CW', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'ET', 'FI', 'FJ', 'FM', 'FR', 'GA', 'GB', 'GD', 'GE', 'GH', 'GM', 'GN', 'GQ', 'GR', 'GT', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IQ', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', 'MG', 'MH', 'MK', 'ML

{'album_type': 'compilation', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}], 'available_markets': [], 'external_urls': {'spotify': 'https://open.spotify.com/album/4ChuKQyM9bKb3Se86YUY8R'}, 'href': 'https://api.spotify.com/v1/albums/4ChuKQyM9bKb3Se86YUY8R', 'id': '4ChuKQyM9bKb3Se86YUY8R', 'images': [], 'name': '', 'release_date': '0000', 'release_date_precision': 'year', 'total_tracks': 18, 'type': 'album', 'uri': 'spotify:album:4ChuKQyM9bKb3Se86YUY8R'}
{'album_type': 'compilation', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:

{'album_type': 'compilation', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}], 'available_markets': [], 'external_urls': {'spotify': 'https://open.spotify.com/album/10R2p3d0Ppitd3spBziRPt'}, 'href': 'https://api.spotify.com/v1/albums/10R2p3d0Ppitd3spBziRPt', 'id': '10R2p3d0Ppitd3spBziRPt', 'images': [], 'name': '', 'release_date': '0000', 'release_date_precision': 'year', 'total_tracks': 13, 'type': 'album', 'uri': 'spotify:album:10R2p3d0Ppitd3spBziRPt'}
{'album_type': 'compilation', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:

{'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}], 'available_markets': ['IN'], 'external_urls': {'spotify': 'https://open.spotify.com/album/3gtSWpxwGEr56U6OaXbWFL'}, 'href': 'https://api.spotify.com/v1/albums/3gtSWpxwGEr56U6OaXbWFL', 'id': '3gtSWpxwGEr56U6OaXbWFL', 'images': [], 'name': '', 'release_date': '0000', 'release_date_precision': 'year', 'total_tracks': 9, 'type': 'album', 'uri': 'spotify:album:3gtSWpxwGEr56U6OaXbWFL'}
{'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0L

{'album_type': 'compilation', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}], 'available_markets': [], 'external_urls': {'spotify': 'https://open.spotify.com/album/27959NnbMnYIX5MMvdU06w'}, 'href': 'https://api.spotify.com/v1/albums/27959NnbMnYIX5MMvdU06w', 'id': '27959NnbMnYIX5MMvdU06w', 'images': [], 'name': '', 'release_date': '0000', 'release_date_precision': 'year', 'total_tracks': 15, 'type': 'album', 'uri': 'spotify:album:27959NnbMnYIX5MMvdU06w'}
{'album_type': 'compilation', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:

{'album_type': 'single', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}], 'available_markets': ['IN'], 'external_urls': {'spotify': 'https://open.spotify.com/album/2u8E2us9hjikWjzSs1TYZp'}, 'href': 'https://api.spotify.com/v1/albums/2u8E2us9hjikWjzSs1TYZp', 'id': '2u8E2us9hjikWjzSs1TYZp', 'images': [], 'name': '', 'release_date': '0000', 'release_date_precision': 'year', 'total_tracks': 1, 'type': 'album', 'uri': 'spotify:album:2u8E2us9hjikWjzSs1TYZp'}
{'album_type': 'single', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'}, 'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of', 'id': '0LyfQWJT6nXafLPZqxe9Of', 'name': 'Various Artists', 'type': 'artist', 'uri': 'spotify:artist:

Results from queries do not show any information that would allow us to identify tracks or their artists. After accessing external urls, I have learned that tracks are empty. This means that those tracks have been removed (due to many possible reasons, from issues with distributors, copyrights, or artist discretion). While we do have track information, we cannot recommend music that is currently not available. Let's drop them.

In [32]:
df_cont_track= df_content[df_content.name.notna()]

In [33]:
df_cont_track.isna().any()

id                  False
name                False
popularity          False
duration_ms         False
explicit            False
artists             False
id_artists          False
release_date        False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
time_signature      False
dtype: bool

In [34]:
df_content_final=df_cont_track

In [35]:
df_collab_final.to_csv('df_collab_cleaned.csv',index=False)
df_content_final.to_csv('df_content_cleaned.csv', index=False)