# Spotify Data Scraping
Using spotipy (a Python client for the Spotify Web API).

## Get all tracks and track info for the NCT artists (NCT 127, NCT, NCT U, NCT Dream, WayV)
References:
- https://rareloot.medium.com/extracting-spotify-data-on-your-favourite-artist-via-python-d58bc92a4330
- https://towardsdatascience.com/extracting-song-data-from-the-spotify-api-using-python-b1e79388d50
- https://rareloot.medium.com/extracting-spotify-data-on-your-favourite-artist-via-python-d58bc92a4330
- Spotify for Developers: https://developer.spotify.com/discover/
- metadata: https://developer.spotify.com/documentation/web-api/reference/#/
- https://developer.spotify.com/documentation/web-api/reference/#/operations/get-playlists-tracks
- docs: https://spotipy.readthedocs.io/en/2.19.0/#getting-started

In [1]:
# Client ID XXXXXX
# Client Secret YYYYYYY
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from http import client
from pydoc import cli
import os

# Spotify API credentials. The SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
# environment variables take precedence so that real secrets do not have to be
# saved inside the notebook; the placeholder literals are used only when the
# variables are not set.
cid = os.environ.get('SPOTIPY_CLIENT_ID', 'XXXXXX')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'YYYYYYY')
# Credentials manager built from the client id/secret pair.
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
# API client shared by all the helper functions defined later in the notebook.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

#### URI for each artist.
You can get it from each artist's Spotify page.
- URI NCT 127  https://open.spotify.com/artist/7f4ignuCJhLXfZ9giKT7rH?si=Aj1uY3-IRfuBA-mWhi3DqA 
- URI NCT https://open.spotify.com/artist/48eO052eSDcn8aTxiv6QaG?si=-aMQhhqjQ4ysAoGA1smzXQ
- URI NCT U https://open.spotify.com/artist/3paGCCtX1Xr4Gx53mSeZuQ?si=UZd10jthT3CprVpXI8P5DA
- URI NCT DREAM https://open.spotify.com/artist/1gBUSTR3TyDdTVFIaQnc02?si=CAMGP0bKR_OTIOsWmBMlUQ
- URI WAYV https://open.spotify.com/artist/1qBsABYUrxg9afpMtyoFKz?si=3vKmSjrdSpevtSUsD1ShRw

In [2]:
# Link to each artist's Spotify page. The helper functions take the last path
# segment of the link (the part before the '?si=...' query string) as the
# artist identifier passed to the API.
nct127_link = 'https://open.spotify.com/artist/7f4ignuCJhLXfZ9giKT7rH?si=Aj1uY3-IRfuBA-mWhi3DqA'
nct_link = 'https://open.spotify.com/artist/48eO052eSDcn8aTxiv6QaG?si=-aMQhhqjQ4ysAoGA1smzXQ'
nctu_link = 'https://open.spotify.com/artist/3paGCCtX1Xr4Gx53mSeZuQ?si=UZd10jthT3CprVpXI8P5DA'
nctdream_link = 'https://open.spotify.com/artist/1gBUSTR3TyDdTVFIaQnc02?si=CAMGP0bKR_OTIOsWmBMlUQ'
wayv_link = 'https://open.spotify.com/artist/1qBsABYUrxg9afpMtyoFKz?si=3vKmSjrdSpevtSUsD1ShRw'

### Functions

##### Get list of albums `get_albums(artist_name, artist_link, type_of_album)`

In [3]:
"""
Function to get list of artist's albums and return it as dataframe and raw dictionary of album's info.
Input parameter: artist name, artist link, type of album (single/album)
return:
1. albums_artist_list = dataframe of the artist's albums = get_albums(par)[0]
2. albums_artist = raw data (dictionary) of artist's album = get_albums(par)[1]
"""

def get_albums(artist_name, artist_link, type_of_album):
    """Collect every release of one type for an artist.

    Args:
        artist_name: Label written to the ``artist_name`` column of every row.
        artist_link: Link to the artist's Spotify page. The artist id is taken
            from the last path segment, with any ``?...`` query string removed.
        type_of_album: Forwarded to ``sp.artist_albums`` as ``album_type``
            (the notebook uses 'album' and 'single').

    Returns:
        tuple: ``(albums_artist_list, albums_artist)``.
        ``albums_artist_list`` is a de-duplicated DataFrame with the columns
        artist_name, album_name, album_type, release_date, total_tracks and
        album_uri. ``albums_artist`` is the list of raw album dictionaries
        gathered from every result page.
    """
    artist_uri = artist_link.split("/")[-1].split('?')[0]
    artist_results = sp.artist_albums(artist_uri, album_type=type_of_album)
    albums_artist = artist_results['items']
    # The results are paged: keep requesting the next page and accumulate its
    # items on the album list. (The response itself is a dict and has no
    # ``extend``, so the items must be added to ``albums_artist``.)
    while artist_results['next']:
        artist_results = sp.next(artist_results)
        albums_artist.extend(artist_results['items'])

    albums_artist_list = pd.DataFrame(
        [
            [artist_name, album['name'], album['album_type'],
             album['release_date'], album['total_tracks'], album['uri']]
            for album in albums_artist
        ],
        columns=['artist_name', 'album_name', 'album_type', 'release_date', 'total_tracks', 'album_uri'],
    )
    albums_artist_list.drop_duplicates(inplace=True)
    return albums_artist_list, albums_artist

##### Get all tracks `get_all_tracks(albums)`

In [4]:
"""
Get all tracks from all albums of an artist.
input:
- albums = list of raw album dictionaries (the second value returned by get_albums).

return
- all tracks in dataframe format
"""
def get_all_tracks(albums):
    """Build one row per track for every album in ``albums``.

    Args:
        albums: Iterable of raw album dictionaries (the second value returned
            by ``get_albums``). Each must provide 'uri', 'name',
            'release_date' and 'artists'.

    Returns:
        DataFrame with one row per track: basic track info, popularity, the
        album's first artist name, album name and release date, the audio
        features, and the album and track URIs. Popularity and audio-feature
        cells are None when the API returns no entry for a track.
    """
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode',
                    'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'tempo', 'time_signature']
    artist_tracks = []
    for album in albums:
        # Album track listings are paged; walk every page so that long albums
        # are not cut off after the first page.
        page = sp.album_tracks(album['uri'])
        while page:
            items = page['items']
            if items:
                track_ids = [item['uri'].split(':')[2] for item in items]
                # One batched request per page for the full track objects
                # (needed for popularity) and one for the audio features,
                # instead of two requests per track.
                track_infos = sp.tracks(track_ids)['tracks']
                track_features = sp.audio_features(track_ids)
                for item, info, features in zip(items, track_infos, track_features):
                    # Either lookup may yield None for a track; fall back to
                    # an empty dict so the row is kept with empty cells.
                    info = info or {}
                    features = features or {}
                    artist_tracks.append(
                        [item['name'], item['duration_ms'], item['explicit'], item['track_number'],
                         info.get('popularity'),
                         album['artists'][0]['name'],
                         album['name'], album['release_date']]
                        + [features.get(key) for key in feature_keys]
                        + [album['uri'], item['uri']]
                    )
            page = sp.next(page) if page['next'] else None

    artist_tracks_df = pd.DataFrame(
        artist_tracks,
        columns=['name', 'duration_ms', 'explicit', 'track_number',
                 'popularity',
                 'artist_name',
                 'album_name', 'release_date']
        + feature_keys
        + ['album_uri', 'track_uri'],
    )
    return artist_tracks_df

##### Get top tracks for each artist `get_top_tracks(artist_name, artist_link)`

In [5]:
"""
Get top tracks of an artist and other audio features for each track
Input parameters:
- artist_name : artist name stored in the artist_name column
- artist_link : link to artist page
Output
- dataframe of top tracks from an artist
"""
def get_top_tracks(artist_name, artist_link):
    """Return an artist's top tracks together with their audio features.

    Args:
        artist_name: Label written to the ``artist_name`` column of every row.
        artist_link: Link to the artist's Spotify page. The artist id is taken
            from the last path segment, with any ``?...`` query string removed.

    Returns:
        DataFrame with one row per top track: artist_name, name, album,
        popularity, track_number, duration_ms, explicit, id, track_uri,
        followed by the audio-feature columns. Audio-feature cells are None
        when the API returns no features for a track.
    """
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode',
                    'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'tempo', 'time_signature']
    artist_uri = artist_link.split("/")[-1].split('?')[0]
    toptracks = sp.artist_top_tracks(artist_uri)['tracks']

    # Fetch the audio features of all top tracks in a single request; skip the
    # request entirely when there are no tracks to look up.
    if toptracks:
        track_features = sp.audio_features([track['uri'] for track in toptracks])
    else:
        track_features = []

    # Build each row directly from the top-track entry and its features. The
    # track name is already part of the entry, so no extra per-track lookup or
    # name-based join is needed.
    rows = []
    for track, features in zip(toptracks, track_features):
        features = features or {}
        rows.append(
            [artist_name, track['name'], track['album']['name'], track['popularity'],
             track['track_number'], track['duration_ms'], track['explicit'],
             track['id'], track['uri']]
            + [features.get(key) for key in feature_keys]
        )

    top_tracks_info = pd.DataFrame(
        rows,
        columns=['artist_name', 'name', 'album', 'popularity', 'track_number',
                 'duration_ms', 'explicit', 'id', 'track_uri']
        + feature_keys,
    )
    return top_tracks_info

#### Scrape the data
Get the album, single, all tracks in the album/single, and also top tracks for each artist

In [6]:
# nct127_album = get_albums('NCT 127', nct127_link, 'album')[0]
# nct127_tracks = get_all_tracks(get_albums('NCT 127', nct127_link, 'album')[1])
# nct127_top_tracks = get_top_tracks('NCT 127', nct127_link)

In [7]:
# Singles of every unit. Each get_albums() result is a
# (dataframe, raw album list) pair.
nct127_single, nct_single, nctu_single, nctdream_single, wayv_single = [
    get_albums(unit_name, unit_link, 'single')
    for unit_name, unit_link in [
        ('NCT 127', nct127_link),
        ('NCT', nct_link),
        ('NCT U', nctu_link),
        ('NCT DREAM', nctdream_link),
        ('WayV', wayv_link),
    ]
]
# Stack the dataframe half of each pair into a single table.
all_single = pd.concat(
    [pair[0] for pair in (nct127_single, nct_single, nctu_single, nctdream_single, wayv_single)]
).reset_index(drop=True)
print(all_single.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_name   58 non-null     object
 1   album_name    58 non-null     object
 2   album_type    58 non-null     object
 3   release_date  58 non-null     object
 4   total_tracks  58 non-null     int64 
 5   album_uri     58 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.8+ KB
None


In [8]:
# Albums of every unit. Each get_albums() result is a
# (dataframe, raw album list) pair.
nct127_album, nct_album, nctu_album, nctdream_album, wayv_album = [
    get_albums(unit_name, unit_link, 'album')
    for unit_name, unit_link in [
        ('NCT 127', nct127_link),
        ('NCT', nct_link),
        ('NCT U', nctu_link),
        ('NCT DREAM', nctdream_link),
        ('WayV', wayv_link),
    ]
]
# Stack the dataframe half of each pair into a single table.
all_albums = pd.concat(
    [pair[0] for pair in (nct127_album, nct_album, nctu_album, nctdream_album, wayv_album)]
).reset_index(drop=True)
print(all_albums.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_name   31 non-null     object
 1   album_name    31 non-null     object
 2   album_type    31 non-null     object
 3   release_date  31 non-null     object
 4   total_tracks  31 non-null     object
 5   album_uri     31 non-null     object
dtypes: object(6)
memory usage: 1.6+ KB
None


In [9]:
# Tracks of every album, per unit. Only the raw album list (second element of
# each get_albums() pair) is needed here.
nct127_tracks, nct_tracks, nctu_tracks, nctdream_tracks, wayv_tracks = [
    get_all_tracks(album_pair[1])
    for album_pair in (nct127_album, nct_album, nctu_album, nctdream_album, wayv_album)
]
all_tracks = pd.concat(
    [nct127_tracks, nct_tracks, nctu_tracks, nctdream_tracks, wayv_tracks]
).reset_index(drop=True)
print(all_tracks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417 entries, 0 to 416
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              417 non-null    object 
 1   duration_ms       417 non-null    object 
 2   explicit          417 non-null    object 
 3   track_number      417 non-null    object 
 4   popularity        417 non-null    object 
 5   artist_name       417 non-null    object 
 6   album_name        417 non-null    object 
 7   release_date      417 non-null    object 
 8   danceability      417 non-null    float64
 9   energy            417 non-null    float64
 10  key               417 non-null    object 
 11  loudness          417 non-null    float64
 12  mode              417 non-null    object 
 13  speechiness       417 non-null    float64
 14  acousticness      417 non-null    float64
 15  instrumentalness  417 non-null    float64
 16  liveness          417 non-null    float64
 1

In [10]:
# Tracks of every single, per unit. Only the raw album list (second element of
# each get_albums() pair) is needed here.
(nct127_tracks_single, nct_tracks_single, nctu_tracks_single,
 nctdream_tracks_single, wayv_tracks_single) = [
    get_all_tracks(single_pair[1])
    for single_pair in (nct127_single, nct_single, nctu_single, nctdream_single, wayv_single)
]
all_tracks_single = pd.concat(
    [nct127_tracks_single, nct_tracks_single, nctu_tracks_single,
     nctdream_tracks_single, wayv_tracks_single]
).reset_index(drop=True)
print(all_tracks_single.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              151 non-null    object 
 1   duration_ms       151 non-null    int64  
 2   explicit          151 non-null    bool   
 3   track_number      151 non-null    int64  
 4   popularity        151 non-null    int64  
 5   artist_name       151 non-null    object 
 6   album_name        151 non-null    object 
 7   release_date      151 non-null    object 
 8   danceability      151 non-null    float64
 9   energy            151 non-null    float64
 10  key               151 non-null    int64  
 11  loudness          151 non-null    float64
 12  mode              151 non-null    int64  
 13  speechiness       151 non-null    float64
 14  acousticness      151 non-null    float64
 15  instrumentalness  151 non-null    float64
 16  liveness          151 non-null    float64
 1

In [11]:
# Top tracks (with audio features) of every unit.
nct127_top_tracks, nct_top_tracks, nctu_top_tracks, nctdream_top_tracks, wayv_top_tracks = [
    get_top_tracks(unit_name, unit_link)
    for unit_name, unit_link in [
        ('NCT 127', nct127_link),
        ('NCT', nct_link),
        ('NCT U', nctu_link),
        ('NCT DREAM', nctdream_link),
        ('WayV', wayv_link),
    ]
]
all_top_tracks = pd.concat(
    [nct127_top_tracks, nct_top_tracks, nctu_top_tracks, nctdream_top_tracks, wayv_top_tracks]
).reset_index(drop=True)
print(all_top_tracks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       50 non-null     object 
 1   name              50 non-null     object 
 2   album             50 non-null     object 
 3   popularity        50 non-null     int64  
 4   track_number      50 non-null     int64  
 5   duration_ms       50 non-null     int64  
 6   explicit          50 non-null     bool   
 7   id                50 non-null     object 
 8   track_uri         50 non-null     object 
 9   danceability      50 non-null     float64
 10  energy            50 non-null     float64
 11  key               50 non-null     int64  
 12  loudness          50 non-null     float64
 13  mode              50 non-null     int64  
 14  speechiness       50 non-null     float64
 15  acousticness      50 non-null     float64
 16  instrumentalness  50 non-null     float64
 17 

In [12]:
# all_single.to_csv('/home/yanamuh/projects/2022_oct_03_spotify/all_single.csv', index=False)
# all_albums.to_csv('/home/yanamuh/projects/2022_oct_03_spotify/all_albums.csv', index=False)
# all_tracks.to_csv('/home/yanamuh/projects/2022_oct_03_spotify/all_tracks.csv', index=False)
# all_tracks_single.to_csv('/home/yanamuh/projects/2022_oct_03_spotify/all_tracks_single.csv', index=False)

In [42]:
# Attach the album-level (or single-level) info to each of its tracks by
# joining on the four columns that both tables share.
tracks_and_albums_info = all_albums.merge(
    all_tracks,
    on=['album_uri', 'album_name', 'release_date', 'artist_name'],
).reset_index(drop=True)
tracks_and_single_info = all_single.merge(
    all_tracks_single,
    on=['album_uri', 'album_name', 'release_date', 'artist_name'],
).reset_index(drop=True)

In [46]:
# Show the columns, non-null counts and dtypes of the merged single/track table.
tracks_and_single_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       143 non-null    object 
 1   album_name        143 non-null    object 
 2   album_type        143 non-null    object 
 3   release_date      143 non-null    object 
 4   total_tracks      143 non-null    int64  
 5   album_uri         143 non-null    object 
 6   name              143 non-null    object 
 7   duration_ms       143 non-null    int64  
 8   explicit          143 non-null    bool   
 9   track_number      143 non-null    int64  
 10  popularity        143 non-null    int64  
 11  danceability      143 non-null    float64
 12  energy            143 non-null    float64
 13  key               143 non-null    int64  
 14  loudness          143 non-null    float64
 15  mode              143 non-null    int64  
 16  speechiness       143 non-null    float64
 1

Remove tracks that appear more than once with the same name and release date but different URIs (i.e. drop rows whose artist, release_date and name are duplicated). Track names are lowercased first, so entries whose names differ only in letter case are also treated as duplicates.
- Example: "Kick It - MINIMONSTER Remix" and "Kick It - Minimonster Remix"

In [51]:
# Stack album tracks and single tracks into one table with a fresh 0..n-1
# index, then lowercase the track names so that names differing only in
# letter case compare equal.
tracks_albums_single_info = pd.concat(
    [tracks_and_albums_info, tracks_and_single_info],
    ignore_index=True,
).assign(name=lambda frame: frame['name'].str.lower())

In [52]:
# Show the columns, non-null counts and dtypes of the combined album + single track table.
tracks_albums_single_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       560 non-null    object 
 1   album_name        560 non-null    object 
 2   album_type        560 non-null    object 
 3   release_date      560 non-null    object 
 4   total_tracks      560 non-null    object 
 5   album_uri         560 non-null    object 
 6   name              560 non-null    object 
 7   duration_ms       560 non-null    object 
 8   explicit          560 non-null    object 
 9   track_number      560 non-null    object 
 10  popularity        560 non-null    object 
 11  danceability      560 non-null    float64
 12  energy            560 non-null    float64
 13  key               560 non-null    object 
 14  loudness          560 non-null    float64
 15  mode              560 non-null    object 
 16  speechiness       560 non-null    float64
 1

In [62]:
# Keep only the first row for each (artist, release date, track name)
# combination, then save the result to CSV without the index column.
tracks_albums_single = tracks_albums_single_info.drop_duplicates(
    subset=['artist_name', 'release_date', 'name']
)
tracks_albums_single.to_csv(
    '/home/projects/2022_oct_03_spotify/results/tracks_albums_single.csv',
    index=False,
)

In [63]:
# Save the combined top-tracks table to CSV, without the index column.
all_top_tracks.to_csv('/home/projects/2022_oct_03_spotify/results/all_top_tracks.csv', index=False)