# Spotify Data Scraping
using spotipy (spotify api)

## Get all tracks and info about the tracks for NCT artist (NCT 127, NCT, NCT U, NCT Dream, WayV)
References:
- https://rareloot.medium.com/extracting-spotify-data-on-your-favourite-artist-via-python-d58bc92a4330
- https://towardsdatascience.com/extracting-song-data-from-the-spotify-api-using-python-b1e79388d50
- https://rareloot.medium.com/extracting-spotify-data-on-your-favourite-artist-via-python-d58bc92a4330
- spotipy dev: https://developer.spotify.com/discover/
- metadata: https://developer.spotify.com/documentation/web-api/reference/#/
- https://developer.spotify.com/documentation/web-api/reference/#/operations/get-playlists-tracks
- docs: https://spotipy.readthedocs.io/en/2.19.0/#getting-started

In [2]:
# Client ID XXXXX
# Client Secret YYYYY
import spotipy
import pandas as pd
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials
from http import client
from pydoc import cli
cid = 'XXXXX'
secret = 'YYYYY'
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

#### URI for each artist.
You can get it from the artist spotify page.
- URI NCT 127  https://open.spotify.com/artist/7f4ignuCJhLXfZ9giKT7rH?si=Aj1uY3-IRfuBA-mWhi3DqA 
- URI NCT https://open.spotify.com/artist/48eO052eSDcn8aTxiv6QaG?si=-aMQhhqjQ4ysAoGA1smzXQ
- URI NCT U https://open.spotify.com/artist/3paGCCtX1Xr4Gx53mSeZuQ?si=UZd10jthT3CprVpXI8P5DA
- URI NCT DREAM https://open.spotify.com/artist/1gBUSTR3TyDdTVFIaQnc02?si=CAMGP0bKR_OTIOsWmBMlUQ
- URI WAYV https://open.spotify.com/artist/1qBsABYUrxg9afpMtyoFKz?si=3vKmSjrdSpevtSUsD1ShRw

In [3]:
# link of each artist
nct127_link = 'https://open.spotify.com/artist/7f4ignuCJhLXfZ9giKT7rH?si=Aj1uY3-IRfuBA-mWhi3DqA'
nct_link = 'https://open.spotify.com/artist/48eO052eSDcn8aTxiv6QaG?si=-aMQhhqjQ4ysAoGA1smzXQ'
nctu_link = 'https://open.spotify.com/artist/3paGCCtX1Xr4Gx53mSeZuQ?si=UZd10jthT3CprVpXI8P5DA'
nctdream_link = 'https://open.spotify.com/artist/1gBUSTR3TyDdTVFIaQnc02?si=CAMGP0bKR_OTIOsWmBMlUQ'
wayv_link = 'https://open.spotify.com/artist/1qBsABYUrxg9afpMtyoFKz?si=3vKmSjrdSpevtSUsD1ShRw'

### Functions

##### Get list of albums `get_albums(artist_name, artist_link, type_of_album)`

In [4]:
"""
Function to get list of artist's albums and return it as dataframe and raw dictionary of album's info.
Input parameter: artist name, artist link, type of album (single/album)
return:
1. albums_artist_list = dataframe of the artist's albums = get_albums(par)[0]
2. albums_artist = raw data (dictionary) of artist's album = get_albums(par)[1]
"""

def get_albums(artist_name, artist_link, type_of_album):
    artist_uri = artist_link.split("/")[-1].split('?')[0]
    artist_results = sp.artist_albums(artist_uri, album_type=type_of_album)
    albums_artist = artist_results['items']
    while artist_results['next']:
        artist_results = sp.next(artist_results)
        artist_results.extend(artist_results['items'])
    albums_artist_list = []
    for album in albums_artist:
        albums_artist_list.append([artist_name, album['name'],album['album_type'], album['release_date'], album['total_tracks'],album['uri']])

    albums_artist_list = pd.DataFrame(albums_artist_list, columns=['artist_name', 'album_name','album_type','release_date','total_tracks','album_uri'])
    albums_artist_list.drop_duplicates(inplace=True)
    return albums_artist_list, albums_artist

##### Get all tracks `get_all_tracks(artist_name, albums)`

In [5]:
"""
Get all tracks from all albums of an artist.
input:
- artist_name = artist name
- albums = dictionary of album raw data.

return
- all tracks in dataframe format
"""
def get_all_tracks(albums):
    artist_tracks = []
    for album in albums:
        track = sp.album_tracks(album['uri'])
        for n in (track['items']):
            track_uri = n['uri'].split(':')[2]
            track_info = sp.track(str(track_uri))
            track_features = sp.audio_features(track_uri)
            # track_audio_analysis = sp.audio_analysis(track_uri)
            # print(track_info)
            all_information = [n['name'], n['duration_ms'],n['explicit'],n['track_number'],
                                track_info['popularity'],
                                album['artists'][0]['name'],
                                album['name'],album['release_date'],
                                track_features[0]['valence'],track_features[0]['danceability'],track_features[0]['energy'],track_features[0]['key'],track_features[0]['loudness'],track_features[0]['mode'],track_features[0]['speechiness'],track_features[0]['acousticness'],track_features[0]['instrumentalness'],track_features[0]['liveness'],track_features[0]['tempo'],track_features[0]['time_signature'],
                                album['uri'],
                                n['uri']]
            artist_tracks.append(all_information)

    artist_tracks_df = pd.DataFrame(artist_tracks, columns=['name','duration_ms','explicit','track_number',
                                                    'popularity',
                                                    'artist_name',
                                                    'album_name','release_date',
                                                    'valence','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','tempo','time_signature',
                                                    'album_uri',
                                                    'track_uri'])
    return artist_tracks_df

##### Get top tracks for each artist `get_top_tracks(artist_link)`

In [6]:
"""
Get top tracks of an artist and other audio features for each track
Input parameter: 
- artist_link : link to artist page
Output
- dataframe of top tracks from an artist
"""
def get_top_tracks(artist_name, artist_link):
    artist_uri = artist_link.split("/")[-1].split('?')[0]
    artist_toptracks = sp.artist_top_tracks(artist_uri)
    toptracks = artist_toptracks['tracks']
    list_toptracks = [[artist_name, i['name'], i['album']['name'], i['popularity'], i['track_number'], i['duration_ms'], i['explicit'], i['id'], i['uri']] for i in toptracks]
    toptracks_df = pd.DataFrame(list_toptracks, columns=['artist_name','name','album','popularity','track_number','duration_ms','explicit','id', 'track_uri'])
    
    features = []
    for i in toptracks:
        uri = i['uri'].split(':')[2]
        track_info = sp.track(str(uri))
        track_features = sp.audio_features(uri)
        # print(track_info)
        comb = [track_info['name'],track_features[0]['valence'],track_features[0]['danceability'],track_features[0]['energy'],track_features[0]['key'],track_features[0]['loudness'],track_features[0]['mode'],track_features[0]['speechiness'],track_features[0]['acousticness'],track_features[0]['instrumentalness'],track_features[0]['liveness'],track_features[0]['tempo'],track_features[0]['time_signature'],i['uri']]
        features.append(comb)
    # features
    feature_tracks_df = pd.DataFrame(features, columns=['name','valence','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','tempo','time_signature','track_uri'])
    top_tracks_info = pd.merge(toptracks_df, feature_tracks_df, on=['name','track_uri']).reset_index(drop=True)
    return top_tracks_info

#### Scrape the data
Get the album, single, all tracks in the album/single, and also top tracks for each artist

In [7]:
# nct127_album = get_albums('NCT 127', nct127_link, 'album')[0]
# nct127_tracks = get_all_tracks(get_albums('NCT 127', nct127_link, 'album')[1])
# nct127_top_tracks = get_top_tracks(nct127_link)

In [8]:
nct127_single = get_albums('NCT 127', nct127_link, 'single')
nct_single = get_albums('NCT', nct_link, 'single')
nctu_single = get_albums('NCT U', nctu_link, 'single')
nctdream_single = get_albums('NCT DREAM', nctdream_link, 'single')
wayv_single = get_albums('WayV', wayv_link, 'single')
all_single = pd.concat([nct127_single[0], nct_single[0], nctu_single[0], nctdream_single[0], wayv_single[0]]).reset_index(drop=True)
print(all_single.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_name   58 non-null     object
 1   album_name    58 non-null     object
 2   album_type    58 non-null     object
 3   release_date  58 non-null     object
 4   total_tracks  58 non-null     int64 
 5   album_uri     58 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.8+ KB
None


In [9]:
nct127_album = get_albums('NCT 127', nct127_link, 'album')
nct_album = get_albums('NCT', nct_link, 'album')
nctu_album = get_albums('NCT U', nctu_link, 'album')
nctdream_album = get_albums('NCT DREAM', nctdream_link, 'album')
wayv_album = get_albums('WayV', wayv_link, 'album')
all_albums = pd.concat([nct127_album[0], nct_album[0], nctu_album[0], nctdream_album[0], wayv_album[0]]).reset_index(drop=True)
print(all_albums.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_name   31 non-null     object
 1   album_name    31 non-null     object
 2   album_type    31 non-null     object
 3   release_date  31 non-null     object
 4   total_tracks  31 non-null     object
 5   album_uri     31 non-null     object
dtypes: object(6)
memory usage: 1.6+ KB
None


In [11]:
nct127_tracks = get_all_tracks(nct127_album[1])
nct_tracks = get_all_tracks(nct_album[1])
nctu_tracks = get_all_tracks(nctu_album[1])
nctdream_tracks = get_all_tracks(nctdream_album[1])
wayv_tracks = get_all_tracks(wayv_album[1])
all_tracks = pd.concat([nct127_tracks, nct_tracks, nctu_tracks, nctdream_tracks, wayv_tracks]).reset_index(drop=True)
print(all_tracks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417 entries, 0 to 416
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              417 non-null    object 
 1   duration_ms       417 non-null    object 
 2   explicit          417 non-null    object 
 3   track_number      417 non-null    object 
 4   popularity        417 non-null    object 
 5   artist_name       417 non-null    object 
 6   album_name        417 non-null    object 
 7   release_date      417 non-null    object 
 8   valence           417 non-null    float64
 9   danceability      417 non-null    float64
 10  energy            417 non-null    float64
 11  key               417 non-null    object 
 12  loudness          417 non-null    float64
 13  mode              417 non-null    object 
 14  speechiness       417 non-null    float64
 15  acousticness      417 non-null    float64
 16  instrumentalness  417 non-null    float64
 1

In [12]:
nct127_tracks_single = get_all_tracks(nct127_single[1])
nct_tracks_single = get_all_tracks(nct_single[1])
nctu_tracks_single = get_all_tracks(nctu_single[1])
nctdream_tracks_single = get_all_tracks(nctdream_single[1])
wayv_tracks_single = get_all_tracks(wayv_single[1])
all_tracks_single = pd.concat([nct127_tracks_single, nct_tracks_single, nctu_tracks_single, nctdream_tracks_single, wayv_tracks_single]).reset_index(drop=True)
print(all_tracks_single.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              151 non-null    object 
 1   duration_ms       151 non-null    int64  
 2   explicit          151 non-null    bool   
 3   track_number      151 non-null    int64  
 4   popularity        151 non-null    int64  
 5   artist_name       151 non-null    object 
 6   album_name        151 non-null    object 
 7   release_date      151 non-null    object 
 8   valence           151 non-null    float64
 9   danceability      151 non-null    float64
 10  energy            151 non-null    float64
 11  key               151 non-null    int64  
 12  loudness          151 non-null    float64
 13  mode              151 non-null    int64  
 14  speechiness       151 non-null    float64
 15  acousticness      151 non-null    float64
 16  instrumentalness  151 non-null    float64
 1

In [13]:
nct127_top_tracks = get_top_tracks('NCT 127', nct127_link)
nct_top_tracks = get_top_tracks('NCT', nct_link)
nctu_top_tracks = get_top_tracks('NCT U', nctu_link)
nctdream_top_tracks = get_top_tracks('NCT DREAM', nctdream_link)
wayv_top_tracks = get_top_tracks('WayV', wayv_link)
all_top_tracks = pd.concat([nct127_top_tracks, nct_top_tracks, nctu_top_tracks, nctdream_top_tracks, wayv_top_tracks]).reset_index(drop=True)
print(all_top_tracks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       50 non-null     object 
 1   name              50 non-null     object 
 2   album             50 non-null     object 
 3   popularity        50 non-null     int64  
 4   track_number      50 non-null     int64  
 5   duration_ms       50 non-null     int64  
 6   explicit          50 non-null     bool   
 7   id                50 non-null     object 
 8   track_uri         50 non-null     object 
 9   valence           50 non-null     float64
 10  danceability      50 non-null     float64
 11  energy            50 non-null     float64
 12  key               50 non-null     int64  
 13  loudness          50 non-null     float64
 14  mode              50 non-null     int64  
 15  speechiness       50 non-null     float64
 16  acousticness      50 non-null     float64
 17 

In [90]:
all_top_tracks.to_csv('/home/projects/2022_oct_03_spotify/results/all_top_tracks.csv', index=False)

In [34]:
# merge album/single info with related tracks
tracks_and_albums_info = pd.merge(all_albums, all_tracks, on=['album_uri','album_name','release_date','artist_name']).reset_index(drop=True)
tracks_and_single_info = pd.merge(all_single, all_tracks_single, on=['album_uri','album_name','release_date','artist_name']).reset_index(drop=True)

Merge all tracks from singles and albums. Remove duplicate tracks that has same name and release date but eventually has different uri, name styling, or album name styling.
- example: Kick It - MINIMONSTER Remix & Kick It - Minimonster Remix

In [35]:
tracks_albums_single_info = pd.concat([tracks_and_albums_info, tracks_and_single_info]).reset_index(drop=True)
tracks_albums_single_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       560 non-null    object 
 1   album_name        560 non-null    object 
 2   album_type        560 non-null    object 
 3   release_date      560 non-null    object 
 4   total_tracks      560 non-null    object 
 5   album_uri         560 non-null    object 
 6   name              560 non-null    object 
 7   duration_ms       560 non-null    object 
 8   explicit          560 non-null    object 
 9   track_number      560 non-null    object 
 10  popularity        560 non-null    object 
 11  valence           560 non-null    float64
 12  danceability      560 non-null    float64
 13  energy            560 non-null    float64
 14  key               560 non-null    object 
 15  loudness          560 non-null    float64
 16  mode              560 non-null    object 
 1

In [36]:
tracks_albums_single_info['name'] = tracks_albums_single_info['name'].str.lower()
tracks_albums_single = tracks_albums_single_info[~tracks_albums_single_info.duplicated(['artist_name','release_date','name'])]
tracks_albums_single.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415 entries, 0 to 559
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       415 non-null    object 
 1   album_name        415 non-null    object 
 2   album_type        415 non-null    object 
 3   release_date      415 non-null    object 
 4   total_tracks      415 non-null    object 
 5   album_uri         415 non-null    object 
 6   name              415 non-null    object 
 7   duration_ms       415 non-null    object 
 8   explicit          415 non-null    object 
 9   track_number      415 non-null    object 
 10  popularity        415 non-null    object 
 11  valence           415 non-null    float64
 12  danceability      415 non-null    float64
 13  energy            415 non-null    float64
 14  key               415 non-null    object 
 15  loudness          415 non-null    float64
 16  mode              415 non-null    object 
 1

## Further cleaning

For further analysis, we will use `tracks_albums_single` dataset only and do cleaning for this dataset. The top tracks can be obtained using popularity scores.

Because actually NCT's songs are sung by an U unit, we will cite NCT's tracks as NCT U's tracks. So we need to rename the `artist_name = "NCT"` to `"NCT U"`

In [37]:
print(tracks_albums_single['artist_name'].value_counts())
print(np.where(tracks_albums_single['artist_name'] == 'NCT'))
tracks_albums_single['artist_name'] = tracks_albums_single['artist_name'].replace(['NCT'],'NCT U')
print(tracks_albums_single['artist_name'].value_counts())

NCT 127      184
NCT DREAM    112
NCT           63
WayV          35
NCT U         21
Name: artist_name, dtype: int64
(array([154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
       167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
       180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
       193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
       206, 207, 208, 209, 210, 211, 212, 213, 214, 326, 327]),)
NCT 127      184
NCT DREAM    112
NCT U         84
WayV          35
Name: artist_name, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_albums_single['artist_name'] = tracks_albums_single['artist_name'].replace(['NCT'],'NCT U')


The duplicate songs those exist both in (the album and repackage album) or (album and other single) releases will be dropped. We will take the songs with earlier release date.

In [62]:
# NCT 127 songs
nct127 = tracks_albums_single[tracks_albums_single['artist_name']=='NCT 127']
print(np.where(nct127['name'].duplicated()))
print(nct127.info())

(array([ 26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  54,  55,
        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66, 127, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 163, 171,
       172, 174, 175, 176]),)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 184 entries, 0 to 467
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       184 non-null    object 
 1   album_name        184 non-null    object 
 2   album_type        184 non-null    object 
 3   release_date      184 non-null    object 
 4   total_tracks      184 non-null    object 
 5   album_uri         184 non-null    object 
 6   name              184 non-null    object 
 7   duration_ms       184 non-null    object 
 8   explicit          184 non-null    object 
 9   track_number      184 non-null    object 
 10  popularity        184 non-null    object 
 11  valence           184 non-null  

In [63]:
pd.set_option('display.max_rows', 50)
nct127.sort_values(by=['name','release_date'],ascending=False).head(50)

Unnamed: 0,artist_name,album_name,album_type,release_date,total_tracks,album_uri,name,duration_ms,explicit,track_number,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,tempo,time_signature,track_uri
221,NCT 127,NCT #127 - The 1st Mini Album,album,2016-07-10,7,spotify:album:7xmqXxPAmkgn5HK9FcyBxR,소방차 fire truck,178565,False,1,...,1,-3.279,1,0.0602,0.116,1.3e-05,0.352,110.006,4,spotify:track:5hHlmrSV6d9LFMsDA1lamE
465,NCT 127,NCT#127 LIMITLESS - The 2nd Mini Album,single,2017-01-06,6,spotify:album:4EEfpF1qcPAl1J4Z770A2U,롤러코스터 heartbreaker,194854,False,4,...,0,-3.864,1,0.0928,0.162,0.0,0.176,154.809,4,spotify:track:5IhVzUL9qR7MW2ISpB6iYR
434,NCT 127,LOVEHOLIC,single,2021-02-17,6,spotify:album:0k5S0L54m5ga5YDEXt2WJ6,英雄; kick it,233225,False,5,...,7,-2.419,1,0.177,0.0313,0.0,0.0894,167.008,4,spotify:track:5HjK06wlDrkTvBZX2Ns15S
462,NCT 127,NCT#127 LIMITLESS - The 2nd Mini Album,single,2017-01-06,6,spotify:album:4EEfpF1qcPAl1J4Z770A2U,無限的我 무한적아; limitless,247467,False,1,...,5,-5.655,0,0.0743,0.00934,0.0,0.0337,78.216,4,spotify:track:6m96zMB8BrLrpyT7NvApxB
87,NCT 127,NCT #127 Neo Zone: The Final Round - The 2nd A...,album,2020-05-19,17,spotify:album:64F7bFLt8ULAfBl2L4Pyv4,white night,243426,False,14,...,7,-3.968,0,0.0288,0.203,0.0,0.13,70.978,4,spotify:track:2xszaAGmpxMjoqDBxgTKKn
118,NCT 127,NCT #127 Neo Zone - The 2nd Album,album,2020-03-06,13,spotify:album:5YOvg682zFOleCiSndLnZr,white night,243426,False,11,...,7,-3.967,0,0.0288,0.203,0.0,0.13,70.978,4,spotify:track:33txTXgQioQ5eaXc7yWd67
218,NCT 127,NCT #127 CHERRY BOMB– The 3rd Mini Album,album,2017-06-14,7,spotify:album:7H5FkCA6cTDBX3wtvIbN8s,whiplash,201658,False,5,...,1,-6.119,0,0.182,0.0318,0.0,0.19,99.984,4,spotify:track:2iPxDFeN27VrKgGdZ0AIKw
156,NCT 127,NEO CITY : SEOUL– The Origin – The 1st Live Album,album,2019-10-24,25,spotify:album:5tS5Lpx9RSoJCoPPUuEnwy,welcome to my playground - live,240053,False,12,...,5,-4.639,1,0.103,0.459,0.0,0.635,94.032,4,spotify:track:0vQp6vS75JZSZoHAOrom93
192,NCT 127,NCT #127 Regulate - The 1st Album Repackage,album,2018-11-23,14,spotify:album:5iJsCigDROOTMgUpJ6ex2S,welcome to my playground,239133,False,4,...,5,-6.593,1,0.159,0.313,0.0,0.0427,94.074,4,spotify:track:56f5GO8ir5ucrcQ00k4a8t
166,NCT 127,Awaken,album,2019-04-17,12,spotify:album:1eneZzXf46fUJPBeOgjSx8,wakey-wakey,182385,False,2,...,10,-4.877,0,0.0494,0.0529,0.0,0.293,140.057,4,spotify:track:304px13jll5AIVxIU0IzuO


In [66]:
nct127_cleaned = nct127.drop_duplicates(['name'],keep='last').reset_index(drop=True)
nct127_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       141 non-null    object 
 1   album_name        141 non-null    object 
 2   album_type        141 non-null    object 
 3   release_date      141 non-null    object 
 4   total_tracks      141 non-null    object 
 5   album_uri         141 non-null    object 
 6   name              141 non-null    object 
 7   duration_ms       141 non-null    object 
 8   explicit          141 non-null    object 
 9   track_number      141 non-null    object 
 10  popularity        141 non-null    object 
 11  valence           141 non-null    float64
 12  danceability      141 non-null    float64
 13  energy            141 non-null    float64
 14  key               141 non-null    object 
 15  loudness          141 non-null    float64
 16  mode              141 non-null    object 
 1

In [72]:
nct127_cleaned.sort_values(by='release_date',ascending=False).head(50)

Unnamed: 0,artist_name,album_name,album_type,release_date,total_tracks,album_uri,name,duration_ms,explicit,track_number,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,tempo,time_signature,track_uri
0,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,faster,170626,False,1,...,1,-4.176,1,0.118,0.0251,0.0,0.241,96.973,4,spotify:track:46hCrsB4mC1IWcYOMQqNWA
2,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,time lapse,215306,False,3,...,6,-3.567,0,0.209,0.0589,0.0,0.288,84.901,4,spotify:track:3FxGTxebY5j4Etl1KbIIZl
3,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,crash landing,204800,False,4,...,1,-2.059,1,0.0904,0.199,0.0,0.0769,140.06,4,spotify:track:66eZq38bO1ofzmpNa02XiL
4,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,designer,237080,False,5,...,1,-3.379,1,0.0932,0.0978,0.0,0.111,92.965,4,spotify:track:4axTxw4EXSVSGm6f6vxTdZ
5,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,gold dust,249613,False,6,...,0,-5.477,1,0.0401,0.704,0.0,0.35,140.008,4,spotify:track:0Hj4YtlD0Q5O0srdZxJUtl
6,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,black clouds,218400,False,7,...,2,-4.305,1,0.0436,0.265,0.0,0.099,84.982,4,spotify:track:1n3XdyPzDSbbVoRUb9QSW7
7,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,playback,192546,False,8,...,6,-4.714,0,0.123,0.409,0.0,0.195,160.013,4,spotify:track:2138nuFxkpawpBEkhyzZee
8,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,tasty,214746,False,9,...,11,-3.933,0,0.136,0.00694,0.000124,0.0706,107.946,4,spotify:track:3NVskN6estUcP26ZaMcMMu
9,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,vitamin,184386,False,10,...,7,-3.139,1,0.045,0.152,0.0,0.0359,90.149,4,spotify:track:71mbhY822FlcBmxMgeuUgA
10,NCT 127,2 Baddies - The 4th Album,album,2022-09-16,12,spotify:album:6p80QT3z7kOHpYdnsItQTQ,lol (laugh-out-loud),219533,False,11,...,9,-3.101,1,0.0499,0.0945,0.0,0.0693,89.889,4,spotify:track:5P188CGPr5TKnhTrTGWme1


Do the same for other artists

In [78]:
nctu = tracks_albums_single[tracks_albums_single['artist_name']=='NCT U']
print(nctu.shape)
nctu_cleaned = nctu.drop_duplicates(['name'],keep='last').reset_index(drop=True)
print(nctu_cleaned.shape)

(84, 24)
(70, 24)


In [80]:
nctdream = tracks_albums_single[tracks_albums_single['artist_name']=='NCT DREAM']
print(nctdream.shape)
nctdream_cleaned = nctdream.drop_duplicates(['name'],keep='last').reset_index(drop=True)
print(nctdream_cleaned.shape)

(112, 24)
(90, 24)


In [85]:
wayv = tracks_albums_single[tracks_albums_single['artist_name']=='WayV']
print(wayv.shape)
wayv_cleaned = wayv.drop_duplicates(['name'],keep='last').reset_index(drop=True)
print(wayv_cleaned.shape)

(35, 24)
(32, 24)


Combine all tracks from all artists

In [89]:
tracks_albums_single_cleaned = pd.concat([nct127_cleaned, nctu_cleaned, nctdream_cleaned, wayv_cleaned]).reset_index(drop=True)
tracks_albums_single_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       333 non-null    object 
 1   album_name        333 non-null    object 
 2   album_type        333 non-null    object 
 3   release_date      333 non-null    object 
 4   total_tracks      333 non-null    object 
 5   album_uri         333 non-null    object 
 6   name              333 non-null    object 
 7   duration_ms       333 non-null    object 
 8   explicit          333 non-null    object 
 9   track_number      333 non-null    object 
 10  popularity        333 non-null    object 
 11  valence           333 non-null    float64
 12  danceability      333 non-null    float64
 13  energy            333 non-null    float64
 14  key               333 non-null    object 
 15  loudness          333 non-null    float64
 16  mode              333 non-null    object 
 1

In [91]:
tracks_albums_single_cleaned.to_csv('/home/projects/2022_oct_03_spotify/results/tracks_albums_single_cleaned.csv', index=False)