# Data preparation

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## Data Prep

In [5]:
# read and process the charts dataset
charts_df = pd.read_csv('data/spotify_daily_charts.csv')
#transform date column into a datetime column
charts_df['date'] = pd.to_datetime(charts_df['date'])
charts_df.head()

Unnamed: 0,date,position,track_id,track_name,artist,streams
0,2017-01-01,1,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor,Bruno Mars,185236
1,2017-01-01,2,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go,James Arthur,180552
2,2017-01-01,3,7BKLCZ1jbUBVqRi2FVlTVw,Closer,The Chainsmokers,158720
3,2017-01-01,4,2rizacJSyD9S1IQUxUxnsK,All We Know,The Chainsmokers,130874
4,2017-01-01,5,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know,Maroon 5,129656


In [6]:
# read and process the tracks dataset
tracks_df = pd.read_csv('data/spotify_daily_charts_tracks.csv')
tracks_df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,4PgleR09JVnm3zY1fW3XBA,261240,2016-11-17,76,0.578,0.574,2.0,-6.209,1.0,0.0454,0.196,0.0,0.083,0.301,174.152
1,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go,4IWBUUAFIplrNtaOHcJPRM,James Arthur,7oiJYvEJHsmYtrgviAVIBD,211466,2016-10-28,85,0.358,0.557,10.0,-7.398,1.0,0.059,0.695,0.0,0.0902,0.494,85.043
2,7BKLCZ1jbUBVqRi2FVlTVw,Closer,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0rSLgV8p5FzfnqlEk4GzxE,244960,2016-07-29,85,0.748,0.524,8.0,-5.599,1.0,0.0338,0.414,0.0,0.111,0.661,95.01
3,2rizacJSyD9S1IQUxUxnsK,All We Know,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0xmaV6EtJ4M3ebZUPRnhyb,194080,2016-09-29,69,0.662,0.586,0.0,-8.821,1.0,0.0307,0.097,0.00272,0.115,0.296,90.0
4,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know,04gDigrS5kc9YWfZHwBETP,Maroon 5,0fvTn3WXF39kQs9i3bnNpP,214480,2016-10-11,0,0.783,0.623,7.0,-6.126,1.0,0.08,0.338,0.0,0.0975,0.447,100.048


In [7]:
#merge the charts and tracks dataset
df = charts_df.merge(tracks_df, on='track_id', how='left')
#
df = df.drop(columns='track_name_y')
df = df.rename(columns={'track_name_x':'track_name'})
df.head()

Unnamed: 0,date,position,track_id,track_name,artist,streams,artist_id,artist_name,album_id,duration,...,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2017-01-01,1,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor,Bruno Mars,185236,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,4PgleR09JVnm3zY1fW3XBA,261240,...,0.574,2.0,-6.209,1.0,0.0454,0.196,0.0,0.083,0.301,174.152
1,2017-01-01,2,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go,James Arthur,180552,4IWBUUAFIplrNtaOHcJPRM,James Arthur,7oiJYvEJHsmYtrgviAVIBD,211466,...,0.557,10.0,-7.398,1.0,0.059,0.695,0.0,0.0902,0.494,85.043
2,2017-01-01,3,7BKLCZ1jbUBVqRi2FVlTVw,Closer,The Chainsmokers,158720,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0rSLgV8p5FzfnqlEk4GzxE,244960,...,0.524,8.0,-5.599,1.0,0.0338,0.414,0.0,0.111,0.661,95.01
3,2017-01-01,4,2rizacJSyD9S1IQUxUxnsK,All We Know,The Chainsmokers,130874,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0xmaV6EtJ4M3ebZUPRnhyb,194080,...,0.586,0.0,-8.821,1.0,0.0307,0.097,0.00272,0.115,0.296,90.0
4,2017-01-01,5,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know,Maroon 5,129656,04gDigrS5kc9YWfZHwBETP,Maroon 5,0fvTn3WXF39kQs9i3bnNpP,214480,...,0.623,7.0,-6.126,1.0,0.08,0.338,0.0,0.0975,0.447,100.048


## Save to File CSV

In [None]:
df.to_csv("data/spotify_combined.csv" index=False)

### Load the Artists csv

In [5]:
# already in the spotify_daily_charts_artists.csv
df_artists = pd.read_csv("data/spotify_daily_charts_artists.csv")
df_artists.head()

Unnamed: 0,artist_id,artist_name,total_followers,genres,popularity
0,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,30585244,"['dance pop', 'pop']",95
1,4IWBUUAFIplrNtaOHcJPRM,James Arthur,8035001,"['pop', 'post-teen pop', 'talent show', 'uk pop']",87
2,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,17865117,"['dance pop', 'edm', 'electropop', 'pop', 'pop...",86
3,04gDigrS5kc9YWfZHwBETP,Maroon 5,30759087,"['pop', 'pop rock']",92
4,5p7f24Rk5HkUZsaS3BLG5F,Hailee Steinfeld,6760239,"['dance pop', 'pop', 'post-teen pop']",79


In [6]:
#merge the df dataset with df_artists
df_with_genres = df.merge(df_artists, on='artist_id', how='left')
df_with_genres.columns.sort_values()


Index(['acousticness', 'album_id', 'artist', 'artist_id', 'artist_name_x',
       'artist_name_y', 'danceability', 'date', 'duration', 'energy', 'genres',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'popularity_x', 'popularity_y', 'position', 'release_date',
       'speechiness', 'streams', 'tempo', 'total_followers', 'track_id',
       'track_name', 'valence'],
      dtype='object')

In [7]:
df_with_genres.drop(columns=['artist_name_y','popularity_y'],inplace=True)
df_with_genres.columns.sort_values()

Index(['acousticness', 'album_id', 'artist', 'artist_id', 'artist_name_x',
       'danceability', 'date', 'duration', 'energy', 'genres',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'popularity_x', 'position', 'release_date', 'speechiness', 'streams',
       'tempo', 'total_followers', 'track_id', 'track_name', 'valence'],
      dtype='object')

In [8]:
df_with_genres = df_with_genres.rename(columns={'artist_name_x':'artist_name'})
df_with_genres.columns.sort_values()

Index(['acousticness', 'album_id', 'artist', 'artist_id', 'artist_name',
       'danceability', 'date', 'duration', 'energy', 'genres',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'popularity_x', 'position', 'release_date', 'speechiness', 'streams',
       'tempo', 'total_followers', 'track_id', 'track_name', 'valence'],
      dtype='object')

In [9]:
df_with_genres = df_with_genres.rename(columns={'popularity_x':'popularity'})
df_with_genres.columns.sort_values()

Index(['acousticness', 'album_id', 'artist', 'artist_id', 'artist_name',
       'danceability', 'date', 'duration', 'energy', 'genres',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity',
       'position', 'release_date', 'speechiness', 'streams', 'tempo',
       'total_followers', 'track_id', 'track_name', 'valence'],
      dtype='object')

### Save with genres

In [10]:
# Run only ones
df_with_genres.to_csv("data/spotify_combined_with_genres.csv", index=False)

In [11]:
df_with_genres.head()

Unnamed: 0,date,position,track_id,track_name,artist,streams,artist_id,artist_name,album_id,duration,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,total_followers,genres
0,2017-01-01,1,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor,Bruno Mars,185236,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,4PgleR09JVnm3zY1fW3XBA,261240,...,-6.209,1.0,0.0454,0.196,0.0,0.083,0.301,174.152,30585244,"['dance pop', 'pop']"
1,2017-01-01,2,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go,James Arthur,180552,4IWBUUAFIplrNtaOHcJPRM,James Arthur,7oiJYvEJHsmYtrgviAVIBD,211466,...,-7.398,1.0,0.059,0.695,0.0,0.0902,0.494,85.043,8035001,"['pop', 'post-teen pop', 'talent show', 'uk pop']"
2,2017-01-01,3,7BKLCZ1jbUBVqRi2FVlTVw,Closer,The Chainsmokers,158720,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0rSLgV8p5FzfnqlEk4GzxE,244960,...,-5.599,1.0,0.0338,0.414,0.0,0.111,0.661,95.01,17865117,"['dance pop', 'edm', 'electropop', 'pop', 'pop..."
3,2017-01-01,4,2rizacJSyD9S1IQUxUxnsK,All We Know,The Chainsmokers,130874,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0xmaV6EtJ4M3ebZUPRnhyb,194080,...,-8.821,1.0,0.0307,0.097,0.00272,0.115,0.296,90.0,17865117,"['dance pop', 'edm', 'electropop', 'pop', 'pop..."
4,2017-01-01,5,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know,Maroon 5,129656,04gDigrS5kc9YWfZHwBETP,Maroon 5,0fvTn3WXF39kQs9i3bnNpP,214480,...,-6.126,1.0,0.08,0.338,0.0,0.0975,0.447,100.048,30759087,"['pop', 'pop rock']"


In [12]:
df_with_genres.isnull().sum()

date                 0
position             0
track_id             0
track_name          20
artist              20
streams              0
artist_id            0
artist_name         20
album_id             0
duration             0
release_date         0
popularity           0
danceability        63
energy              63
key                 63
loudness            63
mode                63
speechiness         63
acousticness        63
instrumentalness    63
liveness            63
valence             63
tempo               63
total_followers      0
genres               0
dtype: int64

In [13]:
df.isnull().sum()

date                 0
position             0
track_id             0
track_name          10
artist              10
streams              0
artist_id            0
artist_name         10
album_id             0
duration             0
release_date         0
popularity           0
danceability        63
energy              63
key                 63
loudness            63
mode                63
speechiness         63
acousticness        63
instrumentalness    63
liveness            63
valence             63
tempo               63
dtype: int64