In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Authorization

In [None]:
# Create a request session
# The same Session object is reused for the token request and every API call below.
session = requests.Session()

In [None]:
# Application client ID sent in the token request below.
# NOTE(review): hardcoded here, unlike the secret which is read from a file;
# consider loading it from configuration as well.
client_id = '8c34b572ec864291a0899a2a1a209c9c'

In [None]:
# Read the client secret from a local file so its value is never shown in the notebook
with open('spotify_secret.txt') as secret_file:
    client_secret = secret_file.read().strip()

In [None]:
# URL that the token request below is POSTed to.
token_endpoint = 'https://accounts.spotify.com/api/token'

In [None]:
# Exchange the client credentials for an access token (client-credentials grant)
authorization = session.post(
    token_endpoint,
    data=dict(
        grant_type='client_credentials',
        client_id=client_id,
        client_secret=client_secret,
    ),
)

In [None]:
# Check Authorization Response
# Displays the HTTP status code of the token request; 200 indicates success.
authorization.status_code

In [None]:
# Stop with an HTTPError if the token request was rejected (4xx/5xx), instead of
# failing later with a KeyError on the missing 'access_token' field.
authorization.raise_for_status()
access_token = authorization.json()['access_token']

In [None]:
# Bearer-token header passed with every API request made below.
header={'Authorization':f'Bearer {access_token}'}

# Load Track Data CSV

Let's load the **csv** file we saved in Part 2.

In [None]:
# Load the saved tracks; the first CSV column is used as the DataFrame index.
track_df = pd.read_csv('tracks.csv', index_col=0)

In [None]:
# Preview the first rows of the loaded track data.
track_df.head()

# Data Collection - Collating Data

Let's review **masking** and using the `isin` function (very useful, separately and together).

In [None]:
# Artist IDs whose tracks are kept for the rest of the notebook
artist_ids = [
    '3Nrfpe0tUJi4K4DXYWgMUX',
    '7n2Ycct7Beij7Dj7meI4X0',
    '3HqSLMAZ3g3d5poNaI7GOU',
    '6jJ0s89eD6GaHleKKya26X',
    '4dpARuHxo51G3z768sgnrY',
    '6vWDO969PvNqNYHIOW5v0m',
]

In [None]:
# Boolean Series: True for rows whose 'artist id' is one of artist_ids.
mask = track_df['artist id'].isin(artist_ids)

In [None]:
# Keep only the rows flagged True by the mask
track_df = track_df.loc[mask]

In [None]:
# Preview the filtered rows.
track_df.head()

In [None]:
# Number of tracks left after filtering by artist.
len(track_df)

## Get Track Information

In [None]:
# Plain Python list of the track ids to look up
track_id_list = track_df['track id'].to_list()

In [None]:
# For every track id, fetch the track object and its audio features, copy the
# track's popularity into the audio-features dict, and store it keyed by track id.
track_info_data = {}
for track_id in tqdm(track_id_list):
    track_response = session.get(
        f'https://api.spotify.com/v1/tracks/{track_id}',
        headers=header
    )
    # Raise an HTTPError on 4xx/5xx responses instead of failing later with an
    # opaque KeyError on 'popularity'.
    track_response.raise_for_status()
    track_info = track_response.json()

    audio_response = session.get(
        f'https://api.spotify.com/v1/audio-features/{track_id}',
        headers=header
    )
    # Same check here, so an error payload is never stored as if it were features.
    audio_response.raise_for_status()
    track_audio = audio_response.json()

    track_audio['popularity'] = track_info['popularity']

    track_info_data[track_id] = track_audio

In [None]:
# One row per track: transpose the {track id: features} mapping, then turn the
# index into a regular 'track id' column
track_info_df = (
    pd.DataFrame(track_info_data)
    .T
    .reset_index()
    .rename(columns={'index': 'track id'})
)

In [None]:
# Preview the first rows of the collected track information.
track_info_df.head()

Let's merge the new dataset with our original track dataset, using the `track id` column as the join key:

<img src="https://miro.medium.com/max/1400/1*-uSHoxrzM57syqnKnms2iA.png"/>

<center><em><font size="1">Image from: https://towardsdatascience.com/3-key-differences-between-merge-and-concat-functions-of-pandas-ab2bab224b59</font></em></center>

In [None]:
# Join the track table with the collected track information on 'track id'
df = track_df.merge(track_info_df, on='track id')

In [None]:
# Filter the merged frame itself: `track_df_final` is never defined in this
# notebook, and using it would also discard the merge result from the cell above.
# track_df was already restricted to artist_ids before the merge, so this should
# keep every row; .copy() gives an independent frame for the later column drops.
df = df[df['artist id'].isin(artist_ids)].copy()

Let's get a view of our resulting dataset:

In [None]:
# Preview the first rows of the merged dataset.
df.head()

There are some columns that you won't be needing so remove them using `.drop`.

In [None]:
# Remove columns that are not needed for the rest of the analysis.
# Reassigning (instead of inplace=True) avoids mutating a filtered slice, and
# errors='ignore' lets this cell be re-run after the columns are already gone.
df = df.drop(columns=['type','id','uri','track_href','analysis_url'],
             errors='ignore')

Save this data set. You will be using this for the next workshop sessions:

In [None]:
# Write the merged dataset to disk; the index is written as the first CSV column.
df.to_csv('track_df.csv')

In [None]:
# Column names, non-null counts and dtypes of the saved dataset.
df.info()

# Exploratory Data Analysis

Let's do a little EDA in this section. These are some quick methods to get a little idea about your data. But first, let's drop the id columns. We won't be needing them for this section.

In [None]:
# Drop the id columns, then keep only the columns explored in this section
eda_df = df.drop(columns=['track id', 'artist id', 'album id'])[
    ['liveness', 'valence', 'tempo', 'popularity', 'artist']
]

In [None]:
# Inspect the column dtypes before converting them.
eda_df.info()

Some visualizations only accept floating-point values, so let's make sure all the necessary columns have this data type:

In [None]:
# Columns that are converted to float64 in the next cell
dtype_cols = [
    'liveness',
    'valence',
    'tempo',
    'popularity',
]

In [None]:
# Cast every column listed in dtype_cols to float64; other columns are left as they are
eda_df = eda_df.astype({column: 'float64' for column in dtype_cols})

In [None]:
# Confirm the listed columns are now float64.
eda_df.info()

You can use `.describe()` to get a few stats of the entire dataset:

In [None]:
# Summary statistics for the numeric columns.
eda_df.describe()

To see how many times each unique value occurs, which in this case corresponds to the number of tracks per artist, we use `.value_counts()`:

In [None]:
# Number of rows (tracks) per artist.
eda_df['artist'].value_counts()

Let's visualize this in a graph by using `.value_counts()` together with `.plot.bar()`:

In [None]:
# Bar chart of the number of tracks per artist (trailing ';' hides the Axes repr)
eda_df['artist'].value_counts().plot(kind='bar');

We can also get the mean value of each column per artist by using `.groupby().mean()`. Note that `.groupby()` on its own only defines the groups; it must be followed by an aggregation (e.g. `.mean()`) that says how to summarize the values in each group.

In [None]:
# Per-artist mean of every remaining column.
eda_df.groupby('artist').mean()

Let's view the artists' popularities in a graph. But to help us visualize it better, let's sort the values from highest to lowest popularity using `.sort_values()`:

In [None]:
# Mean popularity per artist, sorted from highest to lowest, as a bar chart
(
    eda_df.groupby('artist')[['popularity']]
    .mean()
    .sort_values('popularity', ascending=False)
    .plot.bar()
);

View the data in a pairplot:

In [None]:
# Pairwise relationships between the columns, coloured by artist.
sns.pairplot(eda_df, hue='artist');