# Part 1 - Data Ingestion and Transformation

## Imports

In [1]:
import pandas as pd
import numpy as np
import spotipy as sp
import spotipy
import sys
from spotipy.oauth2 import SpotifyClientCredentials
import pprint
import sqlite3
import math

import jupyter_black

jupyter_black.load()

# Load environment variables from .env files which is named in the .gitignore file to prevent accidental upload to Github
import os
from dotenv import load_dotenv

load_dotenv()

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

## API call

### Artists

In [2]:
def build_artist_table(artists: list):
    """Returns a dataframe of artist information from the list of artists"""
    artist_tables = []
    for artist in artists:
        artist_tables.append(return_artist_info(artist))
    return pd.DataFrame(artist_tables)

In [3]:
def return_artist_info(artist: str):
    """Returns a dictionary of artist information from Spotify API.
    If there are multiple genres or images, it chooses the first option."""
    name = artist.lower()
    results = spotify.search(q="artist:" + name, type="artist")
    items = results["artists"]["items"]
    try:
        artist = items[0]
    except:
        raise Exception("No artist found")
    artist_table = {
        "artist_id": artist["id"],
        "artist_name": artist["name"],
        "external_url": artist["external_urls"]["spotify"],
        "genre": artist["genres"][0],
        "image_url": artist["images"][0]["url"],
        "followers": artist["followers"]["total"],
        "popularity": artist["popularity"],
        "type": artist["type"],
        "artist_uri": artist["uri"],
    }
    return artist_table

In [4]:
def return_artist_info(artist: str):
    """Returns a dictionary of artist information from Spotify API.
    If there are multiple genres or images, it chooses the first option."""
    name = artist.lower()
    results = spotify.search(q="artist:" + name, type="artist")
    items = results["artists"]["items"]
    try:
        artist = items[0]
    except:
        raise Exception("No artist found")
    artist_table = {
        "artist_id": artist["id"],
        "artist_name": artist["name"],
        "external_url": artist["external_urls"]["spotify"],
        "genre": artist["genres"][0],
        "image_url": artist["images"][0]["url"],
        "followers": artist["followers"]["total"],
        "popularity": artist["popularity"],
        "type": artist["type"],
        "artist_uri": artist["uri"],
    }
    return artist_table

### Album

In [5]:
def build_album_table(artists_table: pd.DataFrame):
    """Returns a dataframe of all albums for the artists in the given artists_table, which was generate using build_artist_table."""
    for i in artists_table.index:
        artist_uri = artists_table.loc[i, "artist_uri"]
        artist_id = artists_table.loc[i, "artist_id"]
        artist_album_table = return_artist_album_table(artist_uri, artist_id)
        if i == 0:
            album_table = artist_album_table
        else:
            album_table = pd.concat([album_table, artist_album_table]).reset_index(
                drop=True
            )
    album_table = return_unique_albums(album_table)
    return album_table

In [6]:
def return_unique_albums(album_table: pd.DataFrame):
    """Returns a dataframe of that includes only the earliest version of an album, for example excludes deluxe versions or live concerts"""
    album_table.sort_values("release_date", ascending=True, inplace=True)
    album_table["temp_album_name"] = [
        x[0].rstrip() for x in album_table["album_name"].str.split("(")
    ]
    for i in album_table.index:
        if i not in album_table.index:
            continue
        album = album_table.loc[i, "temp_album_name"]
        artist = album_table.loc[i, "artist_id"]
        duplicate_albums = album_table.query(
            "(album_name.str.find(@album) != -1) & (artist_id == @artist)"
        )
        if duplicate_albums.shape[0] > 1:
            first_album_index = duplicate_albums.index[0]
            duplicate_album_indices = duplicate_albums.index[1:]
            album_table.drop(duplicate_album_indices, inplace=True, errors="ignore")
    live_albums_indices = album_table.query('album_name.str.find("Live") != -1').index
    album_table.drop(live_albums_indices, inplace=True, errors="ignore")
    album_table = (
        album_table.drop_duplicates(subset="temp_album_name")
        .drop(columns=["temp_album_name"])
        .reset_index(drop=True)
    )
    return album_table

In [7]:
def return_artist_album_table(artist_uri: str, artist_id: str):
    """Returns a dataframe of album(s) information for a given artist from Spotify API"""
    results = spotify.artist_albums(
        artist_id=artist_uri, album_type="album", country="US"
    )
    items = results["items"]
    if len(items) == 0:
        raise Exception("No albums found")
    albums_table = []
    for album in items:
        albums_table.append(return_album_info(album, artist_id))
    return pd.DataFrame(albums_table)

In [8]:
def return_album_info(album: dict, artist_id: str):
    """Returns a dictionary of album information from the provided artist item from the Spotify API file"""
    album_table = {
        "album_id": album["id"],
        "album_name": album["name"],
        "external_url": album["external_urls"]["spotify"],
        "image_url": album["images"][0]["url"],
        "release_date": album["release_date"],
        "total_tracks": album["total_tracks"],
        "type": album["type"],
        "album_uri": album["uri"],
        "artist_id": artist_id,
    }
    return album_table

### Tracks

In [9]:
def build_track_table(album_table: pd.DataFrame):
    """Returns a dataframe of all tracks for the albums in the given album_table, which was generate using build_album_table."""
    for i in album_table.index:
        album_id = album_table.loc[i, "album_id"]
        album_tracks_table = return_album_tracks(album_id)
        if i == 0:
            tracks_table = album_tracks_table
        else:
            tracks_table = pd.concat([tracks_table, album_tracks_table])
    return tracks_table.reset_index(drop=True)

In [10]:
def return_album_tracks(album_id: str):
    """Returns a dataframe of album tracks for a given album_id"""
    results = spotify.album_tracks(album_id=album_id, limit=50, offset=0)
    items = results["items"]
    if len(items) == 0:
        raise Exception("No tracks found")
    tracks_table = []
    for track in items:
        tracks_table.append(return_track_info(track, album_id))
    return pd.DataFrame(tracks_table)

In [11]:
def return_track_info(track: dict, album_id: str):
    """Returns a dictionary of track information from the provided track item from the Spotify API file"""
    track_table = {
        "track_id": track["id"],
        "song_name": track["name"],
        "external_url": track["external_urls"]["spotify"],
        "duration_ms": track["duration_ms"],
        "explicit": track["explicit"],
        "disc_number": track["disc_number"],
        "type": track["type"],
        "song_uri": track["uri"],
        "album_id": album_id,
    }
    return track_table

### Track Feature

In [12]:
def build_track_feature_table(track_table: pd.DataFrame):
    """Returns a dataframe of all tracks features for the track in the given track_table, which was generated using build_track_table."""
    # Can only call 100 ids at a time
    hundreds_of_tracks = math.ceil(track_table.shape[0] / 100)
    for i in range(0, hundreds_of_tracks):
        track_ids = track_table.loc[(i * 100) : ((i + 1) * 100) - 1, "track_id"]
        new_track_feature_table = return_track_feature_table(track_ids)
        if i == 0:
            track_feature_table = new_track_feature_table
        else:
            track_feature_table = pd.concat(
                [track_feature_table, new_track_feature_table]
            )
    return track_feature_table.reset_index(drop=True)

In [13]:
def return_track_feature_table(track_ids: list):
    """Returns a dataframe of track features from a given list of track_ids"""
    results = spotify.audio_features(track_ids)
    if len(results) == 0:
        raise Exception("No tracks found")
    tracks_table = []
    for track_id, track in zip(track_ids, results):
        tracks_table.append(return_track_feature_info(track, track_id))
    return pd.DataFrame(tracks_table)

In [14]:
def return_track_feature_info(track: dict, track_id: str):
    """Returns a dictionary of track feature information for a given track_id from results of a Spotipy API call"""
    if track is None:
        keys = [
            "track_id",
            "danceability",
            "energy",
            "instrumentalness",
            "liveness",
            "loudness",
            "speechiness",
            "tempo",
            "type",
            "valence",
            "song_uri",
        ]
        track_feature_dict = dict(zip(keys, [np.nan] * 11))
        track_feature_dict["song_uri"] = track_id
    track_feature_dict = {
        "track_id": track["id"],
        "danceability": track["danceability"],
        "energy": track["energy"],
        "instrumentalness": track["instrumentalness"],
        "liveness": track["liveness"],
        "loudness": track["loudness"],
        "speechiness": track["speechiness"],
        "tempo": track["tempo"],
        "type": track["type"],
        "valence": track["valence"],
        "song_uri": track["uri"],
    }
    return track_feature_dict

# Example API Call

In [15]:
artists = [
    "Taylor Swift",
    "Shakira",
    "Phil Collins",
    "Billie Eilish",
    "Camilo",
    "Bad Bunny",
    "Harry Styles",
    "Sublime",
    "Eric Clapton",
    "Lizzo",
    "Adele",
    "Maroon 5",
    "Ed Sheeran",
    "Enrique Iglesias",
    "Coldplay",
    "Lady Gaga",
    "Beyonce",
    "Britney Spears",
    "Queen",
    "Eagles",
]

artist_table = build_artist_table(artists)
album_table = build_album_table(artist_table)
track_table = build_track_table(album_table)
track_feature_table = build_track_feature_table(track_table)

In [16]:
artist_table

Unnamed: 0,artist_id,artist_name,external_url,genre,image_url,followers,popularity,type,artist_uri
0,06HL4z0CvFAxyc27GXpf02,Taylor Swift,https://open.spotify.com/artist/06HL4z0CvFAxyc...,pop,https://i.scdn.co/image/ab6761610000e5ebfcf7c3...,58988141,94,artist,spotify:artist:06HL4z0CvFAxyc27GXpf02
1,0EmeFodog0BfCgMzAIvKQp,Shakira,https://open.spotify.com/artist/0EmeFodog0BfCg...,colombian pop,https://i.scdn.co/image/ab6761610000e5eb284894...,25002007,85,artist,spotify:artist:0EmeFodog0BfCgMzAIvKQp
2,4lxfqrEsLX6N1N4OCSkILp,Phil Collins,https://open.spotify.com/artist/4lxfqrEsLX6N1N...,mellow gold,https://i.scdn.co/image/31fbe64783eb5d49316164...,4740687,75,artist,spotify:artist:4lxfqrEsLX6N1N4OCSkILp
3,6qqNVTkY8uBg9cP3Jd7DAH,Billie Eilish,https://open.spotify.com/artist/6qqNVTkY8uBg9c...,art pop,https://i.scdn.co/image/ab6761610000e5ebd8b998...,68915938,88,artist,spotify:artist:6qqNVTkY8uBg9cP3Jd7DAH
4,28gNT5KBp7IjEOQoevXf9N,Camilo,https://open.spotify.com/artist/28gNT5KBp7IjEO...,colombian pop,https://i.scdn.co/image/ab6761610000e5ebc85ae7...,16418116,81,artist,spotify:artist:28gNT5KBp7IjEOQoevXf9N
5,4q3ewBCX7sLwd24euuV69X,Bad Bunny,https://open.spotify.com/artist/4q3ewBCX7sLwd2...,reggaeton,https://i.scdn.co/image/ab6761610000e5eb8ee9a6...,56219718,100,artist,spotify:artist:4q3ewBCX7sLwd24euuV69X
6,6KImCVD70vtIoJWnq6nGn3,Harry Styles,https://open.spotify.com/artist/6KImCVD70vtIoJ...,pop,https://i.scdn.co/image/ab6761610000e5ebf7db7c...,23451498,91,artist,spotify:artist:6KImCVD70vtIoJWnq6nGn3
7,0EdvGhlC1FkGItLOWQzG4J,Sublime,https://open.spotify.com/artist/0EdvGhlC1FkGIt...,reggae fusion,https://i.scdn.co/image/ab6761610000e5ebe72628...,2331770,70,artist,spotify:artist:0EdvGhlC1FkGItLOWQzG4J
8,6PAt558ZEZl0DmdXlnjMgD,Eric Clapton,https://open.spotify.com/artist/6PAt558ZEZl0Dm...,blues rock,https://i.scdn.co/image/ab6772690000c46ca60e8f...,5015253,71,artist,spotify:artist:6PAt558ZEZl0DmdXlnjMgD
9,56oDRnqbIiwx4mymNEv7dS,Lizzo,https://open.spotify.com/artist/56oDRnqbIiwx4m...,dance pop,https://i.scdn.co/image/ab6761610000e5eb0d66b3...,5019711,80,artist,spotify:artist:56oDRnqbIiwx4mymNEv7dS


In [17]:
artist_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_id     20 non-null     object
 1   artist_name   20 non-null     object
 2   external_url  20 non-null     object
 3   genre         20 non-null     object
 4   image_url     20 non-null     object
 5   followers     20 non-null     int64 
 6   popularity    20 non-null     int64 
 7   type          20 non-null     object
 8   artist_uri    20 non-null     object
dtypes: int64(2), object(7)
memory usage: 1.5+ KB


In [32]:
album_table

Unnamed: 0,album_id,album_name,external_url,image_url,release_date,total_tracks,type,album_uri,artist_id
0,51B7LbLWgYLKBVSpkan8Z7,Eagles (2013 Remaster),https://open.spotify.com/album/51B7LbLWgYLKBVS...,https://i.scdn.co/image/ab67616d0000b273c13acd...,1972,10,album,spotify:album:51B7LbLWgYLKBVSpkan8Z7,0ECwFtbIWEVNwjlrfc6xoL
1,09WBxbis5Sixt01FVMs8UM,Desperado (2013 Remaster),https://open.spotify.com/album/09WBxbis5Sixt01...,https://i.scdn.co/image/ab67616d0000b2732d73b1...,1973,11,album,spotify:album:09WBxbis5Sixt01FVMs8UM,0ECwFtbIWEVNwjlrfc6xoL
2,2iCHyD9XHtA3vJFJIuXzqu,On the Border (2013 Remaster),https://open.spotify.com/album/2iCHyD9XHtA3vJF...,https://i.scdn.co/image/ab67616d0000b273a7606c...,1974,10,album,spotify:album:2iCHyD9XHtA3vJFJIuXzqu,0ECwFtbIWEVNwjlrfc6xoL
3,0F77QekrNe8vVAjU2sepja,One of These Nights (2013 Remaster),https://open.spotify.com/album/0F77QekrNe8vVAj...,https://i.scdn.co/image/ab67616d0000b2735d0a8e...,1975,9,album,spotify:album:0F77QekrNe8vVAjU2sepja,0ECwFtbIWEVNwjlrfc6xoL
4,5NMAdQPrKw5nutWnGEzfpn,Hotel California (40th Anniversary Expanded Ed...,https://open.spotify.com/album/5NMAdQPrKw5nutW...,https://i.scdn.co/image/ab67616d0000b273d66618...,1976-12-08,19,album,spotify:album:5NMAdQPrKw5nutWnGEzfpn,0ECwFtbIWEVNwjlrfc6xoL
...,...,...,...,...,...,...,...,...,...
142,5r36AJ6VOJtp00oxSkBZ5h,Harry's House,https://open.spotify.com/album/5r36AJ6VOJtp00o...,https://i.scdn.co/image/ab67616d0000b2732e8ed7...,2022-05-20,13,album,spotify:album:5r36AJ6VOJtp00oxSkBZ5h,6KImCVD70vtIoJWnq6nGn3
143,3tjIKRAPBy5Qu4z8F5HmBz,Top Gun: Maverick (Music From The Motion Picture),https://open.spotify.com/album/3tjIKRAPBy5Qu4z...,https://i.scdn.co/image/ab67616d0000b27302701c...,2022-05-27,12,album,spotify:album:3tjIKRAPBy5Qu4z8F5HmBz,1HY2Jd0NmPuamShAr6KMms
144,1KtDsGsSRGbnmH07v5hB1I,Special,https://open.spotify.com/album/1KtDsGsSRGbnmH0...,https://i.scdn.co/image/ab67616d0000b273caa75a...,2022-07-14,12,album,spotify:album:1KtDsGsSRGbnmH07v5hB1I,56oDRnqbIiwx4mymNEv7dS
145,6FJxoadUE4JNVwWHghBwnb,RENAISSANCE,https://open.spotify.com/album/6FJxoadUE4JNVwW...,https://i.scdn.co/image/ab67616d0000b2730e58a0...,2022-07-29,16,album,spotify:album:6FJxoadUE4JNVwWHghBwnb,6vWDO969PvNqNYHIOW5v0m


In [20]:
album_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   album_id      147 non-null    object
 1   album_name    147 non-null    object
 2   external_url  147 non-null    object
 3   image_url     147 non-null    object
 4   release_date  147 non-null    object
 5   total_tracks  147 non-null    int64 
 6   type          147 non-null    object
 7   album_uri     147 non-null    object
 8   artist_id     147 non-null    object
dtypes: int64(1), object(8)
memory usage: 10.5+ KB


In [21]:
track_table

Unnamed: 0,track_id,song_name,external_url,duration_ms,explicit,disc_number,type,song_uri,album_id
0,4yugZvBYaoREkJKtbG08Qr,Take It Easy - 2013 Remaster,https://open.spotify.com/track/4yugZvBYaoREkJK...,211577,False,1,track,spotify:track:4yugZvBYaoREkJKtbG08Qr,51B7LbLWgYLKBVSpkan8Z7
1,436yrzQWA32vb1sTZKXg9r,Witchy Woman - 2013 Remaster,https://open.spotify.com/track/436yrzQWA32vb1s...,250888,False,1,track,spotify:track:436yrzQWA32vb1sTZKXg9r,51B7LbLWgYLKBVSpkan8Z7
2,1uNrJfFUItP86a1m19Yr5n,Chug All Night - 2013 Remaster,https://open.spotify.com/track/1uNrJfFUItP86a1...,195655,False,1,track,spotify:track:1uNrJfFUItP86a1m19Yr5n,51B7LbLWgYLKBVSpkan8Z7
3,2j7KL0fcBKEh6P8z0VxIVr,Most of Us Are Sad - 2013 Remaster,https://open.spotify.com/track/2j7KL0fcBKEh6P8...,215420,False,1,track,spotify:track:2j7KL0fcBKEh6P8z0VxIVr,51B7LbLWgYLKBVSpkan8Z7
4,59OUN6rPXLBK6bkhhwQkP5,Nightingale - 2013 Remaster,https://open.spotify.com/track/59OUN6rPXLBK6bk...,245598,False,1,track,spotify:track:59OUN6rPXLBK6bkhhwQkP5,51B7LbLWgYLKBVSpkan8Z7
...,...,...,...,...,...,...,...,...,...
2256,2OlxJKrbyLmzAO14EgV53K,Naturaleza,https://open.spotify.com/track/2OlxJKrbyLmzAO1...,162880,False,1,track,spotify:track:2OlxJKrbyLmzAO14EgV53K,1UTDgnpHmthIsdzSxbhpV2
2257,7mCC2SBkb5uwmQ4Ozewtbc,NASA,https://open.spotify.com/track/7mCC2SBkb5uwmQ4...,186000,False,1,track,spotify:track:7mCC2SBkb5uwmQ4Ozewtbc,1UTDgnpHmthIsdzSxbhpV2
2258,0j3ULEzS14shcTGdiYwsOI,Pegao,https://open.spotify.com/track/0j3ULEzS14shcTG...,160680,False,1,track,spotify:track:0j3ULEzS14shcTGdiYwsOI,1UTDgnpHmthIsdzSxbhpV2
2259,3CzgTXrAROSCB6bd1bm5mJ,Pesadilla,https://open.spotify.com/track/3CzgTXrAROSCB6b...,189826,False,1,track,spotify:track:3CzgTXrAROSCB6bd1bm5mJ,1UTDgnpHmthIsdzSxbhpV2


In [22]:
track_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2261 entries, 0 to 2260
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   track_id      2261 non-null   object
 1   song_name     2261 non-null   object
 2   external_url  2261 non-null   object
 3   duration_ms   2261 non-null   int64 
 4   explicit      2261 non-null   bool  
 5   disc_number   2261 non-null   int64 
 6   type          2261 non-null   object
 7   song_uri      2261 non-null   object
 8   album_id      2261 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 143.6+ KB


In [23]:
track_feature_table

Unnamed: 0,track_id,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,type,valence,song_uri
0,4yugZvBYaoREkJKtbG08Qr,0.575,0.670,0.000005,0.1290,-10.390,0.0318,139.191,audio_features,0.740,spotify:track:4yugZvBYaoREkJKtbG08Qr
1,436yrzQWA32vb1sTZKXg9r,0.555,0.568,0.001670,0.1160,-12.205,0.0341,98.507,audio_features,0.583,spotify:track:436yrzQWA32vb1sTZKXg9r
2,1uNrJfFUItP86a1m19Yr5n,0.537,0.770,0.000046,0.0970,-8.896,0.0534,138.471,audio_features,0.644,spotify:track:1uNrJfFUItP86a1m19Yr5n
3,2j7KL0fcBKEh6P8z0VxIVr,0.559,0.251,0.005730,0.0866,-14.288,0.0262,109.551,audio_features,0.285,spotify:track:2j7KL0fcBKEh6P8z0VxIVr
4,59OUN6rPXLBK6bkhhwQkP5,0.647,0.625,0.006360,0.2760,-9.347,0.0288,135.046,audio_features,0.679,spotify:track:59OUN6rPXLBK6bkhhwQkP5
...,...,...,...,...,...,...,...,...,...,...,...
2256,2OlxJKrbyLmzAO14EgV53K,0.891,0.506,0.000016,0.0692,-1.980,0.1590,148.016,audio_features,0.938,spotify:track:2OlxJKrbyLmzAO14EgV53K
2257,7mCC2SBkb5uwmQ4Ozewtbc,0.751,0.638,0.000000,0.3570,-5.569,0.1600,149.943,audio_features,0.791,spotify:track:7mCC2SBkb5uwmQ4Ozewtbc
2258,0j3ULEzS14shcTGdiYwsOI,0.812,0.502,0.000000,0.1530,-4.763,0.0632,90.057,audio_features,0.883,spotify:track:0j3ULEzS14shcTGdiYwsOI
2259,3CzgTXrAROSCB6bd1bm5mJ,0.805,0.484,0.000000,0.0979,-4.984,0.1490,143.873,audio_features,0.653,spotify:track:3CzgTXrAROSCB6bd1bm5mJ


In [24]:
track_feature_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2261 entries, 0 to 2260
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          2261 non-null   object 
 1   danceability      2261 non-null   float64
 2   energy            2261 non-null   float64
 3   instrumentalness  2261 non-null   float64
 4   liveness          2261 non-null   float64
 5   loudness          2261 non-null   float64
 6   speechiness       2261 non-null   float64
 7   tempo             2261 non-null   float64
 8   type              2261 non-null   object 
 9   valence           2261 non-null   float64
 10  song_uri          2261 non-null   object 
dtypes: float64(8), object(3)
memory usage: 194.4+ KB


## Load to Database

In [25]:
def store_tables_in_db(table_names: [str], tables: [pd.DataFrame], db: str):
    """Saves the tables as the provided table_names into the given db"""
    conn = sqlite3.connect(f"../{db}.db")
    c = conn.cursor()
    for table_name, table in zip(table_names, tables):
        insert_table(table_name, table, c, conn)

In [26]:
def insert_table(
    table_name: str, table: pd.DataFrame, c: sqlite3.Cursor, conn: sqlite3.Connection
):
    columns = ", ".join(table.columns)
    c.execute(f"""CREATE TABLE IF NOT EXISTS {table_name} ({columns})""")
    table.to_sql(table_name, conn, if_exists="replace", index=False)

In [27]:
table_names = ["artist", "album", "track", "track_feature"]
tables = [artist_table, album_table, track_table, track_feature_table]

store_tables_in_db(table_names, tables, "spotify")