# Part 1 - Data Ingestion and Transformation

## Imports

In [30]:
import pandas as pd
import numpy as np
import spotipy as sp
import spotipy
import sys
from spotipy.oauth2 import SpotifyClientCredentials
import pprint
import sqlite3

import jupyter_black

jupyter_black.load()

# Load environment variables from .env files which is named in the .gitignore file to prevent accidental upload to Github
import os
from dotenv import load_dotenv

load_dotenv()

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

## API call

### Artists

In [31]:
def build_artist_table(artists: list):
    """Returns a dataframe of artist information from the list of artists"""
    artist_tables = []
    for artist in artists:
        artist_tables.append(return_artist_info(artist))
    return pd.DataFrame(artist_tables)

In [32]:
def return_artist_info(artist: str):
    """Returns a dictionary of artist information from Spotify API.
    If there are multiple genres or images, it chooses the first option."""
    name = artist.lower()
    results = spotify.search(q="artist:" + name, type="artist")
    items = results["artists"]["items"]
    try:
        artist = items[0]
    except:
        raise Exception("No artist found")
    artist_table = {
        "artist_id": artist["id"],
        "artist_name": artist["name"],
        "external_url": artist["external_urls"]["spotify"],
        "genre": artist["genres"][0],
        "image_url": artist["images"][0]["url"],
        "followers": artist["followers"]["total"],
        "popularity": artist["popularity"],
        "type": artist["type"],
        "artist_uri": artist["uri"],
    }
    return artist_table

### Album

In [33]:
def build_album_table(artists_table: pd.DataFrame):
    """Returns a dataframe of all albums for the artists in the given artists_table, which was generate using build_artist_table."""
    for i in artists_table.index:
        artist_uri = artists_table.loc[i, "artist_uri"]
        artist_id = artists_table.loc[i, "artist_id"]
        artist_album_table = return_artist_album_table(artist_uri, artist_id)
        if i == 0:
            album_table = artist_album_table
        else:
            album_table = pd.concat([album_table, artist_album_table])
    return album_table.reset_index(drop=True)

In [34]:
def return_artist_album_table(artist_uri: str, artist_id: str):
    """Returns a dataframe of album(s) information for a given artist from Spotify API"""
    results = spotify.artist_albums(
        artist_id=artist_uri, album_type="album", country="US"
    )
    items = results["items"]
    if len(items) == 0:
        raise Exception("No albums found")
    albums_table = []
    for album in items:
        albums_table.append(return_album_info(album, artist_id))
    return pd.DataFrame(albums_table)

In [35]:
def return_album_info(album: dict, artist_id: str):
    """Returns a dictionary of album information from the provided artist item from the Spotify API file"""
    album_table = {
        "album_id": album["id"],
        "album_name": album["name"],
        "external_url": album["external_urls"]["spotify"],
        "image_url": album["images"][0]["url"],
        "release_date": album["release_date"],
        "total_tracks": album["total_tracks"],
        "type": album["type"],
        "album_uri": album["uri"],
        "artist_id": artist_id,
    }
    return album_table

### Tracks

In [36]:
def build_track_table(album_table: pd.DataFrame):
    """Returns a dataframe of all tracks for the albums in the given album_table, which was generate using build_album_table."""
    for i in album_table.index:
        album_id = album_table.loc[i, "album_id"]
        album_tracks_table = return_album_tracks(album_id)
        if i == 0:
            tracks_table = album_tracks_table
        else:
            tracks_table = pd.concat([tracks_table, album_tracks_table])
    return tracks_table.reset_index(drop=True)

In [37]:
def return_album_tracks(album_id: str):
    """Returns a dataframe of album tracks for a given album_id"""
    results = spotify.album_tracks(album_id=album_id, limit=50, offset=0)
    items = results["items"]
    if len(items) == 0:
        raise Exception("No tracks found")
    tracks_table = []
    for track in items:
        tracks_table.append(return_track_info(track, album_id))
    return pd.DataFrame(tracks_table)

In [38]:
def return_track_info(track: dict, album_id: str):
    """Returns a dictionary of track information from the provided track item from the Spotify API file"""
    track_table = {
        "track_id": track["id"],
        "song_name": track["name"],
        "external_url": track["external_urls"]["spotify"],
        "duration_ms": track["duration_ms"],
        "explicit": track["explicit"],
        "disc_number": track["disc_number"],
        "type": track["type"],
        "song_uri": track["uri"],
        "album_id": album_id,
    }
    return track_table

### Track Feature

In [39]:
def build_track_feature_table(track_table: pd.DataFrame):
    """Returns a dataframe of all tracks features for the track in the given track_table, which was generated using build_track_table."""
    # Can only call 100 ids at a time
    hundreds_of_tracks = track_table.shape[0] % 100 + 1
    for i in range(0, hundreds_of_tracks):
        track_ids = track_table.loc[(i * 100) : ((i + 1) * 100) - 1, "track_id"]
        new_track_feature_table = return_track_feature_table(track_ids)
        if i == 0:
            track_feature_table = new_track_feature_table
        else:
            track_feature_table = pd.concat(
                [track_feature_table, new_track_feature_table]
            )
    return track_feature_table.reset_index(drop=True)

In [40]:
def return_track_feature_table(track_ids: list):
    """Returns a dataframe of track features from a given list of track_ids"""
    results = spotify.audio_features(track_ids)
    if len(results) == 0:
        raise Exception("No tracks found")
    tracks_table = []
    for track_id, track in zip(track_ids, results):
        tracks_table.append(return_track_feature_info(track, track_id))
    return pd.DataFrame(tracks_table)

In [41]:
def return_track_feature_info(track: dict, track_id: str):
    """Returns a dictionary of track feature information for a given track_id from results of a Spotipy API call"""
    if track is None:
        keys = [
            "track_id",
            "danceability",
            "energy",
            "instrumentalness",
            "liveness",
            "loudness",
            "speechiness",
            "tempo",
            "type",
            "valence",
            "song_uri",
        ]
        track_feature_dict = dict(zip(keys, [np.nan] * 11))
        track_feature_dict["song_uri"] = track_id
    track_feature_dict = {
        "track_id": track["id"],
        "danceability": track["danceability"],
        "energy": track["energy"],
        "instrumentalness": track["instrumentalness"],
        "liveness": track["liveness"],
        "loudness": track["loudness"],
        "speechiness": track["speechiness"],
        "tempo": track["tempo"],
        "type": track["type"],
        "valence": track["valence"],
        "song_uri": track["uri"],
    }
    return track_feature_dict

# Example API Call

In [42]:
artists = [
    "Taylor Swift",
    "Shakira",
    "Phil Collins",
    "Billie Eilish",
    "Camilo",
    "Bad Bunny",
    "Harry Styles",
    "Sublime",
    "Eric Clapton",
    "Lizzo",
    "Adele",
    "Maroon 5",
    "Ed Sheeran",
    "Enrique Iglesias",
    "Coldplay",
    "Lady Gaga",
    "Beyonce",
    "Britney Spears",
    "Queen",
    "Eagles",
]

artist_table = build_artist_table(artists)
album_table = build_album_table(artist_table)
track_table = build_track_table(album_table)
track_feature_table = build_track_feature_table(track_table)

In [43]:
artist_table

Unnamed: 0,artist_id,artist_name,external_url,genre,image_url,followers,popularity,type,artist_uri
0,06HL4z0CvFAxyc27GXpf02,Taylor Swift,https://open.spotify.com/artist/06HL4z0CvFAxyc...,pop,https://i.scdn.co/image/ab6761610000e5ebfcf7c3...,58840286,94,artist,spotify:artist:06HL4z0CvFAxyc27GXpf02
1,0EmeFodog0BfCgMzAIvKQp,Shakira,https://open.spotify.com/artist/0EmeFodog0BfCg...,colombian pop,https://i.scdn.co/image/ab6761610000e5eb284894...,24972709,85,artist,spotify:artist:0EmeFodog0BfCgMzAIvKQp
2,4lxfqrEsLX6N1N4OCSkILp,Phil Collins,https://open.spotify.com/artist/4lxfqrEsLX6N1N...,mellow gold,https://i.scdn.co/image/31fbe64783eb5d49316164...,4735839,75,artist,spotify:artist:4lxfqrEsLX6N1N4OCSkILp
3,6qqNVTkY8uBg9cP3Jd7DAH,Billie Eilish,https://open.spotify.com/artist/6qqNVTkY8uBg9c...,art pop,https://i.scdn.co/image/ab6761610000e5ebd8b998...,68743016,88,artist,spotify:artist:6qqNVTkY8uBg9cP3Jd7DAH
4,28gNT5KBp7IjEOQoevXf9N,Camilo,https://open.spotify.com/artist/28gNT5KBp7IjEO...,colombian pop,https://i.scdn.co/image/ab6761610000e5ebc85ae7...,16394686,81,artist,spotify:artist:28gNT5KBp7IjEOQoevXf9N
5,4q3ewBCX7sLwd24euuV69X,Bad Bunny,https://open.spotify.com/artist/4q3ewBCX7sLwd2...,reggaeton,https://i.scdn.co/image/ab6761610000e5eb8ee9a6...,56038789,100,artist,spotify:artist:4q3ewBCX7sLwd24euuV69X
6,6KImCVD70vtIoJWnq6nGn3,Harry Styles,https://open.spotify.com/artist/6KImCVD70vtIoJ...,pop,https://i.scdn.co/image/ab6761610000e5ebf7db7c...,23394701,91,artist,spotify:artist:6KImCVD70vtIoJWnq6nGn3
7,0EdvGhlC1FkGItLOWQzG4J,Sublime,https://open.spotify.com/artist/0EdvGhlC1FkGIt...,reggae fusion,https://i.scdn.co/image/ab6761610000e5ebe72628...,2329786,70,artist,spotify:artist:0EdvGhlC1FkGItLOWQzG4J
8,6PAt558ZEZl0DmdXlnjMgD,Eric Clapton,https://open.spotify.com/artist/6PAt558ZEZl0Dm...,blues rock,https://i.scdn.co/image/ab6772690000c46ca60e8f...,5011125,71,artist,spotify:artist:6PAt558ZEZl0DmdXlnjMgD
9,56oDRnqbIiwx4mymNEv7dS,Lizzo,https://open.spotify.com/artist/56oDRnqbIiwx4m...,dance pop,https://i.scdn.co/image/ab6761610000e5eb0d66b3...,5011449,80,artist,spotify:artist:56oDRnqbIiwx4mymNEv7dS


In [44]:
artist_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_id     20 non-null     object
 1   artist_name   20 non-null     object
 2   external_url  20 non-null     object
 3   genre         20 non-null     object
 4   image_url     20 non-null     object
 5   followers     20 non-null     int64 
 6   popularity    20 non-null     int64 
 7   type          20 non-null     object
 8   artist_uri    20 non-null     object
dtypes: int64(2), object(7)
memory usage: 1.5+ KB


In [45]:
album_table

Unnamed: 0,album_id,album_name,external_url,image_url,release_date,total_tracks,type,album_uri,artist_id
0,6kZ42qRrzov54LcAk4onW9,Red (Taylor's Version),https://open.spotify.com/album/6kZ42qRrzov54Lc...,https://i.scdn.co/image/ab67616d0000b273318443...,2021-11-12,30,album,spotify:album:6kZ42qRrzov54LcAk4onW9,06HL4z0CvFAxyc27GXpf02
1,6x9s2ObPdpATZgrwxsk9c0,Red (Taylor's Version),https://open.spotify.com/album/6x9s2ObPdpATZgr...,https://i.scdn.co/image/ab67616d0000b273563151...,2021-11-12,30,album,spotify:album:6x9s2ObPdpATZgrwxsk9c0,06HL4z0CvFAxyc27GXpf02
2,4hDok0OAJd57SGIT8xuWJH,Fearless (Taylor's Version),https://open.spotify.com/album/4hDok0OAJd57SGI...,https://i.scdn.co/image/ab67616d0000b273a48964...,2021-04-09,26,album,spotify:album:4hDok0OAJd57SGIT8xuWJH,06HL4z0CvFAxyc27GXpf02
3,6AORtDjduMM3bupSWzbTSG,evermore (deluxe version),https://open.spotify.com/album/6AORtDjduMM3bup...,https://i.scdn.co/image/ab67616d0000b27390fd97...,2021-01-07,17,album,spotify:album:6AORtDjduMM3bupSWzbTSG,06HL4z0CvFAxyc27GXpf02
4,1DT6fDJL6AWPJxe7Lq1dPb,evermore (deluxe version),https://open.spotify.com/album/1DT6fDJL6AWPJxe...,https://i.scdn.co/image/ab67616d0000b273566d7b...,2021-01-07,17,album,spotify:album:1DT6fDJL6AWPJxe7Lq1dPb,06HL4z0CvFAxyc27GXpf02
...,...,...,...,...,...,...,...,...,...
261,5NMAdQPrKw5nutWnGEzfpn,Hotel California (40th Anniversary Expanded Ed...,https://open.spotify.com/album/5NMAdQPrKw5nutW...,https://i.scdn.co/image/ab67616d0000b273d66618...,1976-12-08,19,album,spotify:album:5NMAdQPrKw5nutWnGEzfpn,0ECwFtbIWEVNwjlrfc6xoL
262,0F77QekrNe8vVAjU2sepja,One of These Nights (2013 Remaster),https://open.spotify.com/album/0F77QekrNe8vVAj...,https://i.scdn.co/image/ab67616d0000b2735d0a8e...,1975,9,album,spotify:album:0F77QekrNe8vVAjU2sepja,0ECwFtbIWEVNwjlrfc6xoL
263,2iCHyD9XHtA3vJFJIuXzqu,On the Border (2013 Remaster),https://open.spotify.com/album/2iCHyD9XHtA3vJF...,https://i.scdn.co/image/ab67616d0000b273a7606c...,1974,10,album,spotify:album:2iCHyD9XHtA3vJFJIuXzqu,0ECwFtbIWEVNwjlrfc6xoL
264,09WBxbis5Sixt01FVMs8UM,Desperado (2013 Remaster),https://open.spotify.com/album/09WBxbis5Sixt01...,https://i.scdn.co/image/ab67616d0000b2732d73b1...,1973,11,album,spotify:album:09WBxbis5Sixt01FVMs8UM,0ECwFtbIWEVNwjlrfc6xoL


In [46]:
album_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   album_id      266 non-null    object
 1   album_name    266 non-null    object
 2   external_url  266 non-null    object
 3   image_url     266 non-null    object
 4   release_date  266 non-null    object
 5   total_tracks  266 non-null    int64 
 6   type          266 non-null    object
 7   album_uri     266 non-null    object
 8   artist_id     266 non-null    object
dtypes: int64(1), object(8)
memory usage: 18.8+ KB


In [47]:
track_table

Unnamed: 0,track_id,song_name,external_url,duration_ms,explicit,disc_number,type,song_uri,album_id
0,6lzc0Al0zfZOIFsFvBS1ki,State Of Grace (Taylor's Version),https://open.spotify.com/track/6lzc0Al0zfZOIFs...,295413,False,1,track,spotify:track:6lzc0Al0zfZOIFsFvBS1ki,6kZ42qRrzov54LcAk4onW9
1,4OAuvHryIVv4kMDNSLuPt6,Red (Taylor's Version),https://open.spotify.com/track/4OAuvHryIVv4kMD...,223093,False,1,track,spotify:track:4OAuvHryIVv4kMDNSLuPt6,6kZ42qRrzov54LcAk4onW9
2,3S7HNKPakdwNEBFIVTL6dZ,Treacherous (Taylor's Version),https://open.spotify.com/track/3S7HNKPakdwNEBF...,242720,False,1,track,spotify:track:3S7HNKPakdwNEBFIVTL6dZ,6kZ42qRrzov54LcAk4onW9
3,6AtZLIzUINvExIUy4QhdjP,I Knew You Were Trouble (Taylor's Version),https://open.spotify.com/track/6AtZLIzUINvExIU...,219760,False,1,track,spotify:track:6AtZLIzUINvExIUy4QhdjP,6kZ42qRrzov54LcAk4onW9
4,3nsfB1vus2qaloUdcBZvDu,All Too Well (Taylor's Version),https://open.spotify.com/track/3nsfB1vus2qaloU...,329160,False,1,track,spotify:track:3nsfB1vus2qaloUdcBZvDu,6kZ42qRrzov54LcAk4onW9
...,...,...,...,...,...,...,...,...,...
4329,5ro7xAxDVbtabTl8t3MzHz,Train Leaves Here This Morning - 2013 Remaster,https://open.spotify.com/track/5ro7xAxDVbtabTl...,250456,False,1,track,spotify:track:5ro7xAxDVbtabTl8t3MzHz,51B7LbLWgYLKBVSpkan8Z7
4330,0J8Q2BOEzphO2tTUlfCUln,Take the Devil - 2013 Remaster,https://open.spotify.com/track/0J8Q2BOEzphO2tT...,240827,False,1,track,spotify:track:0J8Q2BOEzphO2tTUlfCUln,51B7LbLWgYLKBVSpkan8Z7
4331,0cuiu7deGyY5kSKZgMEyaJ,Earlybird - 2013 Remaster,https://open.spotify.com/track/0cuiu7deGyY5kSK...,179943,False,1,track,spotify:track:0cuiu7deGyY5kSKZgMEyaJ,51B7LbLWgYLKBVSpkan8Z7
4332,40h65HAR8COEoqkMwUUQHu,Peaceful Easy Feeling - 2013 Remaster,https://open.spotify.com/track/40h65HAR8COEoqk...,257962,False,1,track,spotify:track:40h65HAR8COEoqkMwUUQHu,51B7LbLWgYLKBVSpkan8Z7


In [48]:
track_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4334 entries, 0 to 4333
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   track_id      4334 non-null   object
 1   song_name     4334 non-null   object
 2   external_url  4334 non-null   object
 3   duration_ms   4334 non-null   int64 
 4   explicit      4334 non-null   bool  
 5   disc_number   4334 non-null   int64 
 6   type          4334 non-null   object
 7   song_uri      4334 non-null   object
 8   album_id      4334 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 275.2+ KB


In [49]:
track_feature_table

Unnamed: 0,track_id,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,type,valence,song_uri
0,6lzc0Al0zfZOIFsFvBS1ki,0.594,0.713,0.000000,0.1140,-5.314,0.0503,129.958,audio_features,0.328,spotify:track:6lzc0Al0zfZOIFsFvBS1ki
1,4OAuvHryIVv4kMDNSLuPt6,0.516,0.777,0.000002,0.0761,-4.908,0.0375,125.047,audio_features,0.408,spotify:track:4OAuvHryIVv4kMDNSLuPt6
2,3S7HNKPakdwNEBFIVTL6dZ,0.645,0.593,0.000127,0.1300,-6.506,0.0288,109.984,audio_features,0.299,spotify:track:3S7HNKPakdwNEBFIVTL6dZ
3,6AtZLIzUINvExIUy4QhdjP,0.584,0.557,0.000000,0.0576,-6.371,0.0342,154.008,audio_features,0.767,spotify:track:6AtZLIzUINvExIUy4QhdjP
4,3nsfB1vus2qaloUdcBZvDu,0.440,0.528,0.002030,0.2340,-7.809,0.0317,185.972,audio_features,0.132,spotify:track:3nsfB1vus2qaloUdcBZvDu
...,...,...,...,...,...,...,...,...,...,...,...
3495,41N6nyybHwTKKES7engoiR,0.143,0.315,0.008880,0.9160,-12.587,0.0368,75.195,audio_features,0.255,spotify:track:41N6nyybHwTKKES7engoiR
3496,4OuVVcw3PV6tg0f7N6bARL,0.358,0.641,0.000046,0.7630,-8.823,0.0752,124.061,audio_features,0.171,spotify:track:4OuVVcw3PV6tg0f7N6bARL
3497,0656Ba4hb119dMJkx7zJ3N,0.628,0.894,0.000035,0.9220,-5.448,0.1370,97.083,audio_features,0.198,spotify:track:0656Ba4hb119dMJkx7zJ3N
3498,2laS6W8SYTUbBJsq6HbYNE,0.261,0.913,0.000241,0.8630,-5.795,0.4330,143.024,audio_features,0.217,spotify:track:2laS6W8SYTUbBJsq6HbYNE


In [50]:
track_feature_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          3500 non-null   object 
 1   danceability      3500 non-null   float64
 2   energy            3500 non-null   float64
 3   instrumentalness  3500 non-null   float64
 4   liveness          3500 non-null   float64
 5   loudness          3500 non-null   float64
 6   speechiness       3500 non-null   float64
 7   tempo             3500 non-null   float64
 8   type              3500 non-null   object 
 9   valence           3500 non-null   float64
 10  song_uri          3500 non-null   object 
dtypes: float64(8), object(3)
memory usage: 300.9+ KB


## Load to Database

In [51]:
def store_tables_in_db(table_names: [str], tables: [pd.DataFrame], db: str):
    """Saves the tables as the provided table_names into the given db"""
    conn = sqlite3.connect(f"../{db}.db")
    c = conn.cursor()
    for table_name, table in zip(table_names, tables):
        insert_table(table_name, table, c, conn)

In [52]:
def insert_table(
    table_name: str, table: pd.DataFrame, c: sqlite3.Cursor, conn: sqlite3.Connection
):
    columns = ", ".join(table.columns)
    c.execute(f"""CREATE TABLE IF NOT EXISTS {table_name} ({columns})""")
    table.to_sql(table_name, conn, if_exists="replace", index=False)

In [53]:
table_names = ["artist", "album", "track", "track_feature"]
tables = [artist_table, album_table, track_table, track_feature_table]

store_tables_in_db(table_names, tables, "spotify")