# Spotify Recommender Part 1 - Data Preprocessing

To do:
- check API reference for additional data that can be pulled in for artist/genres
- join data via sql
- clean data

In [1]:
import os
import pandas as pd
import numpy as np
import sqlite3

from secrets import SPOTIFY_HISTORY_DB

First we set up the connection to the SQL DB, which contains 3 tables:
- **streaming_history**: each song listened to starting 02-01-2021
- **genres**: each of the artists from streaming_history, tagged with their corresponding genre
- **song_features**: each track from streaming_history, with details at the song level 
    - details include corresponding track values (liveness, tempo, valence etc.)

In [2]:
# Open a single connection to the listening-history database; the merge query
# further down runs on this same connection, so it is left open here.
conn = sqlite3.connect(SPOTIFY_HISTORY_DB)

# Load each of the three source tables in full so their columns can be
# inspected individually before they are joined.
df_sh = pd.read_sql_query(sql="SELECT * FROM streaming_history", con=conn)
df_genres = pd.read_sql_query(sql="SELECT * FROM genres", con=conn)
df_features = pd.read_sql_query(sql="SELECT * FROM song_features", con=conn)

In [10]:
# Show the last row of streaming_history as loaded (table order, not sorted by played_at).
df_sh.tail(1)

Unnamed: 0,spotify_id,uri,song_name,artist_name,played_at,duration_ms,date,ETL_DTTM,artist_uri
4320,2C1mTvW7io67sQFHE8C8Vv,spotify:track:2C1mTvW7io67sQFHE8C8Vv,Seconda mano,Nerone,2021-09-26 12:20:47.709000,147555,2021-09-26,"2021-09-27, 00:05:08",7kG6A2lZMXeaD5YkubF5Kn


In [8]:
# Show the first row of genres to see its columns; `genres` holds a list-like string per artist.
df_genres.head(1)

Unnamed: 0,spotify_url,total_followers,genres,artist_id,artist_name,popularity,uri,ETL_DTTM
0,https://open.spotify.com/artist/3SFVIUlipGj3Rp...,48610,"['bass trap', 'traprun']",3SFVIUlipGj3RpWCKe9s73,Lox Chatterbox,58,spotify:artist:3SFVIUlipGj3RpWCKe9s73,"2021-02-14, 09:25:59"


In [9]:
# Show the first row of song_features to see the per-track audio feature columns.
df_features.head(1)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,ETL_DTTM
0,0.754,0.763,1,-6.57,1,0.295,0.32,0.0,0.143,0.704,159.963,audio_features,5ZcmLAogrj4HF8Odrz9XA2,spotify:track:5ZcmLAogrj4HF8Odrz9XA2,https://api.spotify.com/v1/tracks/5ZcmLAogrj4H...,https://api.spotify.com/v1/audio-analysis/5Zcm...,265693,3,"2021-02-04, 23:13:23"


The final query merges the data from the 3 tables into one master table and removes any potential duplicate records. If the same song was played over multiple days, there would otherwise be many duplicates.

In [4]:
# Build one row per play in streaming_history, enriched with artist genres and
# per-track audio features.
#
# Both lookup tables are reduced to a single row per key before joining, so a
# play can never be multiplied by repeated rows in a lookup table:
#   * genres        -> first row per artist_id, ordered by ETL_DTTM
#   * song_features -> first row per track id, ordered by ETL_DTTM
# (Previously song_features was only de-duplicated with SELECT DISTINCT over the
# feature values, so two loads of the same track with slightly different values
# would have duplicated every matching play.)
#
# NOTE: `SELECT *` over the joins yields two `artist_name` columns, two
# `duration_ms` columns, and the helper column `dup`. The column set is kept
# as-is for compatibility with later cells; select with care (e.g. by position
# or after de-duplicating column labels).
merged_query = """
SELECT DISTINCT *
FROM streaming_history sh 
LEFT JOIN (SELECT DISTINCT * 
    FROM (SELECT DISTINCT artist_id 
                ,artist_name 
                ,genres
                ,row_number() over (partition by artist_id order by artist_id, ETL_DTTM) as dup
            FROM genres) 
    WHERE dup = 1) g
ON sh.artist_uri = g.artist_id
LEFT JOIN (SELECT id -- spotify track id
	,danceability 
	,energy 
	,"key" 
	,loudness 
	,mode 
	,speechiness 
	,acousticness 
	,instrumentalness 
	,liveness 
	,valence 
	,tempo 
	,duration_ms 
	,time_signature 
    FROM (SELECT *
                ,row_number() over (partition by id order by ETL_DTTM) as feature_rank
            FROM song_features)
    WHERE feature_rank = 1) sf
ON sh.spotify_id = sf.id
order by played_at
"""

# Run the merge on the shared connection; rows come back ordered by played_at.
df_spotify = pd.read_sql_query(merged_query, conn)

In [12]:
# Show the last rows of the merged frame; the query orders by played_at, so these are the latest plays.
df_spotify.tail()

Unnamed: 0,spotify_id,uri,song_name,artist_name,played_at,duration_ms,date,ETL_DTTM,artist_uri,artist_id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms.1,time_signature
4316,4mcwRpFuSWaFbdcVMmAs2D,spotify:track:4mcwRpFuSWaFbdcVMmAs2D,No Don (feat. Chip & Not3s) - Remix,Lotto Boyzz,2021-09-26 19:32:21.074000,211768,2021-09-26,"2021-09-27, 00:05:08",6TlhWcs3imNDmxFviZjpDX,6TlhWcs3imNDmxFviZjpDX,...,-5.317,0.0,0.243,0.261,0.0,0.134,0.657,106.042,211768.0,4.0
4317,5BhRfzO3MokDQVJUKurRTV,spotify:track:5BhRfzO3MokDQVJUKurRTV,YRF (feat. Fredo & Not3s),GRM Daily,2021-09-26 19:50:20.239000,244669,2021-09-26,"2021-09-27, 00:05:08",4PCeJ2EIn3YtzYYRsHHWyy,4PCeJ2EIn3YtzYYRsHHWyy,...,-5.834,0.0,0.242,0.211,7.8e-05,0.11,0.35,103.955,244669.0,4.0
4318,5f14ylFBAuwqVHXvafqtmd,spotify:track:5f14ylFBAuwqVHXvafqtmd,079ME,B Young,2021-09-26 19:53:55.749000,211852,2021-09-26,"2021-09-27, 00:05:08",7JMwO9tyFPMsb2KnsJqZlp,7JMwO9tyFPMsb2KnsJqZlp,...,-7.508,0.0,0.309,0.404,0.0,0.0582,0.612,199.921,211853.0,4.0
4319,5hVglkplQomidvP415oUxt,spotify:track:5hVglkplQomidvP415oUxt,False 9,AJ Tracey,2021-09-26 19:57:15.943000,199654,2021-09-26,"2021-09-27, 00:05:08",4Xi6LSfFqv26XgP9NKN26U,4Xi6LSfFqv26XgP9NKN26U,...,-6.045,1.0,0.344,0.114,0.0,0.119,0.741,140.865,199654.0,4.0
4320,7AnniUuXOgiVCRzdLEnm5c,spotify:track:7AnniUuXOgiVCRzdLEnm5c,"London's Calling (feat. Skrapz, Avelino, Asco,...",GRM Daily,2021-09-26 20:01:22.770000,246450,2021-09-26,"2021-09-27, 00:05:08",4PCeJ2EIn3YtzYYRsHHWyy,4PCeJ2EIn3YtzYYRsHHWyy,...,-3.635,1.0,0.326,0.184,0.0,0.629,0.692,99.56,246450.0,4.0


In [13]:
# Inspect column types of the merged frame. Note the repeated labels (artist_name,
# duration_ms) and the helper column `dup` carried over from the joins.
df_spotify.dtypes

spotify_id           object
uri                  object
song_name            object
artist_name          object
played_at            object
duration_ms           int64
date                 object
ETL_DTTM             object
artist_uri           object
artist_id            object
artist_name          object
genres               object
dup                 float64
id                   object
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms         float64
time_signature      float64
dtype: object