# SmartList Pipeline Notebook
This notebook walks through the full data pipeline for the SmartList playlist generator, from data loading to playlist creation.

In [1]:
import os
import sys
# Add project root to sys.path.
# The root is taken to be the parent of the current working directory, so the
# `scripts` package imported below can be resolved from it.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Remove earlier copies first so re-running this cell never stacks duplicate
# entries, then put the root at the front so it keeps import priority.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)

# Imports
import pandas as pd
import random
import hashlib
from scripts.data_extraction import load_data
from scripts.clustering import (
    k_means_clustering,
    add_popularity_scores,
    generate_cluster_playlists,
    summarize_clusters,
    calculate_silhouette_score,
    get_top_artists_per_cluster,
    get_top_songs_per_cluster
)


## 1. Load Data

In [2]:
# Set data path and load.
# The location is derived from the project root rather than a machine-specific
# absolute path; set the SMARTLIST_DATA_DIR environment variable to override it.
# NOTE(review): assumes the raw files live in <project_root>/data/Raw Data — confirm.
data_path = os.environ.get(
    'SMARTLIST_DATA_DIR',
    os.path.join(project_root, 'data', 'Raw Data'),
)
history_data, song_data = load_data(data_path)

# Inspect
print(f"History data shape: {history_data.shape}")
print(f"Song data shape: {song_data.shape}")

History data shape: (85547, 8)
Song data shape: (8876, 8)


In [3]:
# Preview the last five rows of the listening history
history_data.tail(n=5)

Unnamed: 0,dow_sin,dow_cos,time_sin,time_cos,track,artist,album,uri
85542,-0.781831,0.62349,-0.446198,0.894934,tolerate it,Taylor Swift,evermore,spotify:track:0PurA4JVJ8YQgSVopY8fn6
85543,-0.781831,0.62349,-0.446198,0.894934,No Friends,CADMIUM,No Friends,spotify:track:4U7G8dgUodMEVSv96QRcDb
85544,-0.781831,0.62349,-0.442289,0.896873,Are You Satisfied?,MARINA,The Family Jewels,spotify:track:6lKRMylSZMtA7EqPl0pcdI
85545,-0.781831,0.62349,-0.442289,0.896873,This Feeling Will Pass,Take Care,Reject,spotify:track:6pLGnO6JyJdtp4WlX5ixkI
85546,-0.781831,0.62349,-0.442289,0.896873,Tired,beabadoobee,Patched Up,spotify:track:6F5mZpEEjhsAW8UEqbIpz1


In [4]:
# Preview the first five rows of the per-song table
song_data.head(n=5)

Unnamed: 0,count,track,artist,album,min_listened,shuffle,skip,uri
0,640,Cool About It,boygenius,the record,1398.7229,293,169,spotify:track:5PJH1U5Iie893v48Fl9yaC
1,470,The Cue,Sarah and the Sundays,Like A Damn Dog,1543.4857,328,125,spotify:track:3O3cErOLH8t8hiqlXADzJN
2,325,Words I Used,The Backseat Lovers,Waiting to Spill,1217.751083,230,107,spotify:track:34nYWTBYOKfTFYKIhVrqdT
3,317,"Good Luck, Babe!",Chappell Roan,"Good Luck, Babe!",890.411933,200,78,spotify:track:0WbMK4wrZ1wFSty9F7FCgu
4,311,Growing Sideways,Noah Kahan,Stick Season,1069.759783,189,53,spotify:track:1JcIXOir94YUYBt2cXTzn2


## 2. Clustering and Silhouette Score

In [5]:
# Number of clusters requested from K-Means
N_CLUSTERS = 50

# Cluster the listening history
# NOTE(review): no seed is passed here; whether k_means_clustering fixes one
# internally cannot be seen from this notebook — confirm for reproducibility.
clustered_df = k_means_clustering(history_data, n_clusters=N_CLUSTERS)

# Report the silhouette score of the resulting clustering
sil_score = calculate_silhouette_score(clustered_df)
print(f"Silhouette Score: {sil_score:.3f}")

# Preview the last five clustered rows
clustered_df.tail(n=5)


Silhouette Score: 0.532


Unnamed: 0,dow_sin,dow_cos,time_sin,time_cos,track,artist,album,uri,cluster
85542,-0.781831,0.62349,-0.446198,0.894934,tolerate it,Taylor Swift,evermore,spotify:track:0PurA4JVJ8YQgSVopY8fn6,8
85543,-0.781831,0.62349,-0.446198,0.894934,No Friends,CADMIUM,No Friends,spotify:track:4U7G8dgUodMEVSv96QRcDb,8
85544,-0.781831,0.62349,-0.442289,0.896873,Are You Satisfied?,MARINA,The Family Jewels,spotify:track:6lKRMylSZMtA7EqPl0pcdI,8
85545,-0.781831,0.62349,-0.442289,0.896873,This Feeling Will Pass,Take Care,Reject,spotify:track:6pLGnO6JyJdtp4WlX5ixkI,8
85546,-0.781831,0.62349,-0.442289,0.896873,Tired,beabadoobee,Patched Up,spotify:track:6F5mZpEEjhsAW8UEqbIpz1,8


## 3. Summarize Clusters

In [6]:
# Summarize the clustered history into one table and preview its first rows
summaries = summarize_clusters(clustered_df)
print("Cluster summaries:")
summaries.head(n=5)


Cluster summaries:


Unnamed: 0,cluster,day,time_range,count,name
0,0,Friday,12:18–15:29,2052,Friday lazy afternoon
1,1,Tuesday,17:27–20:10,1911,Tuesday moody evening
2,2,Friday,00:00–23:58,1553,Friday familiar all day
3,3,Monday,06:42–11:21,1410,Monday mellow morning
4,4,Sunday,16:45–19:19,2192,Sunday hazy evening


## 4. Process Time Labels & Start Times

In [7]:
# Define helper functions
def process_time_label(time_range):
    """Map a cluster's time range to a coarse time-of-day label.

    Only the start of the range is considered.

    Args:
        time_range: String such as ``'12:18–15:29'``: an ``HH:MM`` start and
            end separated by an en dash. An em dash or ASCII hyphen is also
            accepted, and whitespace around the times is tolerated.

    Returns:
        ``'morning'`` for starts in [05:00, 12:00), ``'afternoon'`` for
        [12:00, 17:00), ``'evening'`` for [17:00, 22:00), and
        ``'late_night'`` for everything else.

    Raises:
        ValueError: If the start time is not of the form ``HH:MM``.

    NOTE(review): these boundaries can disagree with the generated cluster
    names (e.g. a "... evening" cluster starting at 16:45 is labelled
    'afternoon') — confirm which convention is intended.
    """
    # Keep only the text before the first separator, whichever dash is used.
    start = time_range
    for separator in ('–', '—', '-'):
        start = start.split(separator)[0]

    h, m = start.split(':')
    start_minutes = int(h) * 60 + int(m)

    # Boundaries are expressed in minutes after midnight.
    if start_minutes < 300:       # before 05:00
        return 'late_night'
    if start_minutes < 720:       # 05:00–11:59
        return 'morning'
    if start_minutes < 1020:      # 12:00–16:59
        return 'afternoon'
    if start_minutes < 1320:      # 17:00–21:59
        return 'evening'
    return 'late_night'           # 22:00 onwards

def _start_minutes_of(time_range):
    """Return the start of an 'HH:MM–HH:MM' range as minutes after midnight."""
    start_parts = time_range.split('–')[0].split(':')
    return int(start_parts[0]) * 60 + int(start_parts[1])

# Tag every cluster with its time-of-day label, then with its numeric start time
summaries['time_label'] = summaries['time_range'].apply(process_time_label)
summaries['start_minutes'] = summaries['time_range'].apply(_start_minutes_of)

# No gradient/color columns are added in this cell; preview the labelled table
print("Summaries without gradients:")
summaries.head()


Summaries without gradients:


Unnamed: 0,cluster,day,time_range,count,name,time_label,start_minutes
0,0,Friday,12:18–15:29,2052,Friday lazy afternoon,afternoon,738
1,1,Tuesday,17:27–20:10,1911,Tuesday moody evening,evening,1047
2,2,Friday,00:00–23:58,1553,Friday familiar all day,late_night,0
3,3,Monday,06:42–11:21,1410,Monday mellow morning,morning,402
4,4,Sunday,16:45–19:19,2192,Sunday hazy evening,afternoon,1005


## 5. Group Clusters by Day

In [8]:
# Weekday order used for the per-day grouping
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# Convert the summaries to records a single time; every weekday bucket below
# is filtered from this one list instead of re-converting the DataFrame.
summary_records = summaries.to_dict(orient='records')

# For each weekday, that day's cluster records ordered by start time.
# sorted() is stable, so clusters sharing a start time keep their table order.
clusters_by_day = {
    day: sorted(
        (r for r in summary_records if r['day'] == day),
        key=lambda r: r['start_minutes'],
    )
    for day in days
}

# Display as a table for Monday
mon_df = pd.DataFrame(clusters_by_day['Monday'])
print("Clusters for Monday:")
display(mon_df.head())


Clusters for Monday:


Unnamed: 0,cluster,day,time_range,count,name,time_label,start_minutes
0,21,Monday,00:00–23:58,1794,Monday familiar all day,late_night,0
1,35,Monday,01:42–06:41,488,Monday electric late night,late_night,102
2,3,Monday,06:42–11:21,1410,Monday mellow morning,morning,402
3,34,Monday,11:21–14:00,2188,Monday wandering afternoon,morning,681
4,20,Monday,14:02–16:39,2025,Monday bright afternoon,afternoon,842


## 6. Top Artists & Songs

In [9]:
# Score the songs, then collect the per-cluster top artists and top songs
song_data_scored = add_popularity_scores(song_data)
top_artists = get_top_artists_per_cluster(clustered_df)
top_songs = get_top_songs_per_cluster(clustered_df, top_n=50)

# Show the first rows of both top lists for cluster 0, each under its heading
for heading, per_cluster in (
    ("Top artists for cluster 0:", top_artists),
    ("Top songs for cluster 0:", top_songs),
):
    print(heading)
    display(per_cluster[0].head())


Top artists for cluster 0:


Unnamed: 0,artist,listen_count
0,Gracie Abrams,116
1,Taylor Swift,113
2,The Strokes,80
3,Sabrina Carpenter,79
4,Noah Kahan,72


Top songs for cluster 0:


Unnamed: 0,track,artist,uri,album,listen_count
51,Amoeba,Clairo,spotify:track:0HAqq2GcQKyi3s87GuN7jU,Sling,53
396,I Told You Things,Gracie Abrams,spotify:track:5V6mK1pEu22104f22m8KkX,The Secret of Us,42
137,Busy Woman,Sabrina Carpenter,spotify:track:0b0Dz0Gi86SVdBxYeiQcCP,Short n' Sweet,19
329,Guess featuring Billie Eilish,Charli xcx,spotify:track:3WOhcATHxK2SLNeP5W3v1v,Guess featuring Billie Eilish,17
312,Girl You Know It's True,Milli Vanilli,spotify:track:6CTnIURom7GvzLqXuFHrXA,"Girl, You Know It's True",13


## 7. Generate Playlists

In [11]:
# Build the playlists from the clustered history, scored songs and top songs
playlists = generate_cluster_playlists(
    clustered_df,
    song_data_scored,
    top_songs,
    top_n=30,
)

# Preview the playlist for cluster 0
# NOTE(review): in the saved output the popularity_score column is blank for
# every row shown — confirm the scores are actually carried into the playlists.
print("Playlist for cluster 0:")
playlists[0].head(n=5)


Playlist for cluster 0:


Unnamed: 0,track,artist,album,uri,popularity_score
0,Sweet Dream,Sarah and the Sundays,So You're Mad About the Cups,spotify:track:6DdJ2TjyiVcgjpYjrjyZBJ,
1,She Calls Me Back,Noah Kahan,Stick Season,spotify:track:1LvU6IFqQnXOIwJyBDb2io,
2,Casual,Chappell Roan,The Rise and Fall of a Midwest Princess,spotify:track:3WSOUb3U7tqURbBSgZTrZX,
3,Somebody Come Through,Wasia Project,Isotope,spotify:track:4Sz61x2L4mYmtw47NBrzfC,
4,Beanie,Chezile,Beanie,spotify:track:5e0b9LgOfi3aJSKXFcOWRe,
