# Clean dataset


#### Load data


In [22]:
import pandas as pd

# Read the raw playlist export into the working DataFrame
df = pd.read_csv(filepath_or_buffer="./data/playlist_stats.csv")

In [23]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from spotify.utils import load_credentials

# Read the Spotify API credentials through the project helper.
# NOTE(review): presumably returns a mapping containing the two SPOTIPY_* keys
# used below — confirm in spotify.utils.
credentials = load_credentials("spotify")
# Build the client-credentials manager from the loaded id/secret pair.
client_credentials_manager = SpotifyClientCredentials(
    client_id=credentials["SPOTIPY_CLIENT_ID"],
    client_secret=credentials["SPOTIPY_CLIENT_SECRET"],
)
# Spotify client that is handed to fetch_user_names in a later cell.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [24]:
from spotify.utils import fetch_user_names

# Run the playlist frame through the project helper, using the Spotify client.
# NOTE(review): judging by the helper's name, this presumably fills in user
# names for the `added_by` column — confirm in spotify.utils.
df = fetch_user_names(sp, df)

In [25]:
from notebook_functions import show

# Preview the frame with the project's display helper.
show(df)

Unnamed: 0,name,artist,album,added_by,added_at
0,Rumble,"Skrillex, Fred again.., Flowdan",Rumble,Thomas Brouwer,2023-01-09T10:23:07Z
1,Strung Out Johnny,Iggy Pop,EVERY LOSER,hvdpal58,2023-01-09T20:40:04Z
2,Kill Bill,SZA,SOS,hvdpal58,2023-01-09T20:50:12Z
3,Sail On By,Blanco White,Sail On By,svdpal,2023-01-13T12:15:11Z
4,Be On Your Way,Daughter,Be On Your Way,svdpal,2023-01-13T12:19:16Z
...,...,...,...,...,...
252,Kapot,Véras,Het Niets,hvdpal58,2023-12-27T13:03:20Z
253,Child Of Sin,"Kovacs, Till Lindemann",Child Of Sin,hvdpal58,2023-12-27T13:05:36Z
254,Blinds,Robin Kester,Honeycomb Shades,hvdpal58,2023-12-27T13:17:15Z
255,Always You,Depeche Mode,Memento Mori,hvdpal58,2023-12-27T13:24:47Z


#### Rename `added_by` values


In [26]:
# Map the raw `added_by` values to short first names (values, not column names)
df["added_by"] = df["added_by"].replace(
    {
        "svdpal": "Sandra",
        "hvdpal58": "Hans",
        "Joline Charlotte": "Joline",
        "Thomas Brouwer": "Thomas",
    }
)

#### Drop duplicates

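Two rows count as duplicates when they share the same `name` and `artist`; the `album` may differ (for example a single and the album it later appeared on).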


In [27]:
# Occurrences per (name, artist) pair, most frequent first,
# restricted to pairs that appear more than once
df_duplicates = (
    df.groupby(["name", "artist"])
    .size()
    .reset_index(name="counts")
    .sort_values(by="counts", ascending=False)
    .loc[lambda pair_counts: pair_counts["counts"] > 1]
)

# Pull every row of the original frame that belongs to a repeated pair
# (keep=False flags all occurrences, not only the later ones) and sort so
# that the repeats sit next to each other
duplicates = df.loc[
    df.duplicated(subset=["name", "artist"], keep=False)
].sort_values(by=["name", "artist"])
show(duplicates, 10)

Unnamed: 0,name,artist,album,added_by,added_at
221,Allang Niet Meer Van Jou,"Mula B, Goldband",Narcopop,Joline,2023-12-12T07:38:27Z
231,Allang Niet Meer Van Jou,"Mula B, Goldband",Narcopop,Thomas,2023-12-19T16:19:41Z
4,Be On Your Way,Daughter,Be On Your Way,Sandra,2023-01-13T12:19:16Z
34,Be On Your Way,Daughter,Swim Back,Sandra,2023-03-17T11:33:48Z
137,Escapism.,"RAYE, 070 Shake",My 21st Century Blues,Hans,2023-08-28T09:57:11Z
236,Escapism.,"RAYE, 070 Shake",Escapism.,Joline,2023-12-21T10:32:34Z
219,Fly To You (feat. Grimes and Dido),"Caroline Polachek, Grimes, Dido","Desire, I Want To Turn Into You",Hans,2023-12-11T17:12:31Z
248,Fly To You (feat. Grimes and Dido),"Caroline Polachek, Grimes, Dido","Desire, I Want To Turn Into You",Thomas,2023-12-23T21:14:29Z
30,Not Worth It,Maria Mena,Not Worth It,Sandra,2023-03-12T05:29:47Z
146,Not Worth It,Maria Mena,And Then Came You,Joline,2023-09-15T06:24:00Z


In [28]:
# Keep only the first occurrence of each (name, artist) pair;
# keep="first" is the pandas default, so it is not spelled out
df = df.drop_duplicates(
    subset=["name", "artist"],
)

#### Drop old songs


In [29]:
# Split off songs released before 2023 and keep only 2023-or-later releases in `df`.
#
# Only the year part of `release_date` is compared. Spotify reports release
# dates with day, month or year precision ("2023-01-09", "2023-01", "2023"),
# and comparing the raw strings against "2023-01-01" would wrongly sort
# "2023" and "2023-01" before the cutoff. For full dates the result is
# identical to the full-string comparison.
# Assumes `release_date` holds ISO-formatted strings, as read from the CSV.
release_year = df["release_date"].str[:4]

# Two explicit comparisons (rather than a mask and its negation) so that rows
# without a release date end up in neither frame, exactly as before.
df_old = df[release_year < "2023"]
df = df[release_year >= "2023"]

# Show the removed songs, earliest release first
df_old[["release_date", "name", "artist", "added_by"]].sort_values(
    by="release_date", ascending=True
).head(20)

Unnamed: 0,release_date,name,artist,added_by
109,2022-08-12,Burn Dem Bridges,Skin On Skin,Thomas
116,2022-08-19,STRUT,EMELINE,Sandra
104,2022-09-06,Obsessed (Demo),LAUREL,Sandra
187,2022-09-06,Berwyn (all that i got is you),"Fred again.., BERWYN, Dermot Kennedy, Guante",Joline
108,2022-11-04,Relax My Eyes,"ANOTR, Abel Balder",Thomas
69,2022-11-11,Open Window,Warhaus,Thomas
155,2022-12-07,Escapism. - Sped Up,"RAYE, 070 Shake",Thomas
2,2022-12-09,Kill Bill,SZA,Hans
16,2022-12-23,The Butcha,"Corinda, Wes Lee",Thomas


#### Export to csv


In [30]:
# Write the cleaned playlist to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="./data/playlist_stats_clean.csv", index=False)