# IMPORTS, CLASSES, FUNCTIONS, AUTHENTICATION, SETUP

In [6]:
# Read variables from a .env file into the process environment so the
# Spotify credentials below can be fetched with os.getenv.
from dotenv import load_dotenv
import os
load_dotenv()
# Both values are None when the corresponding variable is not set.
spotify_client_id = os.getenv("SPOTIFY_CLIENT_ID")
spotify_client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# Module-level API client; the functions below reference it as the global
# name ``spotify``.
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(spotify_client_id, spotify_client_secret))
from matplotlib import pyplot as plt
from scipy import stats as st
import pandas as pd
import numpy as np
import re
import ast
from pathlib import Path
def tracks(track):
    """Flatten one Spotify track object into a single-level feature dict.

    The result combines the track's audio features, a few top-level track
    fields, the album release date, the artist names and the number of
    markets the album is available in. Two API calls are made through the
    module-level ``spotify`` client (audio features and album lookup).

    Args:
        track: Track dict; the keys read are ``id``, ``album`` (``id`` and
            ``release_date``), ``artists`` (each with ``name``),
            ``popularity``, ``explicit`` and ``name``.

    Returns:
        dict: feature name -> value. ``artists`` is a list of names and
        ``available_markets`` is a count.
    """
    audio_keys = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms',
    )
    audio = spotify.audio_features(track['id'])[0]
    row = {name: audio[name] for name in audio_keys}

    release_date = track['album']['release_date']
    artist_names = [performer['name'] for performer in track['artists']]
    markets = spotify.album(track['album']['id'])['available_markets']

    # Insertion order determines the column order of the resulting table.
    for field in ('popularity', 'explicit', 'id', 'name'):
        row[field] = track[field]
    row['release_date'] = release_date
    row['artists'] = artist_names
    row['available_markets'] = len(markets)
    return row
def to_dataframe(tracks_artist):
    """Build a DataFrame with one row per entry of ``tracks_artist['tracks']``.

    Each track is flattened with ``tracks`` before being collected.
    """
    rows = [tracks(item) for item in tracks_artist['tracks']]
    return pd.DataFrame(rows)
def to_series(tracks_data):
    """Wrap ``tracks_data`` in a pandas ``DataFrame``.

    Despite the function's name, the returned object is a ``DataFrame``,
    not a ``Series``.
    """
    frame = pd.DataFrame(data=tracks_data)
    return frame
def normalize_ordered_data(df, feature):
    """Replace a numeric column with an ordinal bin label from 1 to 6.

    The column is z-scored, the z-score range is cut into six equal-width
    bins and every row receives the number of the bin it falls in. The
    binned column is moved to the end of the frame.

    Args:
        df: DataFrame containing ``feature``. It is not modified.
        feature: Name of the numeric column to bin.

    Returns:
        A new DataFrame with ``feature`` replaced by integer labels 1..6.
    """
    n_bins = 6
    column = df[feature]
    z_scores = (column - column.mean()) / column.std()
    _, edges = np.histogram(z_scores, bins=n_bins)

    # With right=True the smallest value sits exactly on the first edge and
    # is labelled 0; clamp so every label lies in 1..n_bins.
    labels = np.clip(np.digitize(z_scores, edges, right=True), 1, n_bins)

    # Assign positionally so rows stay matched whatever index ``df`` has.
    result = df.drop(columns=feature)
    result[feature] = labels
    return result
def normalize_numeric_data(df):
    """Rescale ``available_markets`` and ``popularity`` in place.

    ``available_markets`` is divided by 78, or by the column maximum when
    that maximum is above 78. ``popularity`` is divided by 100.

    Returns:
        The same DataFrame object that was passed in.
    """
    markets = df["available_markets"]
    peak = markets.max()
    divisor = 78 if peak <= 78 else peak
    df["available_markets"] = markets / divisor
    df["popularity"] = df["popularity"] / 100
    return df
def dummy_boolean_value(df, feature):
    """Multiply every value of ``df[feature]`` by 1, in place.

    For boolean values this yields 1 for True and 0 for False.

    Returns:
        The same DataFrame object that was passed in.
    """
    def times_one(value):
        return value * (1)

    converted = df[feature].apply(times_one)
    df[feature] = converted
    return df
def duration_ms_to_min(df):
    """Replace ``duration_ms`` with a whole-minute ``duration_min`` column.

    The millisecond values are divided by 60000, rounded and cast to int.
    ``duration_ms`` is dropped from the passed frame in place, and the new
    column is appended at the end of the returned frame.

    Returns:
        A new DataFrame holding the remaining columns plus ``duration_min``.
    """
    minutes = df["duration_ms"] / 60000
    minutes = round(minutes).astype(int).rename("duration_min")
    df.drop(columns="duration_ms", inplace=True)
    return pd.concat([df, minutes], axis=1)
def preprocess(track_artist):
    """Turn a top-tracks response into a model-ready DataFrame.

    Steps, in order: flatten the tracks into a table, bin ``tempo`` and
    ``loudness``, rescale the numeric columns, convert ``explicit`` to
    0/1 and express the duration in minutes.
    """
    frame = to_dataframe(track_artist)
    for ordered_feature in ("tempo", "loudness"):
        frame = normalize_ordered_data(frame, ordered_feature)
    frame = normalize_numeric_data(frame)
    frame = dummy_boolean_value(frame, "explicit")
    return duration_ms_to_min(frame)
def save_file(df, file_name):
    """Merge ``df`` into the CSV at ``file_name`` and return the stored table.

    If the file is missing or empty, ``df`` is written as-is. Otherwise the
    existing rows are read, ``df`` is appended and, unless ``file_name`` is
    exactly ``"artists_df.csv"``, rows sharing an ``id`` are collapsed to
    the most recent one before the file is rewritten.

    Args:
        df: Rows to add.
        file_name: Path of the CSV file.

    Returns:
        The DataFrame that was written to disk.
    """
    has_rows = os.path.exists(file_name) and os.path.getsize(file_name) != 0
    if not has_rows:
        # to_csv creates the file itself; no separate touch() is needed.
        df.to_csv(file_name, mode="w", index=False)
        return df

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    merged = pd.concat([pd.read_csv(file_name), df], ignore_index=True)
    if file_name != "artists_df.csv":
        merged.drop_duplicates(subset="id", keep="last", inplace=True)
    merged.to_csv(file_name, mode="w", index=False)
    return merged
def artist_save(artist_list):
    """Fetch, preprocess and store the top tracks of every listed artist.

    Each artist's tracks are merged into ``tracks_df.csv`` via ``save_file``.

    Args:
        artist_list: Iterable of Spotify artist ids.

    Returns:
        The table returned by the last ``save_file`` call (which reflects
        the whole file after all merges), or ``None`` when ``artist_list``
        is empty.
    """
    saved = None
    for artist_id in artist_list:
        top_tracks = spotify.artist_top_tracks(artist_id)
        # Keep looping: returning here would skip every artist but the first.
        saved = save_file(preprocess(top_tracks), "tracks_df.csv")
    return saved
def _parse_artist_names(cell):
    """Return the artist names stored in one ``artists`` cell.

    Non-string cells are returned unchanged. Strings are first read as a
    Python list literal (the form a list column takes after a CSV round
    trip); anything else falls back to splitting on commas.
    """
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(name) for name in parsed]
    # Fallback for strings that are not list literals.
    parts = cell.split(",")
    parts[0] = parts[0].replace("[", "")
    parts[-1] = parts[-1].replace("]", "")
    return [part.replace("\'", "").strip() for part in parts]


def df_artists_timeline(df):
    """Derive the per-artist and timeline tables and write both to CSV.

    The tracks are indexed and sorted by ``release_date``. Every track is
    repeated once per credited artist with ``collab = 1``; the de-duplicated
    track rows are kept with ``collab = 0``. The combined table is written
    to ``timeline_df.csv``. Its numeric columns are averaged per artist,
    sorted by ``popularity`` (descending), extended with a ``track_count``
    column and written to ``artists_df.csv``.

    Args:
        df: Track table with at least ``release_date``, ``artists``, ``id``
            and ``popularity`` columns. It is not modified.

    Returns:
        None. The two CSV files in the working directory are the output.
    """
    df_timeline = (
        df.set_index(["release_date"])
        .sort_values(by=["release_date"], ascending=True)
        .copy()  # one private copy instead of one per inner iteration
    )
    artists_col = df_timeline.columns.get_loc("artists")
    artist_cells = df_timeline["artists"].tolist()

    per_artist_rows = []
    collab_positions = []
    for pos, cell in enumerate(artist_cells):
        names = _parse_artist_names(cell)
        if len(names) != 1:
            collab_positions.append(pos)
        for artist in names:
            df_timeline.iat[pos, artists_col] = artist
            # Snapshot the row so later writes to the frame cannot alter it.
            per_artist_rows.append(df_timeline.iloc[pos].copy())

    dfreal = pd.DataFrame(per_artist_rows).assign(collab=1)

    # NOTE(review): ``collab_positions`` holds row positions while ``id``
    # holds track ids, so this filter looks like it never matches. A match
    # would also drop every row sharing that release date, because the drop
    # is by index label. Kept as in the original; the intended rule needs
    # confirming before the filter is changed.
    df_timeline.drop(
        df_timeline.loc[df_timeline['id'].isin(collab_positions)].index,
        inplace=True,
    )
    df_timeline.drop_duplicates(subset="id", keep="last", inplace=True)
    df_timeline = df_timeline.assign(collab=0)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df_timeline = pd.concat([df_timeline, dfreal])
    df_timeline["artists"] = df_timeline["artists"].astype(str)

    # numeric_only=True skips text columns such as ``id`` and ``name``,
    # which raise in pandas >= 2.0 when averaged.
    df_artists = (
        df_timeline.groupby(["artists"])
        .mean(numeric_only=True)
        .sort_values(by=["popularity"], ascending=False)
    )
    track_count = df_timeline["artists"].value_counts().rename("track_count")
    # concat aligns on the artist index, so no explicit reindex is required.
    df_artists = pd.concat([df_artists, track_count], axis=1)

    df_artists.to_csv("artists_df.csv", mode="w")
    df_timeline.to_csv("timeline_df.csv", mode="w")
def create_dfs(artist_list):
    """Refresh the stored track, artist and timeline tables.

    Args:
        artist_list: One Spotify artist id as a string, or a list of ids.

    Returns:
        None. Nothing is derived when no tracks were saved (for example
        when ``artist_list`` is empty).
    """
    if isinstance(artist_list, str):
        # A bare id would otherwise be iterated character by character.
        artist_list = [artist_list]
    df_tracks = artist_save(artist_list)
    if df_tracks is None:
        return
    df_artists_timeline(df_tracks)
def query_artist(string = "--"):
    """Print single and collab track counts for every matching artist.

    Reads the module-level ``df_artists`` table, whose index holds artist
    names and which provides ``track_count`` and ``collab`` columns.

    Args:
        string: Text to look for anywhere in the artist name. It is
            interpreted as a regular expression, so characters such as
            ``(`` or ``+`` must be escaped to be matched literally.

    Returns:
        None. One block of text is printed per matching artist.
    """
    pattern = re.compile(string)
    # Plain arrays give positional access without relying on integer keys
    # being treated as positions by a label-indexed Series.
    counts = df_artists["track_count"].to_numpy()
    collab_share = df_artists["collab"].to_numpy()
    for pos, artist in enumerate(df_artists.index):
        if pattern.search(str(artist)) is None:
            continue
        total = counts[pos]
        share = collab_share[pos]
        # Round before int(): the products are floats that can land just
        # below the true whole number (e.g. 8.999...).
        print("Name: ", artist,
              "\nSingles: ", int(round(total * (1 - share))),
              "\nCollabs: ", int(round(total * share)))
def copy_paste_links(list_copy_paste):
    """Process a list of pasted Spotify artist URIs.

    The ``spotify:artist:`` prefix is removed from each entry. Artists with
    at most one top track are reported and skipped; all others are passed
    to ``create_dfs``.

    Args:
        list_copy_paste: Iterable of strings such as
            ``"spotify:artist:<id>"``.

    Returns:
        None.
    """
    artist_ids = [uri.replace("spotify:artist:", "") for uri in list_copy_paste]
    for artist_id in artist_ids:
        # One lookup per artist; the response keeps its tracks under "tracks".
        top_tracks = spotify.artist_top_tracks(artist_id)["tracks"]
        if len(top_tracks) <= 1:
            names = [performer["name"]
                     for item in top_tracks
                     for performer in item["artists"]]
            # With no tracks there are no names to show; fall back to the id.
            print(names or [artist_id], "has/have very few songs")
        else:
            create_dfs(artist_id)

In [8]:
# Load the per-artist table from artists_df.csv and use its first column as
# the index, renamed to "artist". query_artist reads this global
# ``df_artists``.
df_artists = pd.read_csv("artists_df.csv")
df_artists.set_index(df_artists.columns[0], inplace = True)
df_artists.rename_axis("artist", axis = 0, inplace = True)
# --- Enter any substring to query (query_artist treats it as a regular expression) ---
query_artist("Kh")

Name:  Khalid 
Singles:  9 
Collabs:  14
Name:  Wiz Khalifa 
Singles:  1 
Collabs:  3
Name:  Khaled 
Singles:  1 
Collabs:  1
Name:  Khruangbin 
Singles:  6 
Collabs:  10
Name:  DJ Khaled 
Singles:  1 
Collabs:  1
Name:  Khontkar 
Singles:  0 
Collabs:  1


In [7]:
# --- Copy-paste the Spotify URIs of the artists to add ---
# Each entry has the form "spotify:artist:<id>"; copy_paste_links strips the
# "spotify:artist:" prefix before using the id.
copy_paste_links(["spotify:artist:7z5WFjZAIYejWy0NI5lv4T", "spotify:artist:2HPaUgqeutzr3jx5a9WyDV", "spotify:artist:5yy76ufVriyvidNSvXlRU1", "spotify:artist:6qqNVTkY8uBg9cP3Jd7DAH", "spotify:artist:0ErzCpIMyLcjPiwT4elrtZ", "spotify:artist:2cWZOOzeOm4WmBJRnD5R7I"])