# IMPORTS, CLASSES, FUNCTIONS, AUTHENTICATION, SETUP

In [5]:
# Credentials are read from the process environment; load_dotenv() is called
# first so values kept in a local .env file are available to os.getenv().
from dotenv import load_dotenv
import os
load_dotenv()
# os.getenv returns None when a variable is missing, in which case the
# credentials manager below receives None for that argument.
spotify_client_id = os.getenv("SPOTIFY_CLIENT_ID")
spotify_client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# Module-level Spotify client shared by Prepare_data and artist_save.
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(spotify_client_id, spotify_client_secret))
# NOTE(review): plt and st are not referenced by any code visible in this
# section; they may be used by later cells.
from matplotlib import pyplot as plt
from scipy import stats as st
import pandas as pd
import numpy as np
import re
class Prepare_data:
    """Turn a track listing into a flat pandas DataFrame, one row per track.

    ``tracks_artist`` must be a mapping with a ``'tracks'`` entry holding
    track dictionaries (``artist_save`` passes the result of
    ``spotify.artist_top_tracks``).
    """

    def __init__(self, tracks_artist):
        self.tracks_artist = tracks_artist

    def to_dataframe(self):
        """Return a DataFrame with audio features and metadata for each track.

        Columns, in order: the twelve audio features, ``popularity``,
        ``explicit``, ``id``, ``name``, ``release_date``, ``artists`` (list of
        artist names) and ``available_markets`` (number of markets of the
        track's album).

        Two requests are made through the module-level ``spotify`` client for
        every track: one for its audio features, one for its album.
        """
        feature_names = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
        track_fields = ['popularity', 'explicit', 'id', 'name']
        album_fields = ['release_date']

        rows = []
        for item in self.tracks_artist['tracks']:
            # NOTE(review): if the API returns None for this track's features,
            # the lookups below raise TypeError -- confirm whether such tracks
            # can occur and how they should be handled.
            raw_features = spotify.audio_features(item['id'])[0]
            features = {name: raw_features[name] for name in feature_names}
            album_info = {name: item['album'][name] for name in album_fields}
            performer_names = [performer['name'] for performer in item['artists']]
            markets = spotify.album(item['album']['id'])['available_markets']
            basics = {name: item[name] for name in track_fields}

            # The merge order fixes the column order of the resulting frame.
            rows.append({
                **features,
                **basics,
                **album_info,
                "artists": performer_names,
                'available_markets': len(markets),
            })
        return pd.DataFrame(rows)
def dummy_key(df):
    """Replace the ``key`` column with indicator columns ``key: 0`` .. ``key: 10``.

    The twelve possible key values 0-11 are one-hot encoded and the column for
    key 11 is dropped, so key 11 is the reference category (a row of zeros).

    Parameters:
        df: DataFrame containing a ``key`` column.

    Returns:
        A new DataFrame with ``key`` removed and eleven integer 0/1 columns
        appended in ascending key order. The index of ``df`` is preserved and
        ``df`` itself is not modified.

    Notes:
        Values outside 0-11 (for example -1) produce an all-zero row instead
        of raising, which makes them indistinguishable from key 11.
    """
    pitch_classes = list(range(12))
    # A fixed category list guarantees every column exists even when a key
    # never occurs in this frame; building the Series on df.index keeps the
    # rows aligned for the concat below regardless of how df is indexed.
    keys = pd.Series(
        pd.Categorical(df["key"], categories=pitch_classes),
        index=df.index,
    )
    indicators = pd.get_dummies(keys, dtype=int)
    indicators.columns = ["key: " + str(i) for i in pitch_classes]
    # Drop one level to avoid perfectly collinear indicator columns.
    indicators = indicators.drop("key: 11", axis=1)
    return pd.concat([df.drop("key", axis=1), indicators], axis=1)
def dummy_time_signature(df):
    """Replace ``time_signature`` with a 0/1 indicator column ``time_signature: 4``.

    Parameters:
        df: DataFrame containing a ``time_signature`` column.

    Returns:
        A new DataFrame without ``time_signature`` and with an integer column
        ``time_signature: 4`` appended: 1 where the time signature equals 4,
        0 everywhere else. ``df`` itself is not modified.
    """
    # Compare every row (rather than filtering to the matching rows) so that
    # non-matching rows become 0 instead of turning into NaN after the concat.
    is_four = (df["time_signature"] == 4).astype(int)
    is_four.name = "time_signature: 4"
    return pd.concat([df.drop("time_signature", axis=1), is_four], axis=1)
def normalize_ordered_data(df, feature):
    """Replace a numeric column with its bin number (1-6) after z-scoring.

    The column is standardised with its own mean and sample standard
    deviation, the z-score range is split into six equal-width bins, and each
    row receives the 1-based number of the bin it falls into.

    Parameters:
        df: DataFrame containing ``feature``.
        feature: name of the numeric column to bin.

    Returns:
        A new DataFrame in which ``feature`` has been removed from its
        original position and re-appended as the last column holding the bin
        numbers. The index of ``df`` is preserved and ``df`` itself is not
        modified.

    Notes:
        When the column has no spread (constant values, a single row, or no
        rows) the z-scores are defined as 0, so every row lands in the same
        bin instead of ``np.histogram`` raising on a NaN range.
    """
    values = df[feature]
    spread = values.std()
    if np.isfinite(spread) and spread > 0:
        z_feature = (values - values.mean()) / spread
    else:
        z_feature = pd.Series(0.0, index=values.index)

    _, z_edges = np.histogram(z_feature, bins=6)
    # Nudge the last edge up so the maximum falls into bin 6 rather than an
    # overflow bin 7 (np.digitize treats the right edge as exclusive).
    z_edges[-1] = z_edges[-1] + 0.000001

    # Build the result on df.index so the concat aligns row by row even when
    # df does not carry a default RangeIndex.
    binned_feature = pd.Series(
        np.digitize(z_feature, z_edges), index=df.index, name=feature
    )
    return pd.concat([df.drop(feature, axis=1), binned_feature], axis=1)
def normalize_numeric_data(df):
    """Scale ``available_markets`` and ``popularity`` in place and return ``df``.

    ``available_markets`` is divided by 78, or by the column's own maximum
    when that maximum exceeds 78. ``popularity`` is divided by 100.

    NOTE(review): 78 is presumably the number of Spotify markets at the time
    this was written -- confirm.
    """
    markets = df["available_markets"]
    largest = markets.max()
    divisor = 78 if largest <= 78 else largest
    df["available_markets"] = markets / divisor
    df["popularity"] = df["popularity"] / 100
    return df
def dummy_boolean_value(df, feature):
    """Convert the boolean column ``feature`` to numbers in place; return ``df``.

    Each value is multiplied by 1, so ``True`` becomes 1 and ``False`` becomes 0.
    """
    def _as_number(flag):
        return flag * 1

    df[feature] = df[feature].apply(_as_number)
    return df
def duration_ms_to_min(df):
    """Swap ``duration_ms`` for ``duration_min``, the length in whole minutes.

    The millisecond value is divided by 60000, rounded to the nearest integer
    and appended as the last column. ``duration_ms`` is dropped from the
    passed-in frame in place; the returned frame is a new object.
    """
    minutes = (df["duration_ms"] / 60000).round().astype(int).rename("duration_min")
    df.drop("duration_ms", axis=1, inplace=True)
    return pd.concat([df, minutes], axis=1)
def preprocess(df):
    """Run the feature-engineering steps on a raw track frame and return the result.

    Steps, in order: bin ``tempo`` and ``loudness``, scale
    ``available_markets`` and ``popularity``, convert ``explicit`` to 0/1,
    and replace ``duration_ms`` with ``duration_min``.
    """
    for ordered_feature in ("tempo", "loudness"):
        df = normalize_ordered_data(df, ordered_feature)
    # The one-hot encodings below are currently switched off.
    #df = dummy_key(df)
    #df = dummy_time_signature(df)
    scaled = normalize_numeric_data(df)
    flagged = dummy_boolean_value(scaled, "explicit")
    return duration_ms_to_min(flagged)
def artist_save(artist_list):
    """Fetch, preprocess and save the top tracks of every artist to ``tracks_df.csv``.

    For each artist ID the top tracks are requested through the module-level
    ``spotify`` client, converted with ``Prepare_data`` and ``preprocess``,
    and merged into one frame. Tracks that appear more than once (same
    ``id``) are kept only the first time they are seen.

    Parameters:
        artist_list: iterable of Spotify artist IDs. When it is empty nothing
            is fetched and no file is written.

    Side effects:
        ``tracks_df.csv`` in the current working directory is overwritten
        after every artist, so an error part-way through still leaves the
        tracks collected so far on disk.
    """
    collected = None
    for artist_id in artist_list:
        top_tracks = spotify.artist_top_tracks(artist_id)
        ready_df = preprocess(Prepare_data(top_tracks).to_dataframe())
        # "First artist" is decided by position, not by comparing IDs, so a
        # repeated ID later in the list cannot reset the accumulated data.
        if collected is None:
            collected = ready_df
        else:
            # Accumulate in memory instead of re-reading the CSV each time;
            # pd.concat replaces DataFrame.append, removed in pandas 2.0.
            collected = pd.concat([collected, ready_df], ignore_index=True)
            collected = collected.drop_duplicates(subset="id", keep="first")
        collected.to_csv("tracks_df.csv", mode="w", index=False)

In [6]:
# Spotify artist IDs to collect, kept as three batches of 30.
artist_list = ["085pc2PYOi8bGKj0PNjekA", "738wLrAtLtCtFOLvQBXOXp", "1i8SpTcr7yvPOmcqrbnVXY", "1vCWHaC5f2uS3yhpwWbIA6", "66CXWjxzNUsdJxJ2JdwvnR", "3Isy6kedDrgPYoTS1dazA9", "0z4gvV4rjIZ9wHck67ucSV", "6MF9fzBmfXghAz953czmBC", "6vXTefBL93Dj5IqAWq6OTv", "04gDigrS5kc9YWfZHwBETP", "07YZf4WDAMNwqr4jfgOZ8y", "0c173mlxpT3dSFRgMO8XPh", "4xRYI6VqpkE3UwrDrAZL8L", "0jnsk9HBra6NMjO2oANoPY", "3sgFRtyBnxXD5ESfmbK4dl", "1uNFoZAHBGtllmzznpCI3s", "7dGJo4pcD2V6oG8kP0tJRR", "6eUKZXaKkcviH0Ku9w2n3V", "6vWDO969PvNqNYHIOW5v0m", "246dkjvS1zLTtiykXe5h60", "5pKCCKE2ajJHZ9KAiaK11H", "1Cs0zKBU1kc0i8ypK3B9ai", "0hCNtLu0JehylgoiP8L4Gh", "6MDME20pz9RveH9rEXvrOM", "0Y5tJX1MQlPlqiwlOH1tJY", "4nDoRrQiYLoBzwC5BhVJzF", "2wY79sveU1sp5g7SokKOiI", "1Xyo4u8uXC1ZmMpatF05PJ", "0TnOYISbd1XYRBk9myaseg", "4IJczjB0fJ04gs4uvP0Fli"]
artist_list2 = ["6XyY86QOPPrYVGvF9ch6wz", "3nFkdlSjzX9mRTtwJOzDYB", "0L8ExT028jH3ddEcZwqJJ5", "7Ln80lUS6He07XvHI8qqHH", "7oPftvlwr6VrsViSDV7fJY", "6olE6TJLqED3rqDCT0FyPh", "6FBDaR13swtiWwGhX1WQsP", "5LfGQac0EIXyAN8aUwmNAQ", "7jy3rLJdDQY21OgRLCZ9sD", "0PhqM7UAxtvWYi5j4MwxSl", "1Hy8tdpH9ygqoohumHZqVl", "0Aoou2kIWXrzTOfhFzmNqa", "5K4W6rqBFWDnAN6FQUkS6x", "7IgzsWd4tWIn70uh6dAq8d", "4DG2pTwQBor7a6wtoEABau", "0Ty63ceoRnnJKVEYP0VQpk", "0NSO0g40h9CTj13hKPskeb", "5ixQSDvAMa5O758xG8MWXT", "0oSGxfWSnnOXhD2fKuz2Gy", "07XSN3sPlIlB2L2XNcTwJw", "2Hkut4rAAyrQxRdof7FVJq", "2cnMpRsOVqtPMfq7YiFE6K", "36QJpDe2go2KgaRleHCDTp", "3Z814BstLaKbLqKHXsht8O", "4XP7cGw4t8BqZ8Du5q3bHg", "4q2SZIdLq6YTc9cZLCclWc", "7rEIUw67hRTgievwuKQGSj", "7CW2eGwAuElNq09rVtZYsM", "3nF0yXDatdq9xV279nEs5X", "1g4J8P1JWwanNyyXckRX5W"]
artist_list3 = ["6RTC1abMgBC7Krg6qJQHJh", "7gobcoscOfsY0nOeqqFzvU", "2QslFlDyZVpLYwfqyRDkNs", "1DFr97A9HnbV3SKTJFu62M", "0nmQIMXWTXfhgOBdNzhGOs", "2ye2Wgw4gimLv2eAKyk1NB", "3qm84nBOXUEQ2vnTfUTTFC", "711MCceyCBcFnzjGY4Q7Un", "776Uo845nYHJpNaStv1Ds4", "64tNsm6TnZe2zpcMVMOoHL", "5gFPi3KWXEwA9bLEO47Ow0", "6LnJKrtFnTEGdbWQ2riWCL", "2h1EEQ0lD01lPKDRGisvL4", "1nFJWl30l3PAHei86OxN0i", "1vWlMjGmurAKBMOOMW5yMD", "0FUsrstJwmg4WVHQMTYuUA", "2kS0jWMkkFBL0mrl0VotD0", "757aE44tKEUQEqRuT6GnEB", "5NGO30tJxFlKixkPSgXcFE", "15UsOTVnJzReFVN1VCnxy4", "0du5cEVh5yTK9QJze8zA0C", "2yMN0IP20GOaN6q0p0zL5k", "2RQ8NtUmg5y6tfbvCwX8jI", "4W31XN2JH8mC54NkHdh04s", "1PpRPZXSS5ka7m5NW2TO7q", "0wCKNMsqYasJBFVagjay49", "08mjMUUjyTchMHCW7evc3R", "1PpRPZXSS5ka7m5NW2TO7q", "21E3waRsmPlU7jZsS13rcj", "0C8ZW7ezQVs4URX5aX7Kqx"]
# Merge the batches and drop repeated IDs. dict.fromkeys keeps the first
# occurrence of each ID in its original position, so the processing order
# (and therefore the row order of the saved CSV) is the same on every run;
# list(set(...)) would yield an order that can change between runs.
artist_list = list(dict.fromkeys(artist_list + artist_list2 + artist_list3))

In [7]:
# Download and preprocess the top tracks of every artist in artist_list and
# write the combined table to tracks_df.csv (one Spotify request per artist
# plus two per track, so this cell needs network access and valid credentials).
artist_save(artist_list)