# Artist Exploration

## Imports

In [1]:
import sqlite3
import pandas as pd

## Load Data

In [2]:
# Load the 'artists' table into a DataFrame
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file"""
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        print("Connected to database")
        return True, conn
    except sqlite3.Error as e:
        print(f"Failed to connect: {e}")
        return False, None

# path to database file
status, conn  = create_connection("/Users/fried/Documents/DataScience/4season/DSP/Spotify/DSP_project/playlist_analysis.db")

if status:
    df_artists = pd.read_sql_query("SELECT * FROM artists;", conn)
else:
    print("Failed to establish a connection.")

# Display first few rows of the DataFrame to get an overview
df_artists.head()

Connected to database


Unnamed: 0,artist_uri,artist_name,popularity,followers,genres
0,spotify:artist:2wIVse2owClT7go1WT98tk,Missy Elliott,74.0,2180138.0,"dance pop, hip hop, hip pop, pop rap, r&b, rap..."
1,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,82.0,12071720.0,"dance pop, pop"
2,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Beyoncé,88.0,34051083.0,"dance pop, pop, r&b"
3,spotify:artist:31TPClRtHm23RisEBtV3X7,Justin Timberlake,81.0,12752558.0,"dance pop, pop"
4,spotify:artist:5EvFsr3kj42KNv97ZEnqij,Shaggy,75.0,1909526.0,"dance pop, pop rap, reggae fusion"


## Fill missing values

### Fetch data from Spotify API

In [3]:
missing = df_artists.isnull().sum()
print(missing)

artist_uri       0
artist_name      0
popularity     563
followers      563
genres         563
dtype: int64


In [4]:
import requests
import time
import sqlite3
import json

# Connect to the database
def create_connection(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except sqlite3.Error as e:
        print(e)
        return None

# Get all artist URIs from the database
def get_all_artist_uris(conn):
    cur = conn.cursor()
    cur.execute("SELECT artist_uri FROM artists WHERE popularity IS NULL OR followers IS NULL OR genres IS NULL;")
    return cur.fetchall()

# Create Spotify access token
def get_spotify_access_token(client_id, client_secret):
    auth_url = 'https://accounts.spotify.com/api/token'
    auth_response = requests.post(auth_url, {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    })
    auth_response_data = auth_response.json()
    return auth_response_data['access_token'], auth_response_data['expires_in']

# Check if the token is about to expire
def is_token_expired(token_time, expires_in):
    current_time = time.time()
    return current_time - token_time >= expires_in - 10  # 10 seconds buffer

# Alter the artists table
def alter_artists_table(conn):
    try:
        cur = conn.cursor()
        cur.execute(f"ALTER TABLE artists ADD COLUMN last_updated TEXT;")
        conn.commit()
        print(f"Artists table altered successfully. Added column UPDATED.")
    except sqlite3.Error as e:
        print(f"SQL error: {e}")
# Update artist information using batch requests
def update_artists_info_batch(conn, artist_uris, access_token):
    headers = {'Authorization': f'Bearer {access_token}'}
    artist_ids = ','.join([uri.split(":")[-1] for uri in artist_uris])  # Extract the artist IDs from the URIs
    r = requests.get(f"https://api.spotify.com/v1/artists?ids={artist_ids}", headers=headers)

    # Handle rate limiting
    if r.status_code == 429:
        retry_after = int(r.headers.get('Retry-After', 60))
        print(f"Rate limit exceeded. Retrying in {retry_after} seconds.")
        time.sleep(retry_after)
        update_artists_info_batch(conn, artist_uris, access_token)  # Recursive retry
        return

    elif r.status_code != 200:
        print(f"Batch Request Error: Code: {r.status_code} : {r.text}")
        return

    else:
        artists_data = r.json()['artists']
        cur = conn.cursor()
        for artist_data in artists_data:
            popularity = artist_data.get('popularity', None)
            followers = artist_data.get('followers', {}).get('total', None)
            genres = ', '.join(artist_data.get('genres', []))
            artist_uri = f"spotify:artist:{artist_data['id']}"
            
            cur.execute("UPDATE artists SET popularity = ?, followers = ?, genres = ? WHERE artist_uri = ?;",
                        (popularity, followers, genres, artist_uri))
            conn.commit()
            print(f"Updating artist: {artist_uri}")
            print(f"Data received from API: {artist_data}")


# Main function
if __name__ == '__main__':
    # Initialize
    conn = create_connection('playlist_analysis.db')
    client_id = '954461108b34465b8c6e33d9f2b20f95'
    client_secret = '80bf594500674a0396c45f57a779c43d'
    access_token, expires_in = get_spotify_access_token(client_id, client_secret)
    token_time = time.time()  # Store the current time
    
    # Get all artist URIs
    all_artist_uris = get_all_artist_uris(conn)
    total_artists = len(all_artist_uris)

    # Initialize rate limiting variables
    rate_limit_window = 25  # seconds
    max_requests = 20  # Maximum number of requests in rate_limit_window
    request_times = []

    # Update artists in batches
    for i in range(0, len(all_artist_uris), 50):
        batch = all_artist_uris[i:i+50]
        
        # Check if the token is about to expire
        if is_token_expired(token_time, expires_in):
            print("Refreshing token...")
            access_token, expires_in = get_spotify_access_token(client_id, client_secret)
            token_time = time.time()

        # Check rate limit
        current_time = time.time()
        request_times = [t for t in request_times if current_time - t < rate_limit_window]
        if len(request_times) >= max_requests:
            sleep_time = rate_limit_window - (current_time - min(request_times))
            print(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds.")
            time.sleep(sleep_time)
        
        update_artists_info_batch(conn, [uri for uri, in batch], access_token)
        request_times.append(time.time())
        
        if (i + len(batch)) % 10 == 0:
            print(f"Processed {i + len(batch)} of {total_artists} artists.")

    conn.close()

Updating artist: spotify:artist:5kPedB70NoaucwCvU7awih
Data received from API: {'external_urls': {'spotify': 'https://open.spotify.com/artist/5kPedB70NoaucwCvU7awih'}, 'followers': {'href': None, 'total': 223}, 'genres': [], 'href': 'https://api.spotify.com/v1/artists/5kPedB70NoaucwCvU7awih', 'id': '5kPedB70NoaucwCvU7awih', 'images': [], 'name': 'Artist Page Redirect Cleanup 2023', 'popularity': 5, 'type': 'artist', 'uri': 'spotify:artist:5kPedB70NoaucwCvU7awih'}
Updating artist: spotify:artist:6udveWUgX4vu75FF0DTrXV
Data received from API: {'external_urls': {'spotify': 'https://open.spotify.com/artist/6udveWUgX4vu75FF0DTrXV'}, 'followers': {'href': None, 'total': 1263606}, 'genres': ['k-pop', 'korean r&b'], 'href': 'https://api.spotify.com/v1/artists/6udveWUgX4vu75FF0DTrXV', 'id': '6udveWUgX4vu75FF0DTrXV', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab6761610000e5eb496189630cd3cb0c7b593fee', 'width': 640}, {'height': 320, 'url': 'https://i.scdn.co/image/ab67616100005174

In [5]:
# path to database file
conn = create_connection("playlist_analysis.db")

if conn is not None:
    df_artists = pd.read_sql_query("SELECT * FROM artists;", conn)
else:
    print("Failed to establish a connection.")

missing = df_artists.isnull().sum()
print(missing)

artist_uri       0
artist_name      0
popularity     563
followers      563
genres         563
dtype: int64
