**OBJECTIVE:** use the requests library to collect data about the top 100 artists from the Spotify API and save the individual raw JSON responses to files in the data/raw/ directory.

**AUTHOR:** Matthew Thoomkuzhy

**LAST EDITED:** *25/11/2024*

---

# Data Collection

* This notebook documents my process of collecting data using the spotify API to collect data regarding the relationships of the top 100 listened to artists of 2023.

* The top 100 listened to artists of 2023 can be found [here](https://chartmasters.org/spotify-most-streamed-artists-of-2023/).

* I have used Chat-GPT to automatically turn this information into a list for data input.


### import necessary modules:

In [2]:
import requests
import dotenv
from dotenv import load_dotenv
import os
import base64
from requests import post
import json

top_100_artists = [
    "Taylor Swift",
    "The Weeknd",
    "Drake",
    "Bad Bunny",
    "Ed Sheeran",
    "Justin Bieber",
    "Ariana Grande",
    "Eminem",
    "Billie Eilish",
    "Post Malone",
    "BTS",
    "J Balvin",
    "Kanye West",
    "Rihanna",
    "Doja Cat",
    "Olivia Rodrigo",
    "Dua Lipa",
    "Travis Scott",
    "Kendrick Lamar",
    "Lil Nas X",
    "Shawn Mendes",
    "Harry Styles",
    "Maroon 5",
    "Imagine Dragons",
    "Cardi B",
    "Juice WRLD",
    "Lil Baby",
    "SZA",
    "Future",
    "The Kid LAROI",
    "Badshah",
    "ANITTA",
    "Camila Cabello",
    "Selena Gomez",
    "Halsey",
    "Nicki Minaj",
    "Lil Wayne",
    "21 Savage",
    "Megan Thee Stallion",
    "Khalid",
    "Lizzo",
    "Billie Eilish",
    "J. Cole",
    "Adele",
    "Bruno Mars",
    "Chris Brown",
    "Marshmello",
    "DJ Snake",
    "Daddy Yankee",
    "Maluma",
    "Ozuna",
    "Karol G",
    "Becky G",
    "Nicky Jam",
    "Sech",
    "Myke Towers",
    "Rauw Alejandro",
    "Farruko",
    "Jhay Cortez",
    "Lunay",
    "Tainy",
    "Arcangel",
    "Bryant Myers",
    "De La Ghetto",
    "Yandel",
    "Wisin",
    "Zion & Lennox",
    "Natti Natasha",
    "Manuel Turizo",
    "Reik",
    "CNCO",
    "Sebastián Yatra",
    "Morat",
    "Piso 21",
    "Mau y Ricky",
    "Camilo",
    "Kany García",
    "Ricardo Arjona",
    "Alejandro Sanz",
    "Pablo Alborán",
    "Luis Fonsi",
    "Enrique Iglesias",
    "Shakira",
    "Thalía",
    "Paulina Rubio",
    "Gloria Trevi",
    "Alejandra Guzmán",
    "LA INDIA",
    "Ivy Queen",
    "Celia Cruz",
    "Marc Anthony",
    "Romeo Santos",
    "Prince Royce",
    "Aventura",
    "Gente de Zona",
    "Chino & Nacho",
    "Wisin & Yandel",
    "Plan B",
    "Jowell & Randy",
    "Alexis & Fido"
]  

___

### Activate API token:
Next I am going to create a function which allows me to quickly obtain my API token so I can easily pass it as a parameter into functions in the future


In [None]:


# Load environment variable
load_dotenv(dotenv_path="../.env") 
CLIENT_ID = os.getenv("SPOTIFY_CLIENT_ID")
CLIENT_SECRET = os.getenv("SPOTIFY_CLIENT_SECRET")

if not CLIENT_ID or not CLIENT_SECRET:
    raise EnvironmentError("CLIENT_ID or CLIENT_SECRET is not set. Check your .env file.")

def get_token():
    # Encode CLIENT_ID and CLIENT_SECRET in Base64
    auth_string = CLIENT_ID + ":" + CLIENT_SECRET
    auth_bytes = auth_string.encode("utf-8")
    auth_base64 = str(base64.b64encode(auth_bytes), "utf-8")
    
    # Define the URL and headers
    base_url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    
    # Define the data payload
    data = {"grant_type": "client_credentials"}
    

    result = requests.post(base_url, headers=headers, data=data)
    if result.status_code != 200:
        raise Exception(f"Failed to get token: {result.status_code}, {result.text}")
    
    # Parse the JSON response
    json_result = result.json()  # Parse response as JSON
    token = json_result.get("access_token")
    if not token:
        raise Exception("Failed to retrieve access token.")
    return token

# Retrieve token
token = get_token()

def get_auth_header(token):
    return {"Authorization": "Bearer " + token}

print("Token retrieved successfully!")


Token retrieved successfully!


---

### Collecting General Artist Data:
* I created and called a function which allows me to input a list of artists. 
* Then extract general information for each artist in the list. 
* Then concatenate the information into a larger combined JSON.

In [None]:
def collect_artists_data(token, artists, file_path):
    base_search_url = "https://api.spotify.com/v1/search?"
    headers = get_auth_header(token)

    # Dictionary to hold  artist data
    all_artists_data = {}

    for artist in artists:
        # Construct the request URL for the current artist
        request = f"q={artist}&type=artist&limit=1"
        request_url = base_search_url + request
        result = requests.get(request_url, headers=headers)
        json_result = json.loads(result.content)
        all_artists_data[artist] = json_result

    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Write the concatenated JSON to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(all_artists_data, file, ensure_ascii=False, indent=4)

    print(f"Data for {len(artists)} artists saved to {file_path}")

token = get_token()

  

# Code I ran to Obtain file:
file_path = "../data/raw/T100_artists_data.json"
collect_artists_data(token, top_100_artists, file_path)


Data for 100 artists saved to ../data/raw/T100_artists_data1.json


---

### *Further collection*- Related T100 artists of T100 artists:

* I am going to be finding the number of other top 100 artists present in the related artists for each of the top 100 artists.

* This data can be useful for establishing relationships/ idenitfying patterns within the top 100 Artists

##### Note to marker:
* For this data my endpoint was depracated but I was able to collect the data beforehand, so this code no longer works but the files have been collected

In [None]:

def collect_related_artist_counts(token, artists, file_path):
    base_search_url = "https://api.spotify.com/v1/search?"
    related_artists_url = "https://api.spotify.com/v1/artists/{id}/related-artists"
    headers = get_auth_header(token)

    # Dictionary to hold analysis results
    related_artists_analysis = {}

    for artist in artists:
        # Get the artist's ID by searching for  artist
        search_request = f"q={artist}&type=artist&limit=1"
        search_url = base_search_url + search_request

        # GET request to fetch artist ID
        search_result = requests.get(search_url, headers=headers)
        search_json = json.loads(search_result.content)

        if search_json.get('artists') and search_json['artists']['items']:
            artist_id = search_json['artists']['items'][0]['id']
            artist_name = search_json['artists']['items'][0]['name']

            # Using the artist ID to fetch related artists
            related_url = related_artists_url.format(id=artist_id)
            related_result = requests.get(related_url, headers=headers)
            related_json = json.loads(related_result.content)

            # Parse related artists' names
            related_artists = [ra['name'] for ra in related_json.get('artists', [])]

            # Count matches with the original top 100 list
            matches_with_top_100 = [ra for ra in related_artists if ra in artists]
            match_count = len(matches_with_top_100)

            related_artists_analysis[artist_name] = {
                "artist_id": artist_id,
                "related_artists_count": len(related_artists),
                "matches_with_top_100": matches_with_top_100,
                "match_count": match_count
            }
        else:
            # Handle cases where the artist is not found
            related_artists_analysis[artist] = {
                "error": "Artist not found in Spotify."
            }

    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(related_artists_analysis, file, ensure_ascii=False, indent=4)

    print(f"Related artist analysis for {len(artists)} artists saved to {file_path}")

token = get_token()


# Code ran to obtain file:
T100RA_file_path = "../data/raw/T100_related_artists_count.json"
collect_related_artist_counts(token, top_100_artists, T100RA_file_path)

Related artist analysis for 100 artists saved to ../data/raw/T100_related_artists_count.json


---

### Final Data to collect- top 100 artists top tracks

* I am collecting data about the top 100 artists top tracks because I want to analyse how explicit songs are and whether there are patterns with artists and explicitness

In [None]:
import requests
import json
import os

def get_auth_header(token):
    return {
        "Authorization": f"Bearer {token}"
    }

def collect_artists_top_tracks(token, artists, file_path, market="US"):
    base_search_url = "https://api.spotify.com/v1/search?"
    base_top_tracks_url = "https://api.spotify.com/v1/artists/"
    headers = get_auth_header(token)

    # Create a dictionary to hold all artists' top tracks data
    all_artists_top_tracks = {}

    for artist in artists:
        # Get artist ID using the search endpoint
        search_request = f"q={artist}&type=artist&limit=1"
        search_url = base_search_url + search_request

        search_result = requests.get(search_url, headers=headers)
        search_json = search_result.json()

        # Check if the search returned an artist
        if not search_json.get('artists', {}).get('items'):
            print(f"No artist found for {artist}. Skipping.")
            continue

        artist_id = search_json['artists']['items'][0]['id']

        # Get artist's top tracks using the artist ID
        top_tracks_url = f"{base_top_tracks_url}{artist_id}/top-tracks?market={market}"
        top_tracks_result = requests.get(top_tracks_url, headers=headers)
        top_tracks_json = top_tracks_result.json()

        # Add the top tracks data to the dictionary using the artist's name as the key
        all_artists_top_tracks[artist] = top_tracks_json

    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(all_artists_top_tracks, file, ensure_ascii=False, indent=4)

    print(f"Top tracks for {len(artists)} artists saved to {file_path}")
    
# Code ran to obtain file
file_path = "../data/raw/T100_artists_top_tracks.json"
collect_artists_top_tracks(token, top_100_artists, file_path)


Top tracks for 100 artists saved to ../data/raw/T100_artists_top_tracks.json
