#Load Modules

In [77]:
from dotenv import load_dotenv
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pycountry
from collections import defaultdict
import pandas as pd
import time
from spotipy.exceptions import SpotifyException
from collections import Counter

#Log in Spotify API

In [80]:
# SECURITY: never hard-code API credentials in a notebook -- anything committed
# here is readable by everyone with access to the file. The client ID/secret
# that used to be embedded in this cell should be rotated in the Spotify
# developer dashboard.
from getpass import getpass

# Only create .env when it is missing, so existing credentials are never overwritten.
if not os.path.exists(".env"):
    # Prompt without echoing so the secret does not end up in the notebook output.
    spotipy_client_id = getpass("Spotify client ID: ").strip()
    spotipy_client_secret = getpass("Spotify client secret: ").strip()
    with open(".env", "w", encoding="utf-8") as env_file:
        env_file.write(f"SPOTIPY_CLIENT_ID={spotipy_client_id}\n")
        env_file.write(f"SPOTIPY_CLIENT_SECRET={spotipy_client_secret}\n")

In [82]:
# Read the key=value pairs from the local .env file into the process environment
load_dotenv()

# Look up the Spotify credentials that are now available as environment variables
client_id = os.environ.get("SPOTIPY_CLIENT_ID")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET")

# Build the client-credentials auth manager first, then hand it to the Spotify client
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [84]:
# Smoke-test the login: fetch five featured playlists and print each as "name - id"
featured_playlists = sp.featured_playlists(limit=5)
for playlist in featured_playlists['playlists']['items']:
    print(f"{playlist['name']} - {playlist['id']}")

Éxitos España - 37i9dQZF1DXaxEKcoCdWHD
PEGAO - 37i9dQZF1DX1HCSfq0nSal
Viva Latino - 37i9dQZF1DX10zKzsJ2jva
Viral España 2024 - 37i9dQZF1DWVJv1UsWItkB
Pop con Ñ - 37i9dQZF1DX3sCT1ItXgNd


#Data wrangling

In [87]:
def get_country_codes():
    """
    Flatten the hand-picked regions mapping into a single list of country codes.

    Returns:
    - country_codes (list): ISO country codes, in the order the regions
                            (and their sub-regions) are declared below.
    """
    # A region maps either straight to one country code, or to a dict of
    # sub-region -> country code when the region is split up further.
    regions = {
        "North America": 'US',
        "South America": 'BR',
        "Europe": {
            "Western Europe": 'DE',
            "Northern Europe": 'GB',
            "Southern Europe": 'IT',
        },
        "Africa": {
            "North Africa": 'EG',
            "Sub-Saharan Africa": 'NG',
        },
        "Oceania": 'AU',
        "Asia": {
            "Southeast Asia": 'ID',
            "South Asia": 'IN',
        },
        "Middle East": 'SA',
    }

    # Treat a bare code as a one-element group so both shapes flatten the same way.
    return [
        code
        for entry in regions.values()
        for code in (entry.values() if isinstance(entry, dict) else (entry,))
    ]

def get_top_playlist_id_for_country(sp, country_code):
    """
    Search for the "Top 50" playlist of the specified country and return its ID.

    Parameters:
    - sp (Spotify): Authenticated Spotify client object.
    - country_code (str): ISO country code to search for the top playlist.

    Returns:
    - playlist_id (str): ID of the first usable playlist in the search result,
                         or None if the request failed or nothing was found.
    """
    # NOTE(review): the query uses the raw ISO code (e.g. "Top 50 - ID"), not the
    # country name -- confirm the search really resolves to the intended chart.
    try:
        results = sp.search(q=f"Top 50 - {country_code}", type="playlist", limit=1)
    except Exception as e:
        # Best effort: one failing country must not abort the whole collection run.
        print(f"Error fetching playlist for {country_code}: {e}")
        return None

    # Tolerate a missing/None 'playlists' section, an empty item list and None
    # slots inside it instead of surfacing them as IndexError/TypeError.
    items = ((results or {}).get('playlists') or {}).get('items') or []
    for playlist in items:
        if playlist and playlist.get('id'):
            return playlist['id']

    print(f"No playlist found for {country_code}")
    return None

def get_artists_from_playlist(sp, playlist_id, country_code, max_artists=25):
    """
    Extract unique artists from a playlist and return a list of artist data dictionaries.

    Artists are collected in playlist order (track by track, then in the order
    they are credited on each track). Each artist's details are requested only
    once, and collection stops as soon as `max_artists` unique artists are found.

    Parameters:
    - sp (Spotify): Authenticated Spotify client object.
    - playlist_id (str): Spotify playlist ID to retrieve tracks from.
    - country_code (str): ISO country code associated with the playlist for tagging the artists.
    - max_artists (int): Maximum number of unique artists to return (default 25).

    Returns:
    - artists_list (list): List of dictionaries, each containing information about a unique artist from the playlist.
                           Each dictionary has keys 'artist_id', 'name', 'popularity_rating', 'genre', and 'country_code'.
                           An empty list is returned if any request or lookup fails.
    """
    artists = {}
    try:
        playlist_tracks = sp.playlist_tracks(playlist_id, limit=50)

        for item in playlist_tracks['items']:
            track = item['track']
            if not track:
                # Playlist slot without track data; nothing to read.
                continue

            for artist in track['artists']:
                # Stop before issuing another lookup once enough artists are collected.
                if len(artists) >= max_artists:
                    return list(artists.values())

                artist_id = artist['id']
                # Skip artists without an ID (they cannot be looked up) and artists
                # already collected, so each artist costs at most one API call.
                if not artist_id or artist_id in artists:
                    continue

                artist_details = sp.artist(artist_id)
                genres = artist_details['genres']
                artists[artist_id] = {
                    'artist_id': artist_id,
                    'name': artist['name'],
                    'popularity_rating': artist_details['popularity'],
                    'genre': genres[0] if genres else '',  # Only the first genre
                    'country_code': country_code
                }

        return list(artists.values())[:max_artists]
    except Exception as e:
        # Best effort: a failing playlist yields no artists rather than aborting the run.
        print(f"Error processing playlist {playlist_id}: {e}")
        return []

def collect_data(sp, country_codes):
    """
    Gather artist, country, and festival rows for every country code via its top playlist.

    Countries for which no playlist ID is found contribute no rows.

    Parameters:
    - sp (Spotify): Authenticated Spotify client object.
    - country_codes (list): List of ISO country codes to collect data for.

    Returns:
    - artist_data (list): List of dictionaries containing artist information.
    - country_data (list): List of dictionaries containing artist-to-country mapping.
    - festival_data (list): List of dictionaries containing country-to-artist name mapping for festival data.
    """
    artist_rows = []
    country_rows = []
    festival_rows = []

    for code in country_codes:
        top_playlist = get_top_playlist_id_for_country(sp, code)
        if not top_playlist:
            # Nothing to collect when the playlist lookup came back empty.
            continue

        for entry in get_artists_from_playlist(sp, top_playlist, code):
            # Full artist record
            artist_rows.append(entry)

            # Artist ID -> country link
            country_rows.append(
                dict(artist_id=entry['artist_id'], country_code=entry['country_code'])
            )

            # Festival rows are keyed by artist name rather than artist ID
            festival_rows.append(
                dict(country_code=entry['country_code'], artist_name=entry['name'])
            )

    return artist_rows, country_rows, festival_rows

def save_to_csv(artist_data, country_data, festival_data):
    """
    Save the collected artist, country, and festival data to CSV files.

    Exact duplicate rows are dropped from each table before it is written.

    Parameters:
    - artist_data (list): List of dictionaries containing artist information.
    - country_data (list): List of dictionaries containing artist-to-country mapping.
    - festival_data (list): List of dictionaries containing country-to-artist name mapping for festival data.

    Output:
    - Three CSV files ('artist.csv', 'country.csv', 'festival.csv') saved in the current directory.

    Returns:
    - (artist_df, country_df, festival_df): The de-duplicated DataFrames that were
      written, so callers can keep working with them without re-reading the files.
    """
    frames = {
        'artist': pd.DataFrame(artist_data).drop_duplicates(),
        'country': pd.DataFrame(country_data).drop_duplicates(),
        'festival': pd.DataFrame(festival_data).drop_duplicates(),
    }

    # Each table is written to "<table name>.csv" without the index column.
    for table_name, frame in frames.items():
        frame.to_csv(f'{table_name}.csv', index=False)
    print("CSV files created for artist, country, and festival tables.")

    return frames['artist'], frames['country'], frames['festival']

# Get country codes
country_codes = get_country_codes()

# Collect data
artist_data, country_data, festival_data = collect_data(sp, country_codes)

# Save data to CSV files
save_to_csv(artist_data, country_data, festival_data)

# The following cells use artist_df / country_df / festival_df, which are not
# defined at notebook scope by the calls above, so load them from the files just
# written. Reading from CSV also turns blank genres into NaN, which is what the
# later dropna() cleaning step removes.
artist_df = pd.read_csv('artist.csv')
country_df = pd.read_csv('country.csv')
festival_df = pd.read_csv('festival.csv')

CSV files created for artist, country, and festival tables.


In [None]:
# Render the artist, country and festival tables, in that order
for table in (artist_df, country_df, festival_df):
    display(table)

#EDA

In [25]:
# DataFrame cleaning: drop every artist row that has at least one missing value,
# then persist the cleaned table for the later analysis cells
artist_df_cleaned = artist_df.dropna(axis=0, how='any')
artist_df_cleaned.to_csv(path_or_buf='artist_cleaned.csv', index=False)


In [43]:
# Read the previously saved tables back from the CSV files in the working directory
artist_df = pd.read_csv(filepath_or_buffer='artist_cleaned.csv')
country_df = pd.read_csv(filepath_or_buffer='country.csv')
festival_df = pd.read_csv(filepath_or_buffer='festival.csv')
top_genres_df = pd.read_csv(filepath_or_buffer='top_genres_cleaned.csv')

In [51]:
# Collect every artist's genre into one list per country (from the cleaned artist table)
genres_by_country_cleaned = artist_df_cleaned.groupby('country_code')['genre'].apply(list)

# Keep only the names of the three most frequent genres for each country
top_genres_by_country_cleaned = {
    country: [entry[0] for entry in Counter(genres).most_common(3)]
    for country, genres in genres_by_country_cleaned.items()
}

# One row per country, with its top genres stored as a list in a single column
top_genres_list_df_cleaned = pd.DataFrame([
    dict(country_code=country, top_genres=genres)
    for country, genres in top_genres_by_country_cleaned.items()
])

In [53]:
# Saving cleaned top_genres df
#top_genres_list_df_cleaned.to_csv('top_genres_cleaned.csv', index=False)

In [39]:
# Display the top-genres table: one row per country_code with its top_genres list
top_genres_list_df_cleaned

Unnamed: 0,country_code,top_genres
0,AU,"[contemporary country, classic oklahoma countr..."
1,BR,"[hip hop, canadian contemporary r&b, art pop]"
2,DE,"[art pop, hip hop, canadian contemporary r&b]"
3,EG,"[arab pop, egyptian hip hop, egyptian viral pop]"
4,GB,"[glam rock, adult standards, classic rock]"
5,ID,"[italian hip hop, italian pop, italian adult pop]"
6,IN,"[contemporary country, modern country pop, dfw..."
7,IT,"[art pop, hip hop, canadian contemporary r&b]"
8,NG,"[art pop, hip hop, canadian contemporary r&b]"
9,SA,"[art pop, hip hop, canadian contemporary r&b]"


In [89]:
# Order rows by country and genre (ascending), with the most popular artists first
sort_columns = ['country_code', 'genre', 'popularity_rating']
top_artists_sorted = artist_df_cleaned.sort_values(by=sort_columns, ascending=[True, True, False])

# Within each (country, genre) group keep at most the first 15 rows of that ordering
top_artists_by_genre_country = top_artists_sorted.groupby(by=['country_code', 'genre']).head(n=15)

# Show the resulting table, organized by country and genre
top_artists_by_genre_country

Unnamed: 0,artist_id,name,popularity_rating,genre,country_code
195,3oVcF3VdpMuvMvLLyHPT4t,Sierra Ferrell,65,alternative americana,AU
187,32vWCbZh0xZ4o9gkz4PsEU,Dolly Parton,75,classic country pop,AU
191,1UTPBmNbXNTittyMJrNkvw,Blake Shelton,73,classic oklahoma country,AU
199,3THMgU4KdL7LlO5TEREs2g,Joe Diffie,61,classic oklahoma country,AU
180,4oUHIQIBe0LHzYfvXNW4QM,Morgan Wallen,91,contemporary country,AU
186,718COspgdWOnwOFpJHRZHS,Luke Combs,87,contemporary country,AU
176,4YLtscXsxbVgi031ovDDdh,Chris Stapleton,83,contemporary country,AU
188,6tPHARSq45lQ8BSALCfkFC,Lainey Wilson,76,contemporary country,AU
196,5QNm7E7RU2m64l6Gliu8Oy,HARDY,75,contemporary country,AU
177,6roFdX1y5BYSbp60OTJWMd,Tim McGraw,73,contemporary country,AU
