# Part 1A: Data Collection Spotify (Audio Features)

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import requests
import shutil # to save it locally

### Set up Spotify API
To collect data, we will use Spotipy, a Python wrapper for the Spotify Web API. A personal Client ID and Client Secret are used to authorize the API. Additionally, we will create helper functions to look up a playlist's ID and to collect the tracks of a playlist.

In [6]:
# Spotipy client authorised through the client-credentials flow.
# NOTE(review): `client_id` and `client_secret` are not assigned in any visible
# cell; they have to exist in the kernel before this cell runs.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    ),
    requests_timeout=30,
)

In [73]:
def get_playlist_id(playlist_name):
    """Return the Spotify ID of the first playlist found for ``playlist_name``.

    Runs a playlist search through the module-level ``sp`` client and returns
    the ID of the first hit that carries one.

    Parameters
    ----------
    playlist_name : str
        Search query, normally the playlist title.

    Returns
    -------
    str
        ID of the first matching playlist.

    Raises
    ------
    IndexError
        If the search returns no usable playlist. The exception type is the
        same one the bare ``ids[0]`` lookup used to raise, now with a message.
    """
    results = sp.search(q=playlist_name, type='playlist') or {}
    items = (results.get('playlists') or {}).get('items') or []
    for playlist in items:
        # Guard against null entries / entries without an ID in the result list
        # instead of failing on ``None.get``.
        if playlist and playlist.get('id'):
            return playlist.get('id')
    raise IndexError('No playlist found for search query {!r}'.format(playlist_name))


def get_playlist_tracks(playlist_id, n_tracks):
    """Collect track metadata and audio features for a playlist.

    The playlist is read through the module-level ``sp`` client in pages of
    100 tracks. For every page the track, first artist and album fields are
    put into a frame and left-joined with the audio features of those tracks.

    Parameters
    ----------
    playlist_id : str
        Spotify playlist ID.
    n_tracks : int
        Upper bound used for paging: offsets ``0, 100, 200, ...`` below
        ``n_tracks`` are requested. Paging stops early when a page is empty.

    Returns
    -------
    pandas.DataFrame
        One row per playlist entry with the columns ``id``, ``track``,
        ``artist_id``, ``artist``, ``album_id``, ``album``, ``release_date``,
        ``link`` followed by the audio-feature columns. As before, the index
        restarts at 0 for every page. Empty frame if nothing was collected.
    """
    page_size = 100  # page size requested per call
    pages = []
    for offset in range(0, n_tracks, page_size):
        results = sp.playlist_tracks(playlist_id, limit=page_size, offset=offset)
        items = results.get('items') or []
        if not items:
            # n_tracks overshoots the playlist; avoid an audio-features call
            # with an empty ID list.
            break
        # Entries without a 'track' payload cannot be described; skip them
        # rather than failing on ``None.get``.
        tracks = [item.get('track') for item in items if item and item.get('track')]
        if not tracks:
            continue

        # Only the first listed artist is kept, as in the original.
        first_artists = [(track.get('artists') or [{}])[0] for track in tracks]
        albums = [track.get('album') or {} for track in tracks]
        # shape json into dataframe
        df = pd.DataFrame({
            'id': [track.get('id') for track in tracks],
            'track': [track.get('name') for track in tracks],
            'artist_id': [artist.get('id') for artist in first_artists],
            'artist': [artist.get('name') for artist in first_artists],
            'album_id': [album.get('id') for album in albums],
            'album': [album.get('name') for album in albums],
            'release_date': [album.get('release_date') for album in albums],
            'link': [track.get('href') for track in tracks],
        })

        # get audio features, merge with df
        # Request each non-null ID once: a null ID cannot be looked up, and a
        # repeated ID would otherwise multiply rows in the merge below.
        feature_ids = [track_id for track_id in dict.fromkeys(df['id']) if track_id]
        track_features = sp.audio_features(feature_ids) if feature_ids else []
        # Drop null feature rows before building the frame.
        feature_rows = [row for row in (track_features or []) if row]
        if feature_rows:
            df_features = pd.DataFrame(feature_rows).drop_duplicates(subset='id')
            df = df.merge(df_features, on='id', how='left')
        pages.append(df)

    if not pages:
        return pd.DataFrame()
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(pages)

### Identify Playlists for Genres
Scrape data for playlists created by Every Noise at Once. Genres targeted include Dance Pop, Hip Hop, Rock, and Country. The Spotify API requires a Playlist ID to collect tracks, so we retrieve the ID from the playlist name before collecting the tracks.

In [60]:
# Track count per playlist title (the same numbers are passed as n_tracks when
# the playlists are fetched).
playlist_count = {
    "The Sound of Dance Pop": 1133,
    "The Sound of Hip Hop": 969,
    "The Sound of Rock": 1173,
    "The Sound of Country": 609,
}

# Playlist titles, in declaration order.
playlists = list(playlist_count)

In [48]:
# Resolve every playlist title to its Spotify ID through the search helper.
playlist_ids = {name: get_playlist_id(name) for name in playlists}

playlist_ids

{'The Sound of Dance Pop': '2ZIRxkFuqNPMnlY7vL54uK',
 'The Sound of Hip Hop': '6MXkE0uYF4XwU4VTtyrpfP',
 'The Sound of Rock': '7dowgSWOmvdpwNkGFMUs6e',
 'The Sound of Country': '4mijVkpSXJziPiOrK7YX4M'}

In [61]:
# "The Sound of Hip Hop": tracks plus audio features.
hip_hop = get_playlist_tracks(playlist_id='6MXkE0uYF4XwU4VTtyrpfP', n_tracks=969)

In [63]:
# "The Sound of Dance Pop": tracks plus audio features.
dance_pop = get_playlist_tracks(playlist_id='2ZIRxkFuqNPMnlY7vL54uK', n_tracks=1133)

In [65]:
# "The Sound of Rock": tracks plus audio features.
rock = get_playlist_tracks(playlist_id='7dowgSWOmvdpwNkGFMUs6e', n_tracks=1173)

In [66]:
# "The Sound of Country": tracks plus audio features.
country = get_playlist_tracks(playlist_id='4mijVkpSXJziPiOrK7YX4M', n_tracks=609)

### Combine Datasets for each genre
Combine the four datasets into one dataframe. Include a column, **genre**, that identifies each song's genre classification.

In [68]:
# Label every per-genre frame (in place) with its genre, then stack the four
# frames in the order country, hip_hop, rock, dance_pop.
for genre_frame, genre_label in (
    (country, 'country'),
    (hip_hop, 'hip_hop'),
    (rock, 'rock'),
    (dance_pop, 'dance_pop'),
):
    genre_frame['genre'] = genre_label

df_songs = pd.concat([country, hip_hop, rock, dance_pop])

In [69]:
# (rows, columns) of the combined frame.
df_songs.shape

(3884, 26)

In [77]:
# Keep the first occurrence of every track ID. The frames were concatenated as
# country, hip_hop, rock, dance_pop, so a track that appears under several
# genres keeps the label of the earliest one in that order.
df_songs = df_songs.drop_duplicates(subset='id', keep='first')

In [78]:
# (rows, columns) of the song frame at this point.
df_songs.shape

(3831, 26)

In [79]:
# Number of songs per genre label.
df_songs['genre'].value_counts()

rock         1173
dance_pop    1080
hip_hop       969
country       609
Name: genre, dtype: int64

### Export Data

In [80]:
# Write the song table to disk; index=False leaves the DataFrame index out of
# the file.
df_songs.to_csv('../data/audio_features.csv', index=False)

### Artist Data

In [110]:
def get_artist_details(artist_id):
    """Fetch one artist through the module-level ``sp`` client and flatten it.

    Parameters
    ----------
    artist_id : str
        Spotify artist ID.

    Returns
    -------
    dict
        Keys, in order: ``artist_id``, ``artist``, ``image`` (URL of the first
        listed image, ``''`` when there is none), ``genres`` (the genre list
        rendered with ``str``), ``popularity``, ``link`` and ``followers``
        (the ``total`` value, ``None`` when unavailable).
    """
    results = sp.artist(artist_id)
    # Treat a missing or null 'images' / 'followers' field as empty instead of
    # raising TypeError from len(None).
    images = results.get('images') or []
    followers = results.get('followers') or {}
    return {
        'artist_id': artist_id,
        'artist': results.get('name'),
        'image': images[0].get('url') if images else '',
        'genres': str(results.get('genres')),
        'popularity': results.get('popularity'),
        'link': results.get('href'),
        'followers': followers.get('total'),
    }

In [116]:
# Unique artist IDs in order of first appearance. A set() of strings has no
# stable iteration order between interpreter runs, which made the row order of
# the artist table differ from run to run. Missing IDs are dropped because they
# cannot be looked up.
artists = df_songs['artist_id'].dropna().unique().tolist()

In [112]:
# One details dict per unique artist; get_artist_details makes one sp.artist
# call per ID.
artist_data = [get_artist_details(artist_id) for artist_id in artists]

In [113]:
# One row per artist; the columns come from the keys of the details dicts.
df_artists = pd.DataFrame(artist_data)

In [115]:
# NOTE(review): unlike the audio-features export, this call does not pass
# index=False, so the DataFrame index is written as an extra leading column —
# confirm that readers of artists.csv expect it.
df_artists.to_csv('../data/artists.csv')

### Artist Image Downloads

In [120]:
#Code taken from - https://towardsdatascience.com/how-to-download-an-image-using-python-38a75cfa21c    
def get_image(image_url, filename):
    """Download ``image_url`` and write the response body to ``filename``.

    Parameters
    ----------
    image_url : str
        URL of the image.
    filename : str
        Destination path; opened in binary write mode.

    Returns
    -------
    None
        On a non-200 status nothing is written and a message is printed.

    Raises
    ------
    requests.exceptions.RequestException
        For connection problems, including a request that exceeds the
        30-second timeout.
    """
    # stream=True lets the body be copied to disk without loading it into
    # memory first. The context manager releases the connection even when the
    # copy fails, and the timeout (same value the Spotify client uses) keeps a
    # stalled download from blocking the whole loop.
    with requests.get(image_url, stream=True, timeout=30) as r:
        # Check if the image was retrieved successfully
        if r.status_code == 200:
            # Have the raw stream undo any content-encoding so the bytes
            # written to disk are the image itself.
            r.raw.decode_content = True

            # Open a local file with wb ( write binary ) permission.
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            print('Image Couldn\'t be retreived', filename)

In [128]:
# Index the artist table by artist_id. The guard makes the cell safe to re-run:
# once the column has become the index, a second set_index would raise KeyError.
if df_artists.index.name != 'artist_id':
    df_artists = df_artists.set_index('artist_id')

In [142]:
# Download every artist image to ../images/artists/<artist_id>.png.
# Iterating the 'image' column directly avoids the chained
# df_artists.loc[index]['image'] lookup on every row.
for artist_id, link in df_artists['image'].items():
    # Artists without an image carry '' here; the isinstance check also skips
    # non-string values (e.g. NaN), on which `'http' in link` would raise.
    if isinstance(link, str) and 'http' in link:
        path = '../images/artists/' + artist_id + '.png'
        get_image(link, path)

In [202]:
#taken from StackOverflow https://stackoverflow.com/questions/51486297/cropping-an-image-in-a-circular-way-using-python
import numpy as np
from PIL import Image, ImageDraw


def crop_center(pil_img, crop_width, crop_height):
    """Return the centred ``crop_width`` x ``crop_height`` region of ``pil_img``.

    ``pil_img`` must expose ``size`` as ``(width, height)`` and a ``crop``
    method taking a ``(left, upper, right, lower)`` box; the value returned by
    ``crop`` is passed through unchanged.
    """
    full_width, full_height = pil_img.size
    left = (full_width - crop_width) // 2
    upper = (full_height - crop_height) // 2
    right = (full_width + crop_width) // 2
    lower = (full_height + crop_height) // 2
    return pil_img.crop((left, upper, right, lower))

def crop_max_square(pil_img):
    """Return the largest centred square that fits inside ``pil_img``."""
    # The square's side is the shorter of the two image dimensions.
    side = min(pil_img.size)
    return crop_center(pil_img, side, side)

def crop_image(image_path, new_path):
    """Save a circular cut-out of the image at ``image_path`` to ``new_path``.

    The image is converted to RGB, cropped to its largest centred square and
    given an alpha channel that is 255 inside the inscribed circle and 0
    outside it. The resulting four-channel image is written to ``new_path``.

    Parameters
    ----------
    image_path : str
        Path of the source image.
    new_path : str
        Destination path for the cropped image.
    """
    # Close the source file as soon as the pixels are loaded; previously the
    # handle was left open until garbage collection, which matters when this
    # runs over a whole directory.
    with Image.open(image_path) as source:
        img = crop_max_square(source.convert("RGB"))

    # PIL reports size as (width, height).
    width, height = img.size
    # Single-channel mask: 0 everywhere, 255 inside the full 0-360 degree slice.
    alpha = Image.new('L', img.size, 0)
    draw = ImageDraw.Draw(alpha)
    draw.pieslice([0, 0, width, height], 0, 360, fill=255)
    # Append the mask as a fourth channel behind the three colour channels.
    rgba = np.dstack((np.array(img), np.array(alpha)))
    Image.fromarray(rgba).save(new_path)

In [203]:
import os

# Crop every downloaded artist image into ../images/cropped_artists, keeping
# the file name.
cropped_dir = '../images/cropped_artists'
# Saving fails if the target folder does not exist yet.
os.makedirs(cropped_dir, exist_ok=True)

# Only the .png files written by the download step; other directory entries
# (hidden files, sub-folders) cannot be opened as images.
files = [file for file in os.listdir('../images/artists') if file.endswith('.png')]
file_paths = ['../images/artists/' + file for file in files]

for path in file_paths:
    # basename() replaces the former path[18:] slice, which depended on the
    # exact length of the source-directory prefix.
    new_path = cropped_dir + '/' + os.path.basename(path)
    crop_image(path, new_path)

In [182]:
# One-off run of the cropping helper on a single artist image; the result is
# written to testing.png in the working directory.
crop_image(image_path='../images/artists/0DchahWJGQqrqr8PMM5zQD.png', new_path='testing.png')

In [168]:
# NOTE(review): in the visible cells `h` and `w` are only assigned inside
# crop_image(), so this leftover debugging cell would raise NameError on a
# fresh kernel unless they are defined elsewhere — confirm whether it can go.
print(min(h,w))

1000
