In [1]:
#############################
#          Imports          #
#############################

import os
import json
import spotipy
import numpy as np
import spotipy.util as util
import re
import pandas as pd
from bs4 import BeautifulSoup
from json.decoder import JSONDecodeError
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords


In [2]:
#setting up some of the needed objects for Spotify API
username = 'xxxxxx'
my_client_id = 'xxxxxx'
my_secret = 'xxxxxx'
my_redirect_uri = 'xxxxxx'
spotify = spotipy.Spotify()
scope = 'user-library-read'

#User permission
try:
    token = util.prompt_for_user_token(username, scope, my_client_id,
                                       my_secret, redirect_uri = my_redirect_uri)
except:
    os.remove(f".cache-{username}")
    token = util.prompt_for_user_token(username)
    
#getting authorization
if token:
    spotify_object = spotipy.Spotify(auth = token)
    print('Successfully authenticated')
else:
    print('Cannot get token for', username)

#saving user as an object
user = spotify_object.current_user()

#obtaining display name and follower count values for printing purposes
displayName = user['display_name']
follower_count = user['followers']['total']
artist_name =  'Frank Ocean'


Successfully authenticated


In [3]:
print(">>> Hello " + displayName + "!")
search_results = spotify_object.search(artist_name, 1, 0, "artist")
artist = search_results['artists']['items'][0]
print('Artist Name: ',artist['name'])
print('Genres:',artist['genres'][0])
#print('Genres:',artist['genres'][0].replace(' ', ', '))
print()
artistID = artist['id']

#albums and tracks
trackURIs = []

album_results = spotify_object.artist_albums(artistID)
album_results = album_results['items']

for album in album_results:
    albumID = album['id']
    #extracting tracks
    track_results = spotify_object.album_tracks(albumID)
    track_results = track_results['items']
    #nested iteration to add track uri to master list
    for song in track_results:
        #only appending songs where the artist is the main singer; i.e. first person listed as the artist, not feature
        if (artist_name in song['artists'][0]['name']):
            trackURIs.append(song['uri'])
    
    #if you want to include artists' collaborations and features, uncomment
    #for song in track_results:
        #for song_artist in song['artists']:
            #if song_artist['name'] == artist_name:
                #trackURIs.append(song['uri'])

>>> Hello Yoon Sung Hong!
Artist Name:  Frank Ocean
Genres: hip hop



In [4]:
#With Track URIs, getting song names
#Initializing trackNames list
trackNames = []
for URI in trackURIs:
    track = spotify_object.track(URI)
    trackNames.append(track['name'])

In [5]:
#########################
# Using the Genius API  #
#########################
YOUR_CLIENT_ID = 'xxxxxx'
genius_token = 'xxxxxx'
base_url = "http://api.genius.com"
search_url = base_url + "/search"
headers = {'Authorization': 'Bearer ' + genius_token}
def get_song_info(url, list_songs, headers):
    songs = []
    for song in list_songs:
        data = {'q': song}
        response = requests.get(url, params=data, headers=headers)
        json = response.json()
        #initializing song info
        song_info = None
        for hit in json["response"]["hits"]:
            #verifying the search result and its artist is the same as the artist of our choice (i.e. Frank Ocean)
            if hit["result"]["primary_artist"]["name"] == artist_name:
                song_info = hit
                break
        if song_info:
            #verifying that the song exists in Genius. If it exists, adds on to the list.
            songs.append(song_info)
    return songs
songs = get_song_info(search_url, trackNames, headers)

In [6]:
#finding the genius urls for all songs
#initializing a dictionary
url_dict = {}
genius_url = []
for song in songs:
    url_extension = song['result']['path']
    url = 'https://genius.com' + url_extension
    genius_url.append(url)
    url_dict[song['result']['title']] = url

In [7]:
#getting lyrics
lyrics_dict = {}
for URL in genius_url:
    page = requests.get(URL)
    html = BeautifulSoup(page.text, "html.parser") # Extract the page's HTML as a string
    # Scrape the song lyrics from the HTML
    lyrics = html.find("div", class_="lyrics").get_text()
    lyrics_dict[URL] = lyrics

In [8]:
#storing every alphabet for later purposes
english = "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
#initializing an array
master_array = np.array([])
#initializing a master string for later purposes
master_string = ""
#initializing a dictionary for unique words count for each song (to be used in exploratory analysis later)
song_unique_dict = {}
#iterating through each song (and its URL) and using regex expression to clean the lyrics data
for URL in genius_url:
    pretty = lyrics_dict[URL].replace('\n', ' ')
    pretty = re.sub(r'\[.*?\]', '', pretty)
    pretty = re.sub(r'\s+', ' ', pretty)
    pretty = re.sub(r'"', '', pretty)
    pretty = re.sub(r'\(', '', pretty)
    pretty = re.sub(r'\)', '', pretty)
    pretty = re.sub(r',', '', pretty)
    pretty = re.sub(r'\?', '', pretty)
    pretty = re.sub(r"(you\'.+?.)", '', pretty) #regex format to filter out you've, you'll, etc.
    pretty = pretty.replace('\'', '’')
    word_list = pretty.lower().split()
    #filtering only english lyrics (one of the songs, Nikes, contains Japanese lyrics)
    is_english = [word[0] in english for word in word_list] 
    word_list = np.array(word_list)
    #recording the number of unique words used per song 
    song_unique_dict[URL] = len(np.unique(word_list)) #may exclude some of the stop words
    #appending the song lyrics to master array 
    master_array = np.append(master_array, word_list)
#checking if the words are stopwords
is_stopword = (np.isin(master_array, stopwords.words('english')) == False)
#subsetting only those words that are not stopwords(according to nltk pkg)
master_array = master_array[is_stopword]

array(['these', 'bitches', 'want', ..., 'good', 'don’t', 'die'],
      dtype='<U32')

In [11]:
unique, counts = np.unique(master_array, return_counts=True)
word_count = dict(zip(unique, counts))

In [17]:
#displaying the dictionary in descending order of word counts
word_count_ordered = []
for i in sorted(word_count, key=word_count.get, reverse=True):
    word_count_ordered.append(i + ": " + str(word_count[i]))
#number of different words used
len(word_count_ordered)

2760

In [None]:
#average number of unique words used
unique_mean = np.mean(list(song_unique_dict.values()))
#maximum number of unique words used
unique_max = np.max(list(song_unique_dict.values()))
print("Average # of words:", unique_mean, "| "
     "Maximum # of words:", unique_max)
#distribution of unique words used in each song
sns.distplot(list(song_unique_dict.values()), bins = 10)

In [None]:
#distribution of the word count (number of times a word was used in all Frank Ocean's songs)
sns.distplot(list(word_count.values()), bins = 10) #excludes stop words

In [None]:
#creating a new unique words count dictionary, but with song title instead of URL as key this time
song_unique_dict_new = {'song title': list(url_dict.keys()), 'unique words count': list(song_unique_dict.values())}
#converting dictionary into data frame
song_unique_df = pd.DataFrame(data = song_unique_dict_new)
#ordering the data frame by unique words count, descending. displaying first 20 songs
song_unique_df.sort_values(by='unique words count', ascending=False).head(20)

In [None]:
#############################
#  Named Entity Recognition #
#############################
#using stanford's NER tool
from nltk.tag.stanford import StanfordNERTagger
st = StanfordNERTagger('/Users/yoonsunghong/stanford-corenlp/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
               '/Users/yoonsunghong/stanford-corenlp/stanford-ner/stanford-ner.jar')
#st.tag()