## WRUV Broadcasting Dataset Project
Authors: Sydney White and Zachary Hayes

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as plt

from bs4 import BeautifulSoup
import requests

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

### Data Collection, Cleaning, and Feature Engineering

In [2]:
# Collect data CSVs into a single dataframe
dfs = []
directory = "dataset" 
for file in os.listdir(directory):
    dfs.append(pd.read_csv(f'{directory}/{file}'))
df = pd.concat(dfs, axis=0)
df.sort_values("Date-time", inplace=True, ignore_index=True)
# Drop columns that we already know we don't need
df.drop(["Playlist Category", "Playlist Duration", "DJ Name", "DJ Email", "Date-time", "Composer"], axis=1, inplace=True)
df

Unnamed: 0,Playlist Title,DJ ID,Date,Time,Artist,Song,Release,New,Local,Genre,Label,UPC
0,Our Intent is All for Your Delight,64617,"Sep 23, 2019",8:04:12 AM,Khruangbin,People Everywhere (Still Alive),The Universe Smiles Upon You,,,Rock,Night Time Stories,4523132552217.0
1,Our Intent is All for Your Delight,64617,"Sep 23, 2019",8:07:21 AM,The Bubs,Golden Thread,Golden Thread,,,Rock,The Bubs,859732310242.0
2,Our Intent is All for Your Delight,64617,"Sep 23, 2019",8:11:38 AM,Y La Bamba,Rios Sueltos,Entre Los Dos,N,,Rock,Tender Loving Empire,703669152959.0
3,Our Intent is All for Your Delight,64617,"Sep 23, 2019",8:15:42 AM,Allah-Las,Prazer Em Te Conhecer,LAHS,,,Rock,Mexican Summer,184923126068.0
4,Our Intent is All for Your Delight,64617,"Sep 23, 2019",8:20:57 AM,Devendra Banhart,My Boyfriend's in the Band,Ma,,,Rock,Nonesuch,75597924053.0
...,...,...,...,...,...,...,...,...,...,...,...,...
284735,Long Distance Runner,160029,"Mar 22, 2024",4:05:08 PM,Bill Laswell,Buhala,Means of Deliverance,,,Jazz,Innerhythmic,0804699102423
284736,Long Distance Runner,160029,"Mar 22, 2024",4:08:25 PM,Remy Le Boeuf,Little Song,Heartland Radio,,,Jazz,ORCHARD - SoundSpore Records,0197190669947
284737,Long Distance Runner,160029,"Mar 22, 2024",4:14:10 PM,strongboi,fool around,fool around,,,,strongboi,5059449065564
284738,Long Distance Runner,160029,"Mar 22, 2024",4:18:52 PM,Miya Folick,Bad Thing,Bad Thing,,,,Nettwerk Music Group,0067003373463


In [3]:
# Count null values
df.isna().sum()


Playlist Title         0
DJ ID                  0
Date                   0
Time                   0
Artist                 4
Song                   2
Release               50
New               252062
Local             281156
Genre              97305
Label               1137
UPC                28232
dtype: int64

In [4]:
# Spins which are missing a UPC account for approximatly 10% of our dataset.
# We must have a UPC in order to identify other missing attributes of our data.
# Therefore, we need a way to determine this UPC

In [None]:
# function to scrape billboard site given artist name 
def web_scrape_bb(artist_name):
    # convert artist name to - if spaces
    artist_name = artist_name.lower()
    artist_name = artist_name.replace(' ', '-')
    base = 'https://www.billboard.com/artist/'
    url = base + artist_name
    # need to add in try/excepts for if the artist is too obscure to have a billboard page 
    page = requests.get(url)
    if page.reason == "Not Found":
        return 0
    else:
        soup = BeautifulSoup(page.content, "html.parser")
        num2 = soup.find_all("div", class_="lrv-u-background-color-brand-accent-yellow lrv-u-height-100p lrv-u-flex lrv-u-flex-direction-column lrv-u-justify-content-center lrv-u-align-items-center u-padding-tb-075" )
        results = soup.find_all("span", class_="c-span a-font-primary-bold u-font-size-34 u-line-height-120 u-letter-spacing-0063 artist-stat-3")
        if len(results) < 1: 
            return 0 
        #print(results[0].text)
        hits = results[0].text.strip()
    
    
        # if statement below is an attempt to get the songs that have been on the billboard top 100 (to see if there is a direct policy violation)
        # unfortunately, these tables are populated with javascript and difficult to scrape 
        #if hits != '0':
        #    url = url + "/chart-history/hsi"
        #    page = requests.get(url)
        #    soup = BeautifulSoup(page.content, "html.parser")
        #    songs = soup.find_all()
        
        # NOTE: currently coded to write flag or has or has not charted by artist not song 
        if hits > 1:
            return 1
        return 0


'''
hits = []

for i in range(len(df)):
    hits.append(web_scrape_bb(df['Artist'].loc[i]))
df['billboards hits'] = hits
df
'''
    

In [7]:
# function to interface with Spotify Web API app (created with id and secret below) STILL WIP!
def spotify_connect():
    # technically insecure to have client secret displayed like this (environment variable)
    CLIENT_ID = '344d3b062e344710a5bdb8427358a31d'
    CLIENT_SECRET = '9f886dde51184f989b1aff4f5ffb21f8'
    AUTH_URL = 'https://accounts.spotify.com/api/token'

    auth_manager = SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        requests_session=True)

    sp = spotipy.Spotify(auth_manager=auth_manager)
    # this is our connection which we will need for any future query 
    return sp

def spotify_search_artist(sp, artist_name):
    # spotify queries are all based on Spotify URI IDs and usually return in JSON
    results = sp.search(q='artist:' + artist_name, type='artist')
    genres = results['artists']['items'][0]['genres'] # for top result (in this case accurate, but probably not always)
    id = results['artists']['items'][0]['id']

    # TODO: return or add to database what is useful here 

def spotify_search_song(sp, song_name):
    # when i search california, it gives me teenage dream by katy Perry so not sure how well this works
    results = sp.search(q='track:' + song_name, type='track', limit=10)
    results_title = results['tracks']['items'][0]['name']
    results_album = results['tracks']['items'][0]['album']['name']
    results_artist = results['tracks']['items'][0]['artist']['name']
    # then if this results title matches our song name and the artist matches our artist, find the audio features and add them (by spotify uri id for song)

    uri = results['tracks']['items'][0]['uri'] 
    # TODO: return or add to database whatever is useful 
    print(f'{results_title} from {results_album} by {results_artist}. URI: {uri}')
    
token = spotify_connect()
# note: Radiohead was in the example online, I did not think oh yes, Radiohead lol
spotify_search_artist(token, 'Radiohead')
spotify_search_song(token, 'California')

KeyError: 'artist'