## Creating the Optimal Gov's Ball Scedule

In [3]:
#Bring in the neccessary libraries 
import pandas as pd
from io import StringIO
import urllib
from bs4 import BeautifulSoup 
import sqlite3 
import time
import numpy 
import signal
from urllib.request import Request, urlopen
from IPython.display import clear_output
import numpy as np

In [4]:
# Load package for progress bar
%run "progress_bar.py"
print ('Progress Bar Loaded')

Progress Bar Loaded


### Get Gov's Ball Artist Info

Could also look at Bonaroo and Lollapolloozas schedules

In [None]:
#Grab the artist info from the Gov's ball website
with urllib.request.urlopen("https://www.governorsballmusicfestival.com/lineup/interactive-lineup/") as url:
    s = url.read()
soup = BeautifulSoup(s, "lxml")


In [None]:
info = soup.findAll('div', {"class":"c-lineup__artist"})
#Put those names in a list, then a dataframe
govs_ball_data = pd.DataFrame([])
for i in info:
    artist = i.attrs['data-title']
    date = i.attrs["data-day-titles"]
    govs_ball_data = govs_ball_data.append(pd.DataFrame({'artist': artist, #Create the table
                                                         'date'  : date},
                                                  index=[0]),
                                     ignore_index=True)
govs_ball_data.head()

In [None]:
govs_ball_data["day"] = ((govs_ball_data["date"].str.slice(-5, -4)).astype(int) -1).astype(str)
govs_ball_data["date"] = govs_ball_data["date"].str.slice(2, -2)
govs_ball_data

#### Add Names to a SQLite Database

In [None]:
con = sqlite3.connect("pitchfork-data.db")

In [None]:
govs_ball_data.to_sql("govs_ball_data", con,if_exists='replace')
con.close() #close db connection

## Pitchfork Crawler

The crawling happens in two distinct stages. In Stage 1, the code loops throught the reviews page on pitchfork.com to find links to all the reviews. Stage 2 goes to each link and pull various bits of information. There's lots more to pull, but this is a solid starting place. 

### Stage 1: 

In [55]:
#Stage 1
con = sqlite3.connect("pitchfork-data.db") #connect to db

for i in log_progress(range(0,13), every=1): 
    #Use the range function to decide how many pages you want to go through
    #In this case, I'm going through the latest 100 reviews (12 per page)
    page_no = str(i)
    link = ('http://pitchfork.com/reviews/albums/?page=' + page_no) #create the link
    t0 = time.time()
    req = Request(link, headers={ 'User-Agent': 'Firefox/24.0' })
    webpage = urlopen(req).read()
    response_delay = time.time() - t0
    time.sleep(10*response_delay)  # wait 10x longer than it took them to respond
    soup = BeautifulSoup(webpage, "lxml") #create the soup
    info = soup.findAll('a', {"class":"album-link"}) #pull the album link
    for j in info:
            link = pd.DataFrame({'link': j.attrs['href']}, #Create the table
                         index=[0]).to_sql("link_table",
                                           con,
                                           if_exists = "append")
    clear_output() #clear ouput before rewriting progress
    print (i)

con.close() #close db connection

12


### Stage 2:

In [56]:
BASE_URL = 'http://www.pitchfork.com'
con = sqlite3.connect("pitchfork-data.db")
links_table = pd.read_sql_query("SELECT DISTINCT * from link_table", con)
links = links_table["link"]
iterator = 0

class Timeout(Exception):  # handles timeout errors (e.g., server request is taking too long)
    pass

for i in log_progress(links[-156:], every=1):
    link = BASE_URL + i
    t0 = time.time()
    req = Request(link, headers={ 'User-Agent': 'Firefox/24.0' })
    webpage = urlopen(req).read()
    response_delay = time.time() - t0
    time.sleep(4*response_delay)  # wait Nx longer than it took them to respond
    soup = BeautifulSoup(webpage, "lxml") #same as above
    artist_info = soup.findAll('ul', {"class":"artist-links artist-list"}) #Artist Name
    album_info = soup.findAll('h1', {"class":"review-title"}) #Album Name
    score_info = soup.findAll('div', {"class":"score-circle"}) #Score
    pub_info = soup.findAll('span', {"class":"pub-date"}) # Publication Date
    genre_info = soup.findAll('ul', {"class":"genre-list before"}) #Genre
    for j in artist_info:
        artist = j.text
    for k in album_info:
        album = k.text
    for l in score_info:
        score = l.text
    for m in pub_info:
        pub_date = m.text
    for n in genre_info:
        genre = n.text
    clear_output()
    print (artist, ", ", album, response_delay, iterator)
    pd.DataFrame({'artist': artist, #Create the table
                  'album'  : album,
                  'score' : score,
                  'pub_date' : pub_date,
                  'genre' : genre,
                  'link' : link},
                 index=[0]).to_sql("album_table",
                                   con,
                                   if_exists = "append")
    iterator= iterator +1

con.close() 

Tennis ,  Yours Conditionally 0.5886037349700928 155


## Data Management

In [57]:
con = sqlite3.connect("pitchfork-data.db")
#Pull the table we just wrote back it
album_table = pd.read_sql_query("SELECT * from album_table", con)
#Drop any duplicates that may have happened
album_table = album_table.drop_duplicates()
#Delete the index
del album_table["index"]
#reupload as album_table_clean
album_table.to_sql("album_table_clean", con, if_exists = "replace")

#### Cleaning Some Data

In [58]:
album_table = pd.read_sql_query("SELECT * from album_table_clean", con)
album_table["artist"] = album_table["artist"].str.upper()

In [59]:
artists = album_table["artist"].drop_duplicates()

In [60]:
gb_data = pd.read_sql_query("SELECT * from govs_ball_data", con)
gb_data["artist"] = gb_data["artist"].str.upper()
gb_data.append([{"index":"11",
                 "artist":"KEVIN PARKER",
                 "date":"Saturday, June 3rd",
                 "day":"2"}])
gb_data[~gb_data.artist.isin(artists)].shape


(28, 4)

In [61]:
gb_data["artist"] = np.where(gb_data["artist"] == "CHARLES BRADLEY & HIS EXTRAORDINAIRES",
                            "CHARLES BRADLEY",
                            gb_data["artist"])
gb_data["artist"] = np.where(gb_data["artist"] == "MARK RONSON VS KEVIN PARKER",
                            "MARK RONSON",
                            gb_data["artist"])

gb_data[~gb_data.artist.isin(artists)].shape
#x number of artists missing

(26, 4)

## Spotify API

In [62]:
import spotipy
import sys
import json

In [63]:
#Feed the Gov's Ball Artists into Spotify
spotify = spotipy.Spotify()#Create spotify object from spotipy 

count = 0
for name in log_progress(gb_data["artist"], every = 1):
    search = spotify.search(q='artist:' + name, type='artist') #Search and grab the first result
    spot_name = pd.read_json(json.dumps(search["artists"]))["items"][0]["name"] #Name for reference
    popularity = pd.read_json(json.dumps(search["artists"]))["items"][0]["popularity"] #Popularity Metric
    genre = pd.read_json(json.dumps(search["artists"]))["items"][0]["genres"] #Genre list
    followers = pd.read_json(json.dumps(search["artists"]))["items"][0]["followers"]["total"] #Social Followers
    if len(genre) > 0:
        genre = "/".join(genre)
    else:
        genre = "Unknown"
    #Add that dat to the gov's ball table
    gb_data.loc[gb_data['artist'] == name, 'followers'] = followers
    gb_data.loc[gb_data['artist'] == name, 'genre'] = genre
    gb_data.loc[gb_data['artist'] == name, 'popularity'] = popularity
    count = count+1


In [64]:
#reupload as govs_ball_enriched
del gb_data["index"]
gb_data.to_sql("govs_ball_enriched", con, if_exists = "replace")

### Merge in Pitchfork Reviews

In [65]:
query = """
        SELECT g.artist,
               p.album, p.score, p.genre 
        FROM govs_ball_enriched g 
        INNER JOIN album_table_clean p 
        ON (upper(p.artist) = g.artist)
        """


In [66]:
pitch_gov = pd.read_sql_query(query, con)
pitch_gov["score"] = pitch_gov["score"].astype(float)

In [67]:
score_metrics = pitch_gov.groupby(["artist", "genre"])["score"].agg(['mean', "max","min", 'count' ]).reset_index()
score_metrics.columns = [["artist", "pf_genre", "pf_mean", "pf_max", "pf_min", "pf_count"]]
score_metrics = round(score_metrics,1)
score_metrics.head()

Unnamed: 0,artist,pf_genre,pf_mean,pf_max,pf_min,pf_count
0,A$AP FERG,Rap,7.1,7.5,6.4,3
1,AIR,Electronic,5.9,7.8,4.0,6
2,BANKS,Pop/R&B,5.0,5.1,5.0,2
3,BEACH HOUSE,Rock,8.5,9.1,8.1,6
4,CAR SEAT HEADREST,Rock,8.3,8.5,8.1,2


In [80]:
#write back to the data
con = sqlite3.connect("pitchfork-data.db")
gb = pd.read_sql_query("SELECT * FROM govs_ball_enriched", con)
gbe = gb.merge(score_metrics, on = "artist", how = "left")
gbe["genre"] = np.where(gbe["pf_genre"].isnull(), gbe["genre"], gbe["pf_genre"])
del gbe["pf_genre"]
#Clean up Genre Row
gbe["genre"] = np.where(gbe["genre"].isin(["Rap", "Pop/R&B", "Rock", "Electronic", "Metal"]),
                  gbe["genre"],
                  np.where(gbe["genre"].str.contains("hip hop"),
                     "Rap",
                     np.where(gbe["genre"].str.contains("RockElectronic"),
                        "Rock",
                        np.where(gbe["genre"].str.contains("rock"),
                           "Rock",
                           np.where(gbe["genre"].str.contains("pop"),
                              "Pop/R&B",
                              np.where(gbe["genre"].str.contains("electronic"),
                                 "Electronic",
                                 np.where(gbe["genre"].str.contains("electronic"),
                                    "Electronic",
                                    "Rock")))))))

gbe["genre"] = np.where(gbe["genre"].isin(["Rock"]),
                        "Rock/Indie",
                        gbe["genre"])
con.close()
gbe["popularity_rank"] = gbe["popularity"].rank(ascending = False)
gbe["fan_base_rank"] = gbe["followers"].rank(ascending = False)
#Bump up artisits who have been reviewed several times
gbe["critical_rank"] = (gbe["pf_mean"]+(gbe["pf_count"])).rank(ascending = False)
gbe["critical_rank"] = np.where(gbe["critical_rank"].isnull(),
                                gbe["critical_rank"].mean(),
                                gbe["critical_rank"])
gbe["total_ranking_score"] = gbe.iloc[:,-3:].sum(axis = 1)
gbe["total_rank"] = gbe["total_ranking_score"].rank(ascending = True)
gbe.sort_values("total_rank")

Unnamed: 0,index,artist,date,day,followers,genre,popularity,pf_mean,pf_max,pf_min,pf_count,popularity_rank,fan_base_rank,critical_rank,total_ranking_score,total_rank
7,7,WIZ KHALIFA,"Sunday, June 4th",3,4345424.0,Rap,86.0,6.0,7.2,4.8,6.0,1.0,1.0,6.0,8.0,1.0
1,1,CHANCE THE RAPPER,"Friday, June 2nd",1,1373092.0,Rap,84.0,8.8,9.1,8.4,2.0,3.0,5.0,13.0,21.0,2.0
13,13,SCHOOLBOY Q,"Friday, June 2nd",1,1022789.0,Rap,79.0,8.2,8.4,7.8,3.0,10.0,7.0,10.0,27.0,3.0
16,16,RAE SREMMURD,"Saturday, June 3rd",2,1460568.0,Rap,83.0,7.7,7.8,7.6,2.0,4.0,4.0,20.0,28.0,4.0
2,2,PHOENIX,"Saturday, June 3rd",2,1060156.0,Rock/Indie,66.0,7.2,8.5,6.0,6.0,28.0,6.0,2.0,36.0,5.5
4,4,LORDE,"Friday, June 2nd",1,2080313.0,Pop/R&B,82.0,7.3,7.3,7.3,1.0,5.0,2.0,29.0,36.0,5.5
5,5,FLUME,"Friday, June 2nd",1,1016864.0,Electronic,77.0,6.5,7.4,5.8,4.0,13.0,8.0,17.0,38.0,7.0
27,27,YG,"Saturday, June 3rd",2,641920.0,Rap,78.0,7.8,8.1,7.2,3.0,11.5,15.0,13.0,39.5,8.0
18,18,TOVE LO,"Friday, June 2nd",1,921050.0,Pop/R&B,80.0,7.1,7.2,7.0,2.0,8.0,10.0,24.0,42.0,9.0
29,29,CHARLI XCX,"Friday, June 2nd",1,564918.0,Pop/R&B,74.0,6.3,8.3,4.5,5.0,16.5,17.0,9.0,42.5,10.0
