## Creating the Optimal Gov's Ball Schedule

In [None]:
#Bring in the necessary libraries
import signal
import sqlite3
import time
import urllib
import urllib.request
from io import StringIO
from urllib.request import Request, urlopen

import numpy
import pandas as pd
from bs4 import BeautifulSoup

### Get Gov's Ball Artist Info

In [None]:
#Download the interactive lineup page from the Gov's Ball website and parse it
LINEUP_URL = "https://www.governorsballmusicfestival.com/lineup/interactive-lineup/"

with urllib.request.urlopen(LINEUP_URL) as response:
    lineup_html = response.read()

soup = BeautifulSoup(lineup_html, "lxml")

#Every artist on the page is a <div> carrying the c-lineup__artist class
info = soup.find_all('div', attrs={"class":"c-lineup__artist"})

In [None]:
#Read each artist's name from the data-title attribute of its lineup <div>
names = [artist_div.attrs['data-title'] for artist_div in info]

#Name the column in the constructor instead of assigning .columns afterwards:
#the result is the same one-column dataframe, but an empty scrape now yields an
#empty "artist" column rather than a length-mismatch error.
names_df = pd.DataFrame(names, columns=["artist"])
names_df.head()

#### Add Names to a SQLite Database

In [207]:
#Open the SQLite database file that the scraped tables are written to
DB_PATH = "pitchfork-data.db"
con = sqlite3.connect(DB_PATH)

In [208]:
#Write the lineup to the govs_ball_artists table, replacing any earlier copy
names_df.to_sql(name="govs_ball_artists", con=con, if_exists='replace')

## Pitchfork Crawler

The crawling happens in two distinct stages. In Stage 1, the code loops through the reviews pages on pitchfork.com to find links to all the reviews. Stage 2 goes to each link and pulls various bits of information. There's lots more to pull, but this is a solid starting place. 

In [209]:
#Stage 1: walk the paginated review index and collect the link to every album review
links = [] #Relative review URLs, filled in page by page
AVERAGE_SECONDS_BETWEEN_REQUESTS = 1 #Don't go too hard on Pitchfork's servers
FIRST_PAGE = 1
LAST_PAGE = 499 #Change this to decide how many index pages you want to go through

for page_no in range(FIRST_PAGE, LAST_PAGE + 1):
    page_url = 'http://pitchfork.com/reviews/albums/?page=' + str(page_no) #create the link
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'}) #Mask the bot
    with urlopen(req) as response: #close the connection once the page has been read
        webpage = response.read()
    soup = BeautifulSoup(webpage, "lxml") #create the soup
    #Each review on an index page is an <a class="album-link">; keep its href
    links.extend(anchor.attrs['href'] for anchor in soup.find_all('a', {"class":"album-link"}))
    #Pause between server requests. The delay is drawn as a single scalar (no size
    #argument): asking for size=1 returns a one-element array, and time.sleep
    #expects a plain number.
    time.sleep(numpy.random.exponential(AVERAGE_SECONDS_BETWEEN_REQUESTS))

links

['/reviews/albums/22936-the-tourist/',
 '/reviews/albums/22003-planetary-prince/',
 '/reviews/albums/22939-machine-response/',
 '/reviews/albums/22878-various-artists-outro-tempo-electronic-and-contemporary-music-from-brazil-1978-1992/',
 '/reviews/albums/22978-hndrxx/',
 '/reviews/albums/22920-buried-wish/',
 '/reviews/albums/22938-hopes-of-failure/',
 '/reviews/albums/22979-youngish-american/',
 '/reviews/albums/22921-vagabon-infinite-worlds/',
 '/reviews/albums/22927-the-iceberg/',
 '/reviews/albums/22929-man-vs-sofa/',
 '/reviews/albums/22935-headnod-suite/',
 '/reviews/albums/22891-power-trip-nightmare-logic/',
 '/reviews/albums/22926-sick-scenes/',
 '/reviews/albums/22889-why-love-now/',
 '/reviews/albums/22893-forget/',
 '/reviews/albums/22924-common-as-light-and-love-are-red-valleys-of-blood/',
 '/reviews/albums/22917-gang-signs-prayer/',
 '/reviews/albums/22919-book-of-changes/',
 '/reviews/albums/22928-burning-the-threshold/',
 '/reviews/albums/22965-flume-skin-companion-ep-i

In [211]:
#Stage 2: visit every review link and pull the album details into one dataframe
BASE_URL = 'http://www.pitchfork.com'


def _last_text(matches):
    """Return the text of the last matched tag, or None when nothing matched.

    Taking the last match mirrors looping over the matches and keeping the final
    value; returning None for an empty result keeps a missing field from silently
    inheriting the previous album's value.
    """
    return matches[-1].text if matches else None


album_rows = [] #One dict per review; converted to a dataframe once the crawl is done

for review_path in links:
    req = Request(BASE_URL + review_path, headers={'User-Agent': 'Mozilla/5.0'}) #Mask the bot
    with urlopen(req) as response: #close the connection once the page has been read
        webpage = response.read()
    soup = BeautifulSoup(webpage, "lxml")
    album_rows.append({
        'album': _last_text(soup.find_all('h1', {"class":"review-title"})), #Album Name
        'artist': _last_text(soup.find_all('ul', {"class":"artist-links artist-list"})), #Artist Name
        'genre': _last_text(soup.find_all('ul', {"class":"genre-list before"})), #Genre
        'pub_date': _last_text(soup.find_all('span', {"class":"pub-date"})), #Publication Date
        'score': _last_text(soup.find_all('div', {"class":"score-circle"})), #Score (kept as text)
    })
    #No pause between requests in this stage; to throttle the crawl, uncomment:
    #time.sleep(numpy.random.exponential(AVERAGE_SECONDS_BETWEEN_REQUESTS))

#Build the table in one go instead of appending a one-row dataframe per review.
#The explicit column list fixes the column order and keeps the columns present
#even when links is empty.
album_table = pd.DataFrame(album_rows, columns=['album', 'artist', 'genre', 'pub_date', 'score'])

album_table

Unnamed: 0,album,artist,genre,pub_date,score
0,The Tourist,Clap Your Hands Say Yeah,Rock,14 hrs ago,7.5
1,Planetary Prince,Cameron Graves,Jazz,14 hrs ago,7.7
2,Machine Response,Career Suicide,Rock,14 hrs ago,7.5
3,Outro Tempo: Electronic and Contemporary Music...,Various Artists,Rock,14 hrs ago,8.8
4,HNDRXX,Future,Rap,March 3 2017,7.8
5,Buried Wish,PC Worship,Rock,March 3 2017,7.3
6,Hopes of Failure,Aseethe,Rock,March 3 2017,6.0
7,Youngish American,Dams of the West,Rock,March 3 2017,4.6
8,Infinite Worlds,Vagabon,Rock,March 2 2017,8.5
9,The Iceberg,Oddisee,Rap,March 2 2017,7.0


In [212]:
#Write the scraped reviews to the album_table table, replacing any earlier copy
album_table.to_sql(name="album_table", con=con, if_exists='replace')