## Creating the Optimal Gov's Ball Schedule

In [7]:
#Bring in the necessary libraries
import pandas as pd
from io import StringIO
import urllib
from bs4 import BeautifulSoup 
import sqlite3 
import time
import numpy 
import signal
from urllib.request import Request, urlopen

### Get Gov's Ball Artist Info

In [8]:
#Download the interactive lineup page from the Gov's Ball website and parse it
LINEUP_URL = "https://www.governorsballmusicfestival.com/lineup/interactive-lineup/"
with urllib.request.urlopen(LINEUP_URL) as response:
    lineup_html = response.read()
soup = BeautifulSoup(lineup_html, "lxml")


In [9]:
info = soup.findAll('div', {"class":"c-lineup__artist"})
#Collect one record per artist element, then build the dataframe in a single step.
#(DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the whole frame for every row.)
records = [{'artist': tag.attrs['data-title'],
            'date'  : tag.attrs["data-day-titles"]}
           for tag in info]
#Naming the columns explicitly keeps them present even when nothing was scraped
govs_ball_data = pd.DataFrame(records, columns=['artist', 'date'])
govs_ball_data.head()

Unnamed: 0,artist,date
0,Tool,"[""Sunday, June 4th""]"
1,Chance The Rapper,"[""Friday, June 2nd""]"
2,Phoenix,"[""Saturday, June 3rd""]"
3,Childish Gambino,"[""Saturday, June 3rd""]"
4,Lorde,"[""Friday, June 2nd""]"


In [10]:
#The scraped "date" value is a list-style string such as '["Sunday, June 4th"]'.
#Both steps below give the same result on raw and on already-cleaned values,
#so re-running this cell does not corrupt the columns.

#"day" = (day of the month - 1) stored as text, e.g. "June 2nd" -> "1".
#The pattern grabs the last ordinal number in the string, so two-digit days work too.
govs_ball_data["day"] = (
    govs_ball_data["date"]
    .str.extract(r'(\d+)(?:st|nd|rd|th)\W*$', expand=False)
    .astype(int)
    .sub(1)
    .astype(str)
)
#Remove the surrounding [" ... "] wrapper from the date text
govs_ball_data["date"] = govs_ball_data["date"].str.strip('[]"')
govs_ball_data

Unnamed: 0,artist,date,day
0,Tool,"Sunday, June 4th",3
1,Chance The Rapper,"Friday, June 2nd",1
2,Phoenix,"Saturday, June 3rd",2
3,Childish Gambino,"Saturday, June 3rd",2
4,Lorde,"Friday, June 2nd",1
5,Flume,"Friday, June 2nd",1
6,Wu-Tang Clan,"Saturday, June 3rd",2
7,Wiz Khalifa,"Sunday, June 4th",3
8,Logic,"Sunday, June 4th",3
9,Cage The Elephant,"Sunday, June 4th",3


#### Add Names to a SQLite Database

In [11]:
#Open (or create) the SQLite file that stores everything scraped in this notebook
DB_PATH = "pitchfork-data.db"
con = sqlite3.connect(DB_PATH)

In [12]:
#Store the festival lineup in SQLite, overwriting the table left by any earlier run
govs_ball_data.to_sql(name="govs_ball_data", con=con, if_exists="replace")

## Pitchfork Crawler

The crawling happens in two distinct stages. In Stage 1, the code loops through the reviews pages on pitchfork.com to find links to all the reviews. Stage 2 goes to each link and pulls various bits of information. There's lots more to pull, but this is a solid starting place.

In [17]:
links = list() #Starts empty; Stage 1 fills it with review links

In [21]:
#Stage 1: walk the paginated album-review index and collect the link to every review
AVERAGE_SECONDS_BETWEEN_REQUESTS = 10 #Don't go too hard on Pitchfork's servers

#range() excludes its stop value, so each batch must start where the previous one
#stopped -- otherwise pages 500 and 1000 are never visited. The batches only exist
#so the crawler can take a long rest between them.
for loop in [range(253, 500), range(500, 1000), range(1000, 1500)]:
    for i in loop: #Use the range function to decide how many pages you want to go through
        page_no = str(i)
        link = ('http://pitchfork.com/reviews/albums/?page=' + page_no) #create the link
        req = Request(link, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }) #Mask the bot
        webpage = urlopen(req).read()
        soup = BeautifulSoup(webpage, "lxml") #create the soup
        info = soup.findAll('a', {"class":"album-link"}) #pull the album link
        links.extend(j.attrs['href'] for j in info) #grab all the link attributes
        #Draw a single float: asking for size=1 returns a one-element array, and
        #time.sleep() expects a plain number
        time.sleep(numpy.random.exponential(AVERAGE_SECONDS_BETWEEN_REQUESTS))  # pause between server requests
        print (i)
    time.sleep(100) #Sleep for a while between batches


55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252


KeyboardInterrupt: 

In [22]:
#Number of review links collected so far
len(links)

3060

In [24]:
#Put the collected links in a one-column dataframe
link_table = pd.DataFrame({"links": links})
#Snapshot the first 3000 links to SQLite. if_exists="replace" lets this cell be
#re-run; the default ("fail") raises ValueError once the table already exists.
link_table[:3000].to_sql("link_table_first_3000", con, if_exists="replace")
link_table.head()

Unnamed: 0,links
0,/reviews/albums/22947-eitheror-expanded-edition/
1,/reviews/albums/22945-moh-lhean/
2,/reviews/albums/22908-erasmo-carlos-e-os-treme...
3,/reviews/albums/22977-uyai/
4,/reviews/albums/22930-50-song-memoir/


In [25]:
#In-memory table for the per-album details; starts with no rows and no columns
album_table = pd.DataFrame(data=[])

In [44]:
#Stage 2: visit every review link and pull artist, album, score, publication date and genre
BASE_URL = 'http://www.pitchfork.com'
RESPONSE_DELAY_MULTIPLIER = 4  # sleep this many times longer than the server took to respond


def _last_text(tags):
    """Return the text of the last tag in ``tags``, or None when nothing matched.

    Using None for a missing element stops a value scraped from the previous
    review from being written against the current one.
    """
    return tags[-1].text if tags else None


new_rows = []  # rows scraped during this run, in crawl order
try:
    for i in links[1250:]:
        link = BASE_URL + i
        t0 = time.time()
        req = Request(link, headers={ 'User-Agent': 'Firefox/24.0' })
        webpage = urlopen(req).read()
        response_delay = time.time() - t0
        time.sleep(RESPONSE_DELAY_MULTIPLIER * response_delay)  # back off in proportion to server load
        soup = BeautifulSoup(webpage, "lxml") #same as above
        row = {
            'artist'  : _last_text(soup.findAll('ul', {"class":"artist-links artist-list"})), #Artist Name
            'album'   : _last_text(soup.findAll('h1', {"class":"review-title"})), #Album Name
            'score'   : _last_text(soup.findAll('div', {"class":"score-circle"})), #Score
            'pub_date': _last_text(soup.findAll('span', {"class":"pub-date"})), # Publication Date
            'genre'   : _last_text(soup.findAll('ul', {"class":"genre-list before"})), #Genre
        }
        print (row['artist'], ", ", row['album'])
        # Write each row straight away so an interrupted crawl keeps what it already fetched
        pd.DataFrame(row, index=[0]).to_sql("album_table",
                                            con,
                                            if_exists = "append")
        new_rows.append(row)
finally:
    # Mirror the new rows into the in-memory table even if the crawl was interrupted
    if new_rows:
        new_frame = pd.DataFrame(new_rows)
        if album_table.empty:
            album_table = new_frame
        else:
            album_table = pd.concat([album_table, new_frame], ignore_index=True)

album_table.head()

The Goon Sax ,  Up to Anything
Bent Shapes ,  Wolves of Want
Gadget ,  The Great Destroyer
Primal Scream ,  Chaosmosis
Lil Yachty ,  Lil Boat
HÆLOS ,  Full Circle
Various Artists ,  Wayfaring Strangers: Cosmic American Music
Chris Forsyth and the Solar Motel Band ,  The Rarity of Experience
Jeff Buckley ,  You and I
Flatbush Zombies ,  3001: A Laced Odyssey
The Feelies ,  Only Life
Lifetones ,  For A Reason
Ceramic TL ,  Sign of the Cross Every Mile to the Border
Underworld ,  Barbara Barbara, we face a shining future
The Body ,  No One Deserves Happiness
ZelooperZ ,  Bothic
Horse Jumper of Love ,  Horse Jumper of Love
Zach Cooper ,  The Sentence
Sheer Mag ,  III EP
Clark ,  The Last Panthers
DJ Katapila ,  Trotro
Fake Boyfriend ,  Mercy EP
Immix Ensemble & Vessel ,  Transition
The Range ,  Potential
Låpsley ,  Long Way Home
M. Ward ,  More Rain
Big Ups ,  Before a Million Universes
Methyl Ethel ,  Oh Inhuman Spectacle
Iggy Pop ,  Post Pop Depression
Fatima Al Qadiri ,  Brute
Tiga ,  N

KeyboardInterrupt: 

In [40]:
#(rows, columns) of the in-memory album table
album_table.shape

(914, 5)

In [39]:
#Overwrite the SQLite "album_table" with the in-memory table.
#Skip the write when the frame is empty: if_exists="replace" would otherwise drop
#every row already stored in the database and leave an empty table behind.
if not album_table.empty:
    album_table.to_sql("album_table", con, if_exists = "replace")