## Creating the Optimal Gov's Ball Scedule

In [35]:
#Bring in the neccessary libraries 
import pandas as pd
from io import StringIO
import urllib
from bs4 import BeautifulSoup 
import sqlite3 
import time
import numpy 
import signal
from urllib.request import Request, urlopen

### Get Gov's Ball Artist Info

In [2]:
#Grab the artist info from the Gov's ball website
with urllib.request.urlopen("https://www.governorsballmusicfestival.com/lineup/interactive-lineup/") as url:
    s = url.read()
soup = BeautifulSoup(s, "lxml")


In [6]:
info = soup.findAll('div', {"class":"c-lineup__artist"})
#Put those names in a list, then a dataframe
govs_ball_data = pd.DataFrame([])
for i in info:
    artist = i.attrs['data-title']
    date = i.attrs["data-day-titles"]
    govs_ball_data = govs_ball_data.append(pd.DataFrame({'artist': artist, #Create the table
                                                         'date'  : date},
                                                  index=[0]),
                                     ignore_index=True)
govs_ball_data.head()

Unnamed: 0,artist,date
0,Tool,"[""Sunday, June 4th""]"
1,Chance The Rapper,"[""Friday, June 2nd""]"
2,Phoenix,"[""Saturday, June 3rd""]"
3,Childish Gambino,"[""Saturday, June 3rd""]"
4,Lorde,"[""Friday, June 2nd""]"


In [29]:
govs_ball_data["day"] = ((govs_ball_data["date"].str.slice(-5, -4)).astype(int) -1).astype(str)
govs_ball_data["date"] = govs_ball_data["date"].str.slice(2, -2)
govs_ball_data

Unnamed: 0,artist,date,day
0,Tool,"Sunday, June 4th",3
1,Chance The Rapper,"Friday, June 2nd",1
2,Phoenix,"Saturday, June 3rd",2
3,Childish Gambino,"Saturday, June 3rd",2
4,Lorde,"Friday, June 2nd",1
5,Flume,"Friday, June 2nd",1
6,Wu-Tang Clan,"Saturday, June 3rd",2
7,Wiz Khalifa,"Sunday, June 4th",3
8,Logic,"Sunday, June 4th",3
9,Cage The Elephant,"Sunday, June 4th",3


#### Add Names to a SQLite Database

In [30]:
con = sqlite3.connect("pitchfork-data.db")

In [31]:
govs_ball_data.to_sql("govs_ball_data", con,if_exists='replace')

## Pitchfork Crawler

The crawling happens in two distinct stages. In Stage 1, the code loops throught the reviews page on pitchfork.com to find links to all the reviews. Stage 2 goes to each link and pull various bits of information. There's lots more to pull, but this is a solid starting place. 

In [62]:
links = [] #Create an empty list to fill with links

In [69]:
#Stage 1
AVERAGE_SECONDS_BETWEEN_REQUESTS = 5 #Don't go too hard on Pitchfork's servers

for i in range(1074,1500): #Use the range function to decide how many pages you want to go through 
    page_no = str(i)
    link = ('http://pitchfork.com/reviews/albums/?page=' + page_no) #create the link
    req = Request(link, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }) #Mask the bot
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, "lxml") #create the soup
    info = soup.findAll('a', {"class":"album-link"}) #pull the album link
    for j in info:
        links.append(j.attrs['href']) #grab all the link attributes
    time.sleep(numpy.random.exponential(AVERAGE_SECONDS_BETWEEN_REQUESTS, 1))  # pause between server requests
    print (i)
    


1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273


KeyboardInterrupt: 

In [70]:
len(links)

16716

In [71]:
link_table = pd.DataFrame([])
link_table["links"] = links
#link_table.to_sql("link_table", con)
link_table.head()

Unnamed: 0,links
0,/reviews/albums/22936-the-tourist/
1,/reviews/albums/22003-planetary-prince/
2,/reviews/albums/22939-machine-response/
3,/reviews/albums/22878-various-artists-outro-te...
4,/reviews/albums/22978-hndrxx/


In [78]:
album_table = pd.DataFrame([]) #Create an empty dataframe that'll hold the info for each album

In [84]:
BASE_URL = 'http://www.pitchfork.com'
AVERAGE_SECONDS_BETWEEN_REQUESTS = 5

for i in links:
    link = BASE_URL + i
    req = Request(link, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' })
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, "lxml") #same as above
    artist_info = soup.findAll('ul', {"class":"artist-links artist-list"}) #Artist Name
    album_info = soup.findAll('h1', {"class":"review-title"}) #Album Name
    score_info = soup.findAll('div', {"class":"score-circle"}) #Score
    pub_info = soup.findAll('span', {"class":"pub-date"}) # Publication Date
    genre_info = soup.findAll('ul', {"class":"genre-list before"}) #Genre
    for j in artist_info:
        artist = j.text
    for k in album_info:
        album = k.text
    for l in score_info:
        score = l.text
    for m in pub_info:
        pub_date = m.text
    for n in genre_info:
        genre = n.text
    print (artist, ", ", album)
    album_table = album_table.append(pd.DataFrame({'artist': artist, #Create the table
                                                   'album'  : album,
                                                   'score' : score,
                                                   'pub_date' : pub_date, 
                                                   'genre' : genre},
                                                  index=[0]),
                                     ignore_index=True)
    time.sleep(numpy.random.exponential(AVERAGE_SECONDS_BETWEEN_REQUESTS, 1))


album_table.head()

Clap Your Hands Say Yeah ,  The Tourist
Cameron Graves ,  Planetary Prince
Career Suicide ,  Machine Response
Various Artists ,  Outro Tempo: Electronic and Contemporary Music From Brazil, 1978-1992
Future ,  HNDRXX
PC Worship ,  Buried Wish
Aseethe ,  Hopes of Failure
Dams of the West ,  Youngish American
Vagabon ,  Infinite Worlds
Oddisee ,  The Iceberg
Sherwood & Pinch ,  Man Vs. Sofa
Karriem Riggins ,  Headnod Suite
Power Trip ,  Nightmare Logic
Los Campesinos! ,  Sick Scenes
Pissed Jeans ,  Why Love Now
Xiu Xiu ,  FORGET
Sun Kil Moon ,  Common as Light and Love Are Red Valleys of Blood
Stormzy ,  Gang Signs & Prayer
Entrance ,  Book of Changes
Six Organs of Admittance ,  Burning the Threshold
Flume ,  Skin Companion EP II
Thundercat ,  Drunk
The Feelies ,  In Between
Jonwayne ,  Rap Album Two
Six by Seven ,  The Closer You Get
Weezer ,  Weezer (Blue Album)
Kingdom ,  Tears in the Club
King Gizzard & The Lizard Wizard ,  Flying Microtonal Banana
Grails ,  Chalice Hymnal
Alison Krau

KeyboardInterrupt: 

In [85]:
#album_table.to_sql("album_table", con,if_exists='replace')
album_table

Unnamed: 0,album,artist,genre,pub_date,score
0,The Tourist,Clap Your Hands Say Yeah,Rock,March 4 2017,7.5
1,Planetary Prince,Cameron Graves,Jazz,March 4 2017,7.7
2,Machine Response,Career Suicide,Rock,March 4 2017,7.5
3,Outro Tempo: Electronic and Contemporary Music...,Various Artists,Rock,March 4 2017,8.8
4,HNDRXX,Future,Rap,March 3 2017,7.8
5,Buried Wish,PC Worship,Rock,March 3 2017,7.3
6,Hopes of Failure,Aseethe,Rock,March 3 2017,6.0
7,Youngish American,Dams of the West,Rock,March 3 2017,4.6
8,Infinite Worlds,Vagabon,Rock,March 2 2017,8.5
9,The Tourist,Clap Your Hands Say Yeah,Rock,March 4 2017,7.5
