forked from jbrew/stereotype
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getlyrics.py
37 lines (25 loc) · 884 Bytes
/
getlyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from bs4 import BeautifulSoup
import urllib2
import re
import random
# Scrape the lyrics pages that metrolyrics lists for one artist and write the
# text of every verse into a single file, texts/<artist>.txt.
#
# Flow:
#   1. Ask for the artist name on stdin.
#   2. Fetch the artist's index page and collect every link whose href
#      contains "lyrics-<artist>".
#   3. Fetch each linked page and write the text of every element with
#      class "verse" to the output file, separated by single spaces.
#
# NOTE: this is a Python 2 script (raw_input, urllib2, str-based file writes).
# Single-argument print(...) is used because it behaves identically under the
# Python 2 print statement.

artist_name = raw_input('Enter artist name:\n')

# split() with no argument discards leading, trailing and repeated whitespace,
# so "  Bob   Dylan " becomes "Bob-Dylan" instead of a fragment with empty
# segments. The same fragment is used for the URL, the link filter and the
# output file name so the three can never disagree.
artist_url_fragment = '-'.join(artist_name.split())
url = "http://www.metrolyrics.com/%s-lyrics.html" % artist_url_fragment
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")

suffix = 'lyrics-' + artist_url_fragment
print(suffix)

# Find every element (of any tag) whose href contains the artist suffix.
# The suffix is user input, so escape it: otherwise characters such as
# '$', '.', '+' or '(' in an artist name would be read as regex syntax.
foo = soup.findAll(href=re.compile(re.escape(suffix)))
print(len(foo))

outfilename = "texts/%s.txt" % artist_url_fragment

# The with-block guarantees the file is flushed and closed even if one of the
# page fetches below raises part-way through the run.
with open(outfilename, 'w') as outfile:
    for link in foo:
        url = link['href']
        print(url)
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, "html.parser")
        subfoo = soup.findAll(class_='verse')
        for x in subfoo:
            # get_text() returns unicode; encode before writing to the
            # byte-oriented Python 2 file object.
            outfile.write(x.get_text().encode('utf8') + " ")