# build Rock corpus
reference code & dataset seed: Sebastian Raschka, http://www.cs.ubbcluj.ro/zbodo/lastfm.html

In [2]:
import urllib
import lxml.html

class Song(object):
    def __init__(self, artist, title):
        self.artist = self.__format_str(artist)
        self.title = self.__format_str(title)
        self.url = None
        self.lyric = None
        
    def __format_str(self, s):
        # remove paranthesis and contents
        s = s.strip()
        try:
            # strip accent
            s = ''.join(c for c in unicodedata.normalize('NFD', s)
                         if unicodedata.category(c) != 'Mn')
        except:
            pass
        s = s.title()
        return s
        
    def __quote(self, s):
         return urllib.pathname2url(s.replace(' ', '_'))

    def __make_url(self):
        artist = self.__quote(self.artist)
        title = self.__quote(self.title)
        artist_title = '%s:%s' %(artist, title)
        url = 'http://lyrics.wikia.com/' + artist_title
        self.url = url
        
    def update(self, artist=None, title=None):
        if artist:
            self.artist = self.__format_str(artist)
        if title:
            self.title = self.__format_str(title)
        
    def lyricwikia(self):
        self.__make_url()
        try:
            doc = lxml.html.parse(self.url)
            lyricbox = doc.getroot().cssselect('.lyricbox')[0]
        except (IOError, IndexError) as e:
            self.lyric = ''
            return self.lyric
        lyrics = []

        for node in lyricbox:
            if node.tag == 'br':
                lyrics.append('\n')
            if node.tail is not None:
                lyrics.append(node.tail)
        self.lyric =  "".join(lyrics).strip()    
        return self.lyric

In [3]:
#example
song = Song(artist='John Mellencamp', title='Jack and Diane')
lyr = song.lyricwikia()
print(lyr)

Two American kids growin' up in the heartland
Jackie gonna be a football star
Diane's a debutante, backseat of Jackie's car

Suckin' on a chili dog outside the Tastee-Freez
Diane's sittin' on Jackie's lap
He's got his hands between her knees
Jackie say, "Hey Diane, let's run off behind the shady trees
Dribble off those Bobbie Brooks, let me do what I please."
And say uh

Oh yeah, life goes on
Long after the thrill of livin' is gone, they say uh
Oh yeah, life goes on
Long after the thrill of livin' is gone, they walk on

Jackie sits back, collects his thoughts for the moment
Scratches his head and does his best James Dean
"Well then there Diane, we oughta run off to the city."
Diane says, "Baby, you ain't missin' nothing."
And Jackie say uh

Oh yeah, life goes on
Long after the thrill of livin' is gone
Oh yeah, they say life goes on
Long after the thrill of livin' is gone

Gonna let it rock
Let it roll
Let the Bible Belt come and save my soul
Hold on to sixteen as long as you can
Change

# build data set by getting data from lastfm

# test set

## test set top genres

In [25]:
lastfm_test='/Users/zangsir/Downloads/lastfm.test_data.json'
f=open(lastfm_test,'r').read().split('\n')

import json
from collections import defaultdict
#seen=set()
genres=defaultdict(int)
for i in range(len(f)):
    if f[i]=='':
        continue
    a=json.loads(f[i])
    top_tag=a['tags'][0][0]
    #seen.add(top_tag)
    genres[top_tag]+=1
    
print sorted(genres.items(),key=lambda x:x[1],reverse=True)[:10]

[(u'rock', 1507), (u'indie', 938), (u'pop', 813), (u'country', 665), (u'folk', 466), (u'soul', 385), (u'Progressive rock', 372), (u'death metal', 364), (u'electronic', 362), (u'80s', 362)]


## train set

In [31]:
lastfm_train='/Users/zangsir/Downloads/lastfm.train_data.json'
f=open(lastfm_train,'r').read().split('\n')

import json
from collections import defaultdict
#seen=set()
genres=defaultdict(int)
for i in range(len(f)):
    if f[i]=='':
        continue
    a=json.loads(f[i])
    top_tag=a['tags'][0][0]
    #seen.add(top_tag)
    genres[top_tag]+=1
    
print sorted(genres.items(),key=lambda x:x[1],reverse=True)[:10]

[(u'rock', 15269), (u'indie', 6803), (u'pop', 6299), (u'country', 5066), (u'punk', 3818), (u'soul', 3436), (u'folk', 3113), (u'80s', 3022), (u'classic rock', 2756), (u'electronic', 2445)]


# rock has the most tunes in both data sets. let's download lyrics for rock

## first we need to get the artist and title of songs where tags are rock

In [34]:
def get_artist_title(data_set):
    """return a list of lists, where each of the inner lists is a tuple of artist and title """
    f=open(data_set,'r').read().split('\n')
    all_downloads=[]
    for i in range(len(f)):
        if f[i]=='':
            continue
        a=json.loads(f[i])
        top_tag=a['tags'][0][0]
        #seen.add(top_tag)
        if top_tag=='rock':
            artist=a['artist_new']
            title=a['title_new']#using the lyricsWikia artist and title names
            all_downloads.append([artist,title])
    return all_downloads

In [36]:
test_rock=get_artist_title(lastfm_test)

In [40]:
train_rock=get_artist_title(lastfm_train)

# download lyrics

In [56]:

def download_lyrics(artist_title_list):
    open('lyrics_data.txt','w').close()
    f=open('lyrics_data.txt','a')
    count=1
    for tup in artist_title_list:
        if count%500==0:
            print count
        try:
            song = Song(artist=tup[0], title=tup[1])
            lyr = song.lyricwikia()
            f.write("SONG_META_ARTIST_TITLE:" + tup[0] + "   " + tup[1]+'\n\n')
            f.write(lyr.encode('utf-8')+'\n\n\n')
            count+=1
        except Exception:
            print 'error'
            continue
    f.close()
    print 'total downloaded:',count
    
        

In [55]:
download_lyrics(test_rock)

error
error
error
error
total downloaded: 1504


In [57]:
download_lyrics(train_rock)

error
500
error
error
1000
error
error
error
1500
error
error
2000
error
error
2500
error
error
error
3000
error
error
error
3500
4000
error
4500
5000
error
error
error
5500
error
error
6000
error
error
error
error
error
error
6500
error
7000
7500
error
error
error
error
error
8000
error
8500
error
error
error
error
9000
9500
error
error
10000
error
error
10500
error
11000
error
error
11500
error
error
12000
error
error
12500
error
error
error
13000
error
error
13500
error
error
error
error
14000
error
error
error
14500
error
15000
error
total downloaded: 15206
