In [None]:
##############################
## Web scraping song lyrics ##
##############################

import random
import time
from urllib.parse import urljoin

import requests
from lxml import html

# Artist index pages to harvest song links from. Each page lives at
# /<first letter of slug>/<slug>.html, so the URLs are derived from the slugs.
web_list = [
    f'http://www.azlyrics.com/{slug[0]}/{slug}.html'
    for slug in (
        'wonder',
        'bowie',
        'tool',
        'nine',
        'metallica',
        'blacksabbath',
        'jayz',
        'frankzappa',
    )
]

# Start every run from an empty output file: opening in 'w' mode creates the
# file if needed and truncates whatever a previous run left behind.
with open('scraped_lyrics.txt', 'w'):
    pass
    
# First pass: visit every artist index page, collect the links to its songs and
# keep a random sample of them in url_list.
random.seed(554646)    # fixed seed so the same songs are sampled on every run
SONGS_PER_ARTIST = 31  # upper bound on the number of songs kept per artist page
INDEX_TIMEOUT = 30     # seconds to wait for an index page before giving up

url_list = []
for page in web_list:
    artist_songlist = []
    data = html.fromstring(requests.get(page, timeout=INDEX_TIMEOUT).text)
    for song in data.xpath("//div[@class='listalbum-item']"):
        hrefs = song.xpath("a/@href")
        if not hrefs:
            # entry without a link: nothing to scrape
            continue
        href = hrefs[0]
        if href.startswith(('http://', 'https://', '//')):
            # already-absolute links are skipped; only site-relative song links are kept
            continue
        # urljoin resolves both '/lyrics/...' and '../lyrics/...' against the
        # index page, which plain string concatenation would get wrong
        artist_songlist.append(urljoin(page, href))
    random.shuffle(artist_songlist)
    print(f'Page: {page} - scraped {len(artist_songlist)} song URLs')
    url_list.extend(artist_songlist[:SONGS_PER_ARTIST])
    time.sleep(5)  # pause between index pages to stay polite to the server

# Second pass: download every sampled song page and append its lyric lines to
# scraped_lyrics.txt. A page is retried up to three times on errors; when a page
# yields no lyric lines the user is asked whether to retry or skip it.
print(f'Attempting to scrape lyrics from {len(url_list)} total songs')
random.shuffle(url_list)  # mix the songs of the different artist pages
scraped_count = 0
for page_number, song_url in enumerate(url_list, start=1):
    successful = False
    connection_error_count = 0
    while not successful:
        print('Collecting from page: ', page_number)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(song_url, timeout=30)
            data = html.fromstring(response.text)
            lyrics_count = 0
            # Open the output once per song rather than once per lyric line;
            # explicit UTF-8 so non-ASCII lyrics are written on every platform.
            with open('scraped_lyrics.txt', 'a', encoding='utf-8') as output_file:
                for lyric in data.xpath("//div[contains(@class,'text-center')]"):
                    # text nodes of the fifth child <div> of each matching container
                    for line in lyric.xpath("div[5]/text()"):
                        stripped = line.strip()
                        if not stripped:
                            continue
                        try:
                            output_file.write(str(stripped) + '\n')
                        except UnicodeEncodeError:
                            # best effort: drop a line that cannot be encoded
                            continue
                        lyrics_count += 1
            if lyrics_count == 0:
                print('Blocked?')
                x = input("Press any key to try again, or 'skip' to continue to next song.")
                if x == 'skip':
                    print(f'Skipping page {page_number}: {song_url}')
                    successful = True
                # any other answer loops back and requests the same page again
            else:
                successful = True
                scraped_count += 1
                time.sleep(5) # timer to avoid being ip blocked
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
            # stops the run; the error is shown because parse failures land here too.
            print('Connection failure?', repr(err))
            connection_error_count += 1
            if connection_error_count >= 3:
                print(f'Skipping page {page_number}: {song_url}')
                successful = True
            else:
                print('Waiting 10 seconds to retry.')
                time.sleep(10)

print(f'Finished scraping {scraped_count} song lyrics. Encountered {len(url_list) - scraped_count} errors.')

Page: http://www.azlyrics.com/w/wonder.html - scraped 330 song URLs
Page: http://www.azlyrics.com/b/bowie.html - scraped 368 song URLs
Page: http://www.azlyrics.com/t/tool.html - scraped 72 song URLs
Page: http://www.azlyrics.com/n/nine.html - scraped 127 song URLs
Page: http://www.azlyrics.com/m/metallica.html - scraped 162 song URLs
Page: http://www.azlyrics.com/b/blacksabbath.html - scraped 206 song URLs
Page: http://www.azlyrics.com/j/jayz.html - scraped 278 song URLs
Page: http://www.azlyrics.com/f/frankzappa.html - scraped 563 song URLs
Attempting to scrape lyrics from 248 total songs
Collecting from page:  1
Collecting from page:  2
Collecting from page:  3
Collecting from page:  4
Connection failure?
Waiting 10 seconds to retry.
Collecting from page:  4
Collecting from page:  5
Collecting from page:  6
Collecting from page:  7
Collecting from page:  8
Collecting from page:  9
Collecting from page:  10
Collecting from page:  11
Collecting from page:  12
Collecting from page:  13

In [8]:
# De-duplicate the corpus, then split it into two interleaved halves that are
# written out as the train / test / valid files.

# Read the raw corpus; the context manager closes the handle once it is loaded.
with open("full_corpus.txt", "r") as corpus_file:
    text = corpus_file.readlines()

# Remove duplicate lines. dict keys keep first-seen order, so the order of the
# remaining lines is unchanged.
text = list(dict.fromkeys(text))
with open('full_corpus_no_dups.txt', 'w') as output_file:
    output_file.writelines(text)

# Alternate the lines between the two lists: positions 0, 2, 4, ... go to list1
# and positions 1, 3, 5, ... go to list2. Slicing also works when the number of
# lines is odd (list1 then simply holds one line more than list2).
list1 = text[0::2]
list2 = text[1::2]

# test and valid datasets should be the same per COCO example in github repo
# NOTE(review): no 'valid.txt' is written, unlike the train/test pairs - confirm
# whether the downstream code expects one.
split_outputs = [
    ('train.tgt', list1),
    ('train.txt', list1),
    ('test.tgt', list2),
    ('test.txt', list2),
    ('valid.tgt', list2),
]
for split_path, split_lines in split_outputs:
    with open(split_path, 'w') as output_file:
        output_file.writelines(split_lines)

In [9]:
import nltk
from nltk import word_tokenize
# Tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases appear to look up the 'punkt_tab' resource
# instead - confirm against the installed NLTK version.
nltk.download('punkt')

# Average number of tokens per line of the de-duplicated corpus.
counter = 0  # number of lines read
length = 0   # total number of tokens over all lines
with open('full_corpus_no_dups.txt') as text_file:
    for line in text_file:
        length += len(word_tokenize(line))
        counter += 1
# An empty corpus file would otherwise raise ZeroDivisionError; report 0.0 then.
avglen = length / counter if counter else 0.0
print(f'The average length of a line in the text is:  {avglen}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The average length of a line in the text is:  7.872226472838562
