## Step 2 - Tokenize

In [59]:
from __future__ import division
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import itertools
import nltk
import string
import re
import pandas as pd
from pandas import Series

### Load data and create a Series
Load a compressed dataframe and converts to a Series after a first pass at cleaning up the text.

In [60]:
def initialClean(text):
    # remove newlines, doubled singlequotes, excess whitespace
    text = text.replace("\n", " ").replace("\'\'", "").strip()
    # remove text enclosed in brackets; ex: [Verse 1]
    text = re.sub(r'\[.+?\]', '', text).strip()
    # remove text beginning with a single quote; ex: '01, 'em
    text = re.sub(r"\'\w+", "", text).strip()
    return text

In [61]:
# read the raw lyrics from a Pandas DataFrame saved as a csv.
lyrics_df = pd.read_csv("./corpus/lyrics/lyrics_raw.gz", compression='gzip', names=['uri', 'lyrics'])
lyrics_df.head()

Unnamed: 0,uri,lyrics
0,036B7lKiRkLerLGI6EHtEr.txt,[Thanks A Lot]\n[Verse]\nYou're telling everyo...
1,0Avmi9t3sOcaGSs1DSbgDg.txt,[Folsom Prison Blues]\nI hear the train a comi...
2,0FR4Ua3VxoSVA7DOFtdPlO.txt,"[Drink To Me]\n[Chorus]\nDrink to me, drink to..."
3,0KSHmjK7OFtGocvbo7NZNO.txt,[Five Feet High And Rising]\n[Chorus]\nHow hig...
4,0PlyzrcKNoaTo5lAVzZCKE.txt,"[Ballad Of A Teenage Queen]\n(Dream on, dream ..."


In [62]:
lyrics_raw = []
for lyrics in lyrics_df['lyrics']:
    lyrics_raw.append(initialClean(lyrics))

# create a Series to index lyrics by filename
lyrics_by_file = Series(lyrics_raw,index=lyrics_df['uri'])

### Tokenize
We'll tokenize using a regular expression tokenizer to preserve contractions.

In [63]:
from nltk.tokenize import RegexpTokenizer
regexpTokenizer = RegexpTokenizer("[\w']+")

In [64]:
for filen, lyrics in lyrics_by_file.iteritems():
    lyrics_by_file[filen] = " ".join(regexpTokenizer.tokenize(lyrics))
lyrics_by_file.head()

uri
036B7lKiRkLerLGI6EHtEr.txt    You telling everyone in town that I don treat ...
0Avmi9t3sOcaGSs1DSbgDg.txt    I hear the train a comin' It rolling the bend ...
0FR4Ua3VxoSVA7DOFtdPlO.txt    Drink to me drink to me Drink to me drink to m...
0KSHmjK7OFtGocvbo7NZNO.txt    How high the water mama Two feet high and risi...
0PlyzrcKNoaTo5lAVzZCKE.txt    Dream on dream on teenage queen prettiest girl...
dtype: object

In [65]:
from nltk.corpus import stopwords
english_stops = stopwords.words('english')
custom_stops = []
with open("./resources/custom_stopwords.txt", 'rb') as handle:
    custom_stops = handle.read().split("\n")
    print custom_stops

['yeah', 'yea', 'ooh', 'ahh', 'ahhh', 'ahhhh', 'ahhhhh', 'huh', 'hey', 'hmm', 'mmm', 'mhm']


In [66]:
def clean_tokens(tokens):
    """ Lowercases, takes out punct and stopwords and short strings """
    return [token.lower() for token in tokens if (token not in string.punctuation) and 
                   (token.lower() not in custom_stops) and (token.lower() not in english_stops) and len(token) > 2]

In [67]:
lyrics_clean = []
for filen, lyrics in lyrics_by_file.iteritems():
    lyrics_clean.append(" ".join(clean_tokens(lyrics.split(' '))))
    
# create a Series to index cleaned lyrics by filename
lyrics_clean_by_file = Series(lyrics_clean,index=lyrics_by_file.index)
lyrics_clean_by_file.head()



uri
036B7lKiRkLerLGI6EHtEr.txt    telling everyone town treat right even say sta...
0Avmi9t3sOcaGSs1DSbgDg.txt    hear train comin' rolling bend seen sunshine s...
0FR4Ua3VxoSVA7DOFtdPlO.txt    drink drink drink drink drink rose carnation l...
0KSHmjK7OFtGocvbo7NZNO.txt    high water mama two feet high rising high wate...
0PlyzrcKNoaTo5lAVzZCKE.txt    dream dream teenage queen prettiest girl ever ...
dtype: object

In [68]:
total_raw_tokens = 0
for filen, lyrics in lyrics_by_file.iteritems():
    total_raw_tokens += len(lyrics.split(' '))

total_clean_tokens = 0
lyrics_clean = ""
for filen, lyrics in lyrics_clean_by_file.iteritems():
    total_clean_tokens += len(lyrics.split(' '))
    lyrics_clean = lyrics_clean + lyrics
    
print "total raw tokens: {0}".format(total_raw_tokens)
print "total clean tokens: {0}".format(total_clean_tokens)
print "{0:.0f}% reduction".format(100 - total_clean_tokens / total_raw_tokens * 100)

total raw tokens: 10718
total clean tokens: 5056
53% reduction


In [69]:
# combine all cleaned tokens in an list an sort
%time sorted_clean = sorted(lyrics_clean.split(' '))

CPU times: user 2.73 ms, sys: 815 µs, total: 3.54 ms
Wall time: 3.69 ms


In [70]:
# sample clean tokens to see if we still have garbage
sorted_clean[0:10]

['abused',
 'accidentally',
 'aches',
 'achesyoung',
 'aching',
 'aching',
 'aching',
 'acres',
 'acting',
 'admit']

### Save tokens
We'll use this file in later notebooks.

In [71]:
def writeFile(path, text):
    text_file = open(path, "w") 
    text_file.write(text + " ")
    text_file.close()

In [72]:
for filen, lyrics in lyrics_clean_by_file.iteritems():
    writeFile("./corpus/lyrics/tokenized/" + filen.replace("spotify:track:", "") + ".txt", lyrics)