### Step 3 - Tf-idf filter
Use the tf-idf scores to filter out words whose score in a song falls below a fixed threshold.

In [51]:
from __future__ import division
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import pandas as pd
from pandas import Series
import json

In [52]:
# Minimum tf-idf score a word needs in a file to be kept; words scoring below
# this value are removed from that file's lyrics by the filter step.
FILTER_THRESHOLD = 0.003

In [53]:
def loadSourceJSON(path='./corpus/lyrics_tfidf.json'):
    """Load the precomputed tf-idf table from a JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to
            './corpus/lyrics_tfidf.json', the file this notebook uses, so
            existing zero-argument calls keep working.

    Returns:
        The object decoded by json.load from the file's contents.
    """
    with open(path, 'r') as infile:
        return json.load(infile)

In [54]:
# Corpus reader rooted at the tokenized lyrics directory; the second argument
# is the regex selecting which file ids belong to the corpus.
# NOTE(review): cat_pattern is matched against each file id and its first group
# becomes the category — with flat file names that is presumably the leading
# run of word characters of the name; confirm against the NLTK docs.
corpus = CategorizedPlaintextCorpusReader(
    "./corpus/lyrics/tokenized/",
    r".*\.txt",
    cat_pattern=r"(\w+)/*"
)

In [55]:
# Read every file the corpus reader lists, keeping one stripped text per file
# in the same order as corpus.fileids().
lyrics = []
for filen in corpus.fileids():
    source_path = "./corpus/lyrics/tokenized/" + filen
    with open(source_path, 'rb') as fh:
        raw_text = fh.read()
    lyrics.append(raw_text.strip())

# Series keyed by filename, so later cells can look lyrics up per file.
lyrics_by_file = Series(lyrics, index=corpus.fileids())
lyrics_by_file.head()

036B7lKiRkLerLGI6EHtEr.txt.txt    telling everyone town treat right even say sta...
0Avmi9t3sOcaGSs1DSbgDg.txt.txt    hear train comin' rolling bend seen sunshine s...
0FR4Ua3VxoSVA7DOFtdPlO.txt.txt    drink drink drink drink drink rose carnation l...
0KSHmjK7OFtGocvbo7NZNO.txt.txt    high water mama two feet high rising high wate...
0PlyzrcKNoaTo5lAVzZCKE.txt.txt    dream dream teenage queen prettiest girl ever ...
dtype: object

In [56]:
# Load the precomputed tf-idf table. Judging by the sample below and by how the
# table is consumed in the filter step, it maps each filename to a list of
# per-word records holding 'count', 'tf', 'tfidf' and 'word'.
# The original note showed it as defaultdict(list, ...), presumably how it
# looked where it was built; json.load hands back a plain dict.
# Abridged sample:
# {'04CpzA2BdOLEz7EMp5uwTU.txt': [{'count': 1,
#                                  'tf': 0.006211180124223602,
#                                  'tfidf': 0.01568775555470966,
#                                  'word': 'summer'},
#                                 {'count': 1,
#                                  'tf': 0.006211180124223602,
#                                  'tfidf': 0.01999301754576522,
#                                  'word': 'nights'},
#                                 ...
tfidfs = loadSourceJSON()

In [57]:
def diff(first, second):
    """Return the items of `first` that do not occur in `second`.

    The order of `first` is preserved and repeated items are kept. `second`
    is turned into a set for the membership tests, so items must be hashable.
    """
    excluded = set(second)
    kept = []
    for item in first:
        if item in excluded:
            continue
        kept.append(item)
    return kept

# For every file in the tf-idf table, drop the words whose score is below
# FILTER_THRESHOLD and keep the remaining tokens in their original order.
filenames = []
lyrics_tfidf = []
for filen, words in tfidfs.items():
    # Words of this file that scored under the threshold.
    low_score = [word['word'] for word in words if word['tfidf'] < FILTER_THRESHOLD]
    kept_tokens = diff(lyrics_by_file[filen].split(' '), low_score)
    lyrics_tfidf.append(' '.join(kept_tokens))
    filenames.append(filen)

# Filtered lyrics keyed by filename, in the table's iteration order.
lyrics_tfidf_by_file = Series(lyrics_tfidf, index=filenames)
lyrics_tfidf_by_file.head()

5yrEDH9bh0UgOoCwS7NmKZ.txt.txt    troubadour troubadour troubadour troubadour tr...
78eMv0YRfHsVKK15lE9yKu.txt.txt    good lord willing creek stays arms moon taste ...
3YPTYWasmdb0UoMp230b9M.txt.txt    sailed ship john grandfather around nassau tow...
0doe6BX5PtV8BDH0IffSdQ.txt.txt    tell left tell cried tell stayed around tell l...
5iCzMpRXMOamlgRH3FUvRz.txt.txt    run softly blue river darlin asleep run softly...
dtype: object

In [58]:
# Compare corpus size before and after the tf-idf filter.
# Tokens are split on single spaces, the same rule the filter step uses, and
# empty strings are skipped so a file left with no words contributes 0 tokens
# (''.split(' ') yields [''], which would otherwise be counted as 1).
# Iterating a Series yields its values, so Series.iteritems() (removed in
# pandas 2.0) is not needed, and no loop variable rebinds the `lyrics` list.
total_raw_tokens = sum(
    1 for text in lyrics_by_file for token in text.split(' ') if token)
total_tfidf_tokens = sum(
    1 for text in lyrics_tfidf_by_file for token in text.split(' ') if token)

# All filtered lyrics concatenated into one string. `lyrics_tfidf` is rebound
# to the same value as before, built with join() instead of repeated `+`.
# NOTE(review): there is no separator between files, so the last word of one
# file fuses with the first word of the next; nothing visible in this notebook
# reads this value — confirm whether it is still needed.
lyrics_tfidf = "".join(lyrics_tfidf_by_file)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("total raw tokens: {0}".format(total_raw_tokens))
print("total clean tokens: {0}".format(total_tfidf_tokens))
if total_raw_tokens:
    # True division; on Python 2 this relies on `from __future__ import division`.
    print("{0:.0f}% reduction".format(100 - total_tfidf_tokens / total_raw_tokens * 100))
else:
    # An empty corpus would otherwise raise ZeroDivisionError.
    print("no raw tokens; reduction undefined")

total raw tokens: 5056
total clean tokens: 4762
6% reduction


### Save tf-idf tokens

In [59]:
def writeFile(path, text):
    """Write `text` followed by a single trailing space to `path`.

    The file is opened in "w" mode, so any existing content is replaced.
    A `with` block guarantees the handle is closed even if the write raises.

    Args:
        path: Destination file path.
        text: String to write.
    """
    with open(path, "w") as text_file:
        text_file.write(text + " ")

In [60]:
# Write each filtered lyric to ./corpus/lyrics/tfidf/<filename>.
# Index labels are paired with values through zip() instead of
# Series.iteritems(), which was removed in pandas 2.0; zip() gives the same
# (label, value) pairs on every pandas version.
# NOTE(review): open(..., "w") does not create directories — assumes
# ./corpus/lyrics/tfidf/ already exists.
for filen, text in zip(lyrics_tfidf_by_file.index, lyrics_tfidf_by_file):
    writeFile("./corpus/lyrics/tfidf/" + filen, text)