### Step 2 - Create tf-idf scores
Create a JSON file with the tf-idf score of each word in each document of the corpus.

WARNING: This process can take several hours.

In [1]:
from __future__ import division
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import pandas as pd
from pandas import Series
from nltk.text import TextCollection

In [2]:
# Corpus reader over the tokenized lyrics directory: every file matching
# .*\.txt is a document, and cat_pattern is the regex applied to the file id
# to obtain its category.
corpus = CategorizedPlaintextCorpusReader(
    "./corpus/lyrics/tokenized/",
    r".*\.txt",
    cat_pattern=r"(\w+)/*",
)

In [3]:
# Read the raw contents of every corpus file, in corpus.fileids() order, so
# that lyrics[i] is the text of corpus.fileids()[i].
lyrics = []
for filen in corpus.fileids():
    lyrics_path = "./corpus/lyrics/tokenized/" + filen
    with open(lyrics_path, 'rb') as handle:
        raw_lyrics = handle.read()
    lyrics.append(raw_lyrics)

In [4]:
import collections
from nltk import Text

def compute_tfidf_by_doc(coll, texts, filenames):
    """Collect the non-zero tf-idf scores of the words in each document.

    Args:
        coll: Object exposing ``tf_idf(word, text)``; the notebook passes an
            ``nltk.text.TextCollection``.
        texts: Sequence of documents, each a string of words separated by
            single spaces.
        filenames: Sequence parallel to ``texts``; ``filenames[i]`` is the key
            under which the scores of ``texts[i]`` are stored.

    Returns:
        A ``collections.defaultdict(list)`` mapping a filename to a list of
        ``{"word": ..., "tfidf": ...}`` dicts, one per distinct word of that
        document whose score is truthy (non-zero). A document with no scored
        word gets no key. Entries within a list are in set-iteration order,
        i.e. unspecified.

    Only the tf-idf score is computed. The raw term frequency and the
    occurrence count were previously calculated for every word but never
    stored, which was pure overhead in the innermost loop.
    """
    tfidf_by_doc = collections.defaultdict(list)
    for i, text in enumerate(texts):
        # set(): score each distinct word of this document only once.
        for word in set(text.split(" ")):
            # Splitting on " " produces "" for leading/trailing/double spaces
            # and for an empty document. "" is not a word, and scoring it
            # against an empty document would presumably divide by zero
            # (tf is count / len(text)), so skip it.
            if not word:
                continue
            tfidfscore = coll.tf_idf(word, text)
            if tfidfscore:
                tfidf_by_doc[filenames[i]].append({
                    "word": word,
                    "tfidf": tfidfscore
                })
    return tfidf_by_doc

In [5]:
# Build the NLTK TextCollection over the whole corpus. Its tf_idf() method is
# what the cells below use to score the words of each document.
collection = TextCollection(corpus)

In [6]:
# Use the first set of lyrics to sanity-check the idf and tf_idf scores.
# Expected low tf_idf score: a word like 'yeah' that appears frequently in all songs.
# Expected high tf_idf score: a word like 'feel' that is not frequent across all songs
# but occurs a lot within one specific song.
lyrics[0]

'telling everyone town treat right even say stay away come home night losing got thanks lot thanks lot seems like hurt know good give anything anything want really got spot thanks lot thanks lot still would forgive let glad could try seems get looks win matter always love never know hurts see calling things know thanks lot thanks lot matter always love never know hurts see say happy know thanks lot thanks lot thanks lot '

In [13]:
# Spot check: 'love' appears only twice in lyrics[0].
collection.tf_idf("love", lyrics[0])

0.00511491806312874

In [14]:
# Spot check: 'know' appears several times in lyrics[0].
collection.tf_idf("know", lyrics[0])

0.01063219401368527

In [15]:
# Spot check: 'thanks' is repeated throughout lyrics[0] (it is the song's hook).
collection.tf_idf("thanks", lyrics[0])

0.08675611582778127

In [10]:
# Build the per-document tf-idf dictionary: the filename is the key and the value
# is a list with one entry per scored word. Each entry holds only 'word' and 'tfidf'.
# Example shape:
# defaultdict(list,
#     {'04CpzA2BdOLEz7EMp5uwTU.txt': [
#         {'tfidf': 0.01568775555470966, 'word': 'summer'},
#         {'tfidf': 0.01999301754576522, 'word': 'nights'},
#         ...]})
%time tfidfs = compute_tfidf_by_doc(collection, lyrics, corpus.fileids())

CPU times: user 5.61 s, sys: 902 ms, total: 6.51 s
Wall time: 6.93 s


In [11]:
# Number of documents (filenames) that ended up with at least one scored word.
len(tfidfs)

59

### Save tfidf json

In [12]:
import json
# Serialize the whole filename -> word-score mapping to a JSON string first,
# then write it to ./corpus/lyrics_tfidf.json. Serializing before open() means
# the output file is not touched if json.dumps raises.
jsonified = json.dumps(tfidfs)
with open('./corpus/lyrics_tfidf.json', 'w') as handle:
    handle.write(jsonified)