### Step 4 - Stemming
Reduce the cleaned tokens to their stems. We'll later get the frequency distribution of the stems.

In [1]:
from __future__ import division
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk
import re
import pandas as pd
from pandas import Series
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer instance; the stem() helper defined in a later
# cell calls stemmer.stem() on each token.
stemmer = SnowballStemmer("english")
# Output file name. NOTE(review): not referenced by any cell visible in this
# section -- confirm whether it is still needed.
outputFile = "lyrics_stems.txt"

In [2]:
def stem(tokens):
    """Stem every token that contains at least one ASCII letter.

    Parameters
    ----------
    tokens : iterable of bytes or text
        Tokens to process. Byte strings are decoded as UTF-8; tokens that are
        already text are used unchanged.

    Returns
    -------
    list
        The result of the module-level ``stemmer.stem`` for each kept token,
        in input order. Tokens with no ASCII letter (e.g., numeric tokens,
        raw punctuation, empty strings) are dropped.
    """
    filtered_tokens = []
    for token in tokens:
        # Decode raw bytes only. Calling .decode() on text that is already
        # unicode forces an implicit ASCII encode on Python 2 (which raises on
        # non-ASCII lyrics) and is not available at all on Python 3.
        tk = token.decode('utf8') if isinstance(token, bytes) else token
        # filter out any tokens not containing letters
        if re.search('[a-zA-Z]', tk):
            filtered_tokens.append(tk)
    return [stemmer.stem(t) for t in filtered_tokens]

In [3]:
# Corpus reader over every .txt file under ./corpus/lyrics/tfidf/, read as UTF-8.
# Its fileids() drive the later cells, which open files of the same name under
# ./corpus/lyrics/tokenized/ -- so both directories must contain matching files.
# NOTE(review): cat_pattern presumably derives the category from the file id;
# confirm against the NLTK reader documentation.
corpus = CategorizedPlaintextCorpusReader("./corpus/lyrics/tfidf/", r".*\.txt", cat_pattern=r"(\w+)/*", encoding="utf8")

In [4]:
# Collect the raw (undecoded) contents of each tokenized lyrics file, in the
# order the corpus reader lists its file ids.
lyrics = []
for filen in corpus.fileids():
    tokenized_path = "./corpus/lyrics/tokenized/" + filen
    with open(tokenized_path, 'rb') as handle:
        raw_lyrics = handle.read()
    lyrics.append(raw_lyrics)

# Key the loaded lyrics by filename so each song can be looked up by its id.
lyrics_by_file = Series(data=lyrics, index=corpus.fileids())
lyrics_by_file.head()

04CpzA2BdOLEz7EMp5uwTU.txt    baby days sun nights rain summer simple plain ...
07GilNHSfS5oicUgHgU7VG.txt    man live hype real ting pull lexus' like hit l...
08zJpaUQVi9FrKv2e32Bah.txt    one goes sides world wide let playa ass nigga ...
0AAyo5bMIVgVxv9DF6XV2B.txt    little chilly stand beyond let friends friends...
0GSrlvOgiCSpBLeAukOIjT.txt    panda fam trust niggas jam hustle man want cru...
dtype: object

### Stem
We'll stem using the Snowball stemmer.

In [5]:
# Stem each song's lyrics. Iterating the Series directly yields its values in
# index order, which avoids Series.iteritems() (removed in pandas 2.0) and does
# not rebind the `lyrics` list built when the files were loaded.
lyrics_stemmed = [
    ' '.join(stem(song_lyrics.split(' ')))
    for song_lyrics in lyrics_by_file
]

# create a Series to index stemmed lyrics by filename; reusing the source index
# keeps every stemmed entry aligned with the file it came from
lyrics_stemmed_by_file = Series(lyrics_stemmed, index=lyrics_by_file.index)
lyrics_stemmed_by_file.head()

04CpzA2BdOLEz7EMp5uwTU.txt    babi day sun night rain summer simpl plain fou...
07GilNHSfS5oicUgHgU7VG.txt    man live hype real ting pull lexus like hit li...
08zJpaUQVi9FrKv2e32Bah.txt    one goe side world wide let playa ass nigga th...
0AAyo5bMIVgVxv9DF6XV2B.txt    littl chilli stand beyond let friend friend an...
0GSrlvOgiCSpBLeAukOIjT.txt    panda fam trust nigga jam hustl man want crush...
dtype: object

In [6]:
def writeFile(path, text):
    """Write ``text`` followed by a single trailing space to ``path``.

    Any existing file at ``path`` is overwritten.

    Parameters
    ----------
    path : str
        Destination file path.
    text : bytes or text
        Content to write. Byte strings are written unchanged; text is encoded
        as UTF-8 first, so the bytes on disk do not depend on the platform's
        default encoding.
    """
    if not isinstance(text, bytes):
        text = text.encode("utf8")
    # Binary mode accepts the already-encoded bytes on both Python 2 and 3, and
    # the with-block guarantees the handle is closed even if the write fails.
    with open(path, "wb") as text_file:
        text_file.write(text + b" ")

In [7]:
# Write each song's stemmed lyrics, UTF-8 encoded, into the stemmed corpus
# directory under its original filename. zip() over the index and the values
# replaces Series.iteritems(), which was removed in pandas 2.0, and the loop
# variable no longer rebinds the `lyrics` name used when loading the files.
for filen, stemmed_lyrics in zip(lyrics_stemmed_by_file.index, lyrics_stemmed_by_file):
    writeFile("./corpus/lyrics/stemmed/" + filen, stemmed_lyrics.encode("utf8"))