In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string
from collections import Counter

In [2]:
# Load the uncleaned article DataFrame; later cells read its `text` column.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
data = pd.read_pickle('uncleaned-10k-articles.pkl')

In [3]:
def get_tokens(text):
    """Lowercase ``text``, blank out ASCII punctuation and tokenize it.

    Every character in ``string.punctuation`` is replaced by a single space
    (not deleted), so words joined by punctuation are split apart before the
    string is handed to ``nltk.word_tokenize``.

    Returns the list of tokens produced by the tokenizer.
    """
    # Equal-length maketrans form: each punctuation character maps to ' '.
    punct_to_space = str.maketrans(string.punctuation,
                                   ' ' * len(string.punctuation))
    return nltk.word_tokenize(text.lower().translate(punct_to_space))

# The stop word filtering below requires NLTK's stopwords corpus.
# To install it once, run:
#   import nltk
#   nltk.download('stopwords')
def remove_stop_words(tokens):
    """Drop English stop words and URL fragments from a sequence of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Matching against the stop word list is exact, so
        tokens should already be lowercased (as ``get_tokens`` does).

    Returns
    -------
    list of str
        The tokens, in their original order, that are not in
        ``stopwords.words('english')`` and do not contain the substring
        ``'http'``.
    """
    # Fetch the stop word list once per call and keep it in a set: evaluating
    # stopwords.words('english') inside the comprehension would reload the
    # list and scan it linearly for every single token.
    english_stop_words = set(stopwords.words('english'))
    return [w for w in tokens
            if w not in english_stop_words and 'http' not in w]

def stem(tokens):
    """Return the Porter stem of every token, preserving order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stemmed string per input token.
    """
    # Build the stemmer once and reuse it instead of constructing a new
    # PorterStemmer for every token.
    stemmer = PorterStemmer()
    return [stemmer.stem(item) for item in tokens]

def text_clean(text):
    """Run the full cleaning pipeline on one document.

    Steps, in order:
      1. lowercase ``text``;
      2. replace every character in ``string.punctuation`` with a space;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop tokens that are English stop words or contain ``'http'``;
      5. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The filtered, stemmed tokens in document order.
    """
    translator = str.maketrans({key: ' ' for key in string.punctuation})
    tokens = nltk.word_tokenize(text.lower().translate(translator))

    # Loop-invariant work hoisted out of the comprehension: the stop word
    # list is loaded once into a set (O(1) membership tests) and a single
    # stemmer is reused, instead of reloading the list and constructing a
    # new PorterStemmer for every token.
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Filtering is done on the unstemmed token, then the survivor is stemmed.
    return [stemmer.stem(w) for w in tokens
            if w not in english_stop_words and 'http' not in w]

In [4]:
# Sanity check of the cleaning helpers on a single article (the entry stored
# under key 1000 of `data.text`): print the 20 most common tokens after each
# stage so the effect of every step can be compared side by side.
# Stage 1: lowercased, punctuation blanked, tokenized.
tokens = get_tokens( data.text[1000])
# Stage 2: stop words and tokens containing 'http' removed.
filtered = remove_stop_words(tokens)
# Stage 3: the all-in-one pipeline (filter, then Porter-stem) on the same text.
filstemed = text_clean(data.text[1000])
cc = Counter(tokens)
print('unfiltered:\n',cc.most_common(20) )
cc2 = Counter(filtered)
print('\nfiltered:\n',cc2.most_common(20) )
cc3 = Counter(filstemed)
print('\nfiltered and then stemmed:\n',cc3.most_common(20) )

unfiltered:
 [('the', 111), ('firestone', 76), ('of', 53), ('in', 47), ('and', 40), ('to', 38), ('ref', 32), ('a', 29), ('tires', 24), ('company', 23), ('was', 23), ('they', 22), ('ford', 21), ('tire', 19), ('that', 19), ('http', 18), ('for', 15), ('on', 15), ('cite', 15), ('title', 15)]

filtered:
 [('firestone', 76), ('ref', 32), ('tires', 24), ('company', 23), ('ford', 21), ('tire', 19), ('cite', 15), ('title', 15), ('bridgestone', 14), ('url', 14), ('www', 13), ('name', 13), ('com', 13), ('rubber', 12), ('accessdate', 11), ('web', 11), ('cnn', 11), ('publisher', 9), ('2007', 9), ('said', 8)]

filtered and then stemmed:
 [('fireston', 76), ('tire', 43), ('ref', 32), ('compani', 24), ('ford', 21), ('titl', 15), ('cite', 15), ('url', 14), ('bridgeston', 14), ('www', 13), ('rubber', 13), ('com', 13), ('name', 13), ('accessd', 11), ('web', 11), ('cnn', 11), ('publish', 9), ('2007', 9), ('recal', 9), ('corpor', 9)]


In [5]:
# Run the full cleaning pipeline on every article and store the resulting
# token list in a new `clean_text` column. Takes a while: text_clean is
# applied to each row's text one at a time.
data.loc[:,'clean_text'] = data.text.apply(text_clean) 
# Save the augmented frame under a new file name, leaving the input pickle
# untouched.
data.to_pickle("cleaned-10k-articles.pkl")