In [1]:
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import pandas as pd
import acquire

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

 - Lowercase everything
 - Normalize unicode characters
 - Remove anything that is not a letter, number, whitespace, or a single quote.

In [2]:
def basic_clean(string):
    """
    Apply basic text normalization to a string.

    The text is lowercased, decomposed with Unicode NFKD normalization and
    reduced to ASCII (characters with no ASCII form are dropped), and then
    every character that is not a lowercase letter, a digit, a single quote,
    or whitespace is removed.

    Returns the cleaned string.
    """
    lowered = string.lower()
    decomposed = unicodedata.normalize('NFKD', lowered)
    # round-trip through ASCII bytes to discard anything non-ASCII
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    # keep only letters, digits, single quotes, and whitespace
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(string):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the result is a single
    string rather than a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    tokenized = toktok.tokenize(string, return_str=True)
    return tokenized

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [4]:
def stem(text):
    """
    Apply Porter stemming to every whitespace-separated word in ``text``.

    Parameters:
        text: the string to stem.

    Returns:
        A string made of the stemmed words joined by single spaces.
    """
    # Use the documented nltk.stem namespace (the same one lemmatize uses)
    # instead of the undocumented top-level ``nltk.porter`` alias.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(text):
    """
    Lemmatize every whitespace-separated word in ``text`` with NLTK's
    WordNetLemmatizer and return the lemmas joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a string.

    Parameters:
        string: the text to filter.
        extra_words: optional iterable of additional words to treat as
            stopwords. Defaults to none.
        exclude_words: optional iterable of words to take out of the
            stopword list so they are kept in the text. Defaults to none.

    Returns:
        The remaining words joined by single spaces.
    """
    # None sentinels replace the former mutable [] defaults, so no list
    # object is shared between calls.
    extra = set() if extra_words is None else set(extra_words)
    excluded = set() if exclude_words is None else set(exclude_words)

    # tokenize() is called first; its result is then split on whitespace
    words = tokenize(string).split()

    # NLTK's English stopwords, minus the exclusions, plus the extras
    stopword_set = (set(stopwords.words('english')) - excluded) | extra

    return " ".join(w for w in words if w not in stopword_set)

6. Define a function named prep_article that takes in the dictionary representing an article and returns a dictionary that looks like this:
 - {
    'title': 'the original title',
    'original': original,
    'stemmed': article_stemmed,
    'lemmatized': article_lemmatized,
    'clean': article_without_stopwords
}

In [7]:
def prep_articles(df, text_col="body"):
    """
    Build the prepared-text columns for a DataFrame of articles.

    Parameters:
        df: DataFrame holding the article text in the column ``text_col``.
        text_col: name of the column containing the raw article text.
            Defaults to "body".

    Returns:
        A new DataFrame in which ``text_col`` is dropped and these columns
        are set:
          - original:   the untouched text
          - stemmed:    basic_clean followed by stem
          - lemmatized: basic_clean followed by lemmatize
          - clean:      basic_clean followed by remove_stopwords

    The input frame is left unmodified, so calling this function again on
    the same frame gives the same result.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """
    original = df[text_col]
    # run the basic cleaning once and reuse it for all three derived columns
    cleaned = original.apply(basic_clean)
    return df.drop(columns=[text_col]).assign(
        original=original,
        stemmed=cleaned.apply(stem),
        lemmatized=cleaned.apply(lemmatize),
        clean=cleaned.apply(remove_stopwords),
    )

7. Define a function named prepare_article_data that takes in the list of article dictionaries, applies the prep_article function to each one, and returns the transformed data.

In [8]:
def prep_blog_articles():
    """
    Acquire the blog articles and return them run through prep_articles.

    The blog frame displayed in this notebook has its text in a "content"
    column, while prep_articles reads "body" by default, so the column is
    renamed before delegating.
    """
    df = acquire.get_blog_articles()
    # only rename when it cannot collide with an existing "body" column
    if "body" not in df.columns and "content" in df.columns:
        df = df.rename(columns={"content": "body"})
    return prep_articles(df)

In [9]:
# Load the blog articles (not yet run through any of the prep functions).
df = acquire.get_blog_articles()

In [10]:
# Bare expression as the cell's last line so the notebook renders the frame.
df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie GiustData Scien...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri AntoniouA week ago, Codeup launched..."
3,10 Tips to Crush It at the SA Tech Job Fair,10 Tips to Crush It at the SA Tech Job FairSA ...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...
