In [1]:
import unicodedata
import re
import json
import nltk

import pandas as pd

import acquire

In [2]:
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

### Exercises

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

1) Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.

In [3]:
# Sample text containing accented characters, apostrophes, and punctuation.
string = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [4]:
string

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [5]:
lower_string = string.lower()
lower_string

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [6]:
# NFKD splits accented letters into base letter + combining mark; the ASCII
# round-trip then drops everything that is not plain ASCII.
normal_string = (
    unicodedata.normalize('NFKD', lower_string)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)

normal_string

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [7]:
# Turn apostrophes into spaces first, then delete every character that is not
# a-z, 0-9, or whitespace.
normal_no_chars_string = re.sub(r'[^a-z0-9\s]', '', normal_string.replace("'", " "))

normal_no_chars_string

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos s name contains the hungarian letter  o   o  with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

In [8]:
def basic_clean(string):
    """
    Lowercase a string, fold it to ASCII, and strip punctuation.

    Apostrophes are converted to spaces before filtering, so the result
    contains only the characters a-z, 0-9, and whitespace.

    NOTE(review): the exercise text asks to keep single quotes; this
    implementation replaces them with spaces -- confirm that is intended.
    """
    # NFKD decomposition separates accents from their base letters; encoding
    # to ASCII with 'ignore' then discards the non-ASCII pieces.
    decomposed = unicodedata.normalize('NFKD', string.lower())
    ascii_text = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    without_quotes = ascii_text.replace("'", " ")

    return re.sub(r'[^a-z0-9\s]', '', without_quotes)

In [9]:
string

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [10]:
clean_string = basic_clean(string)

clean_string

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos s name contains the hungarian letter  o   o  with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

2) Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [11]:
ttt = ToktokTokenizer()

In [12]:
ttt.tokenize(clean_string, return_str=True)

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos s name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

In [13]:
def tokenize(string):
    """Tokenize a string with a ToktokTokenizer and return the result as a single string."""
    tokenizer = ToktokTokenizer()
    tokenized = tokenizer.tokenize(string, return_str=True)
    return tokenized

In [14]:
tokenize(clean_string)

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos s name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

3) Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [15]:
# Stem each whitespace-separated token of the cleaned text, then rejoin.
ps = nltk.porter.PorterStemmer()

stems = list(map(ps.stem, clean_string.split()))
stemmed_string = ' '.join(stems)

In [16]:
stemmed_string

'paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo s name contain the hungarian letter o o with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess'

In [17]:
def stem(string):
    """
    Apply Porter stemming to every whitespace-separated word in a string.

    Returns the stemmed words joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, string.split()))

In [18]:
stem(clean_string)

'paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo s name contain the hungarian letter o o with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess'

4) Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [19]:
wnl = nltk.stem.WordNetLemmatizer()

In [20]:
# Lemmatize each whitespace-separated token of the cleaned text, then rejoin.
lemmas = list(map(wnl.lemmatize, clean_string.split()))
lemmed_string = ' '.join(lemmas)

lemmed_string

'paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos s name contains the hungarian letter o o with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity'

In [21]:
def lemmatize(string):
    """
    Apply WordNet lemmatization to every whitespace-separated word in a string.

    Returns the lemmatized words joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [22]:
lemmed_string = lemmatize(clean_string)

In [23]:
lemmed_string

'paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos s name contains the hungarian letter o o with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity'

5) Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we *don't* want to remove.

In [24]:
stopword_list = stopwords.words('english')

stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [25]:
# Words to drop from, and to append to, the working stopword list.
to_remove, to_add = [], []

stopword_list = list(filter(lambda candidate: candidate not in to_remove, stopword_list))
stopword_list.extend(to_add)

stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [26]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a whitespace-delimited string.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the standard English stopword list so that
        they are kept in the output.

    Returns
    -------
    str
        The words that are not stopwords, joined by single spaces.
    """
    # None sentinels replace the former mutable `[]` defaults.
    extra_words = () if extra_words is None else extra_words
    exclude_words = () if exclude_words is None else exclude_words

    # Exclusions are applied before additions, so a word present in both
    # ends up being treated as a stopword. A set keeps lookups O(1) per word.
    stopword_set = set(stopwords.words('english')) - set(exclude_words)
    stopword_set.update(extra_words)

    return ' '.join(word for word in string.split() if word not in stopword_set)

In [27]:
# Sanity check of the optional parameters: treat 'influential' and 'incorrectly'
# as extra stopwords and keep 'by'. Demonstration only; not applied to the actual string.

remove_stopwords(lemmed_string, ['influential', 'incorrectly'], ['by'])

'paul erdos george polya hungarian mathematician contributed lot field erdos name contains hungarian letter double acute accent often written erdos erdos either by mistake typographical necessity'

In [28]:
parsed_string = remove_stopwords(lemmed_string)

parsed_string

'paul erdos george polya influential hungarian mathematician contributed lot field erdos name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity'

6) Use your data from the acquire exercise to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [29]:
news_df = pd.DataFrame(acquire.get_news_articles())

news_df

Unnamed: 0,title,content,category
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,business
3,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,business
4,"Grab lays off over 1,000 employees",Singapore-based ride-hailing and food delivery...,business
...,...,...,...
95,Richard Gere attends Yoga event led by PM Modi...,Actor Richard Gere was in attendance at the Yo...,entertainment
96,"Parents said 'Even cats, dogs are on TV, when ...","Nawazuddin Siddiqui, while talking about his r...",entertainment
97,"Thought kids at school would laugh at me, said...",Jugal Hansraj said that he had initially rejec...,entertainment
98,"Asked Alia 'What is it that H'wood has', she s...",Filmmaker Mahesh Bhatt said he once asked Alia...,entertainment


In [30]:
news_df.category.value_counts()

business         25
sports           25
technology       25
entertainment    25
Name: category, dtype: int64

7) Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [31]:
# Codeup blog posts to fetch.
urls = [
    'https://codeup.com/featured/apida-heritage-month/',
    'https://codeup.com/featured/women-in-tech-panelist-spotlight/',
    'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-stephanie-jones/',
    'https://codeup.com/codeup-news/codeup-best-bootcamps/',
    'https://codeup.com/employers/hiring-tech-talent/',
]

In [32]:
codeup_df = pd.DataFrame(acquire.get_blog_articles(urls))

codeup_df

Unnamed: 0,url,title,content
0,https://codeup.com/featured/apida-heritage-month/,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American ...
1,https://codeup.com/featured/women-in-tech-pane...,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena...
2,https://codeup.com/events/black-excellence-in-...,Black excellence in tech: Panelist Spotlight -...,Black excellence in tech: Panelist Spotlight ...
3,https://codeup.com/codeup-news/codeup-best-boo...,Codeup Among Top 58 Best Coding Bootcamps of 2...,Codeup is pleased to announce we have been ra...
4,https://codeup.com/employers/hiring-tech-talent/,Hiring Tech Talent Around the Holidays - Codeup,Are you a hiring manager having trouble filli...


8) For each dataframe, produce the following columns:

- `title` to hold the title
- `original` to hold the original article/post content
- `clean` to hold the normalized and tokenized original with the stopwords removed.
- `stemmed` to hold the stemmed version of the cleaned data.
- `lemmatized` to hold the lemmatized version of the cleaned data.

In [33]:
# Start from the title and raw content, then derive each prepared column.
news_df_nlp = pd.DataFrame({
    'title': news_df['title'],
    'original': news_df['content'],
})
news_df_nlp['clean'] = news_df_nlp['original'].apply(
    lambda text: remove_stopwords(tokenize(basic_clean(text)))
)
news_df_nlp['stemmed'] = news_df_nlp['clean'].apply(stem)
news_df_nlp['lemmatized'] = news_df_nlp['clean'].apply(lemmatize)

news_df_nlp

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,benchmark indices sensex nifty ended record cl...,benchmark indic sensex nifti end record close ...,benchmark index sensex nifty ended record clos...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,us federal trade commission ftc sued amazon ac...,us feder trade commiss ftc su amazon accus tri...,u federal trade commission ftc sued amazon acc...
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,time magazine released annual list world 100 i...,time magazin releas annual list world 100 infl...,time magazine released annual list world 100 i...
3,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,singapore airlines world best airline accordin...,singapor airlin world best airlin accord skytr...,singapore airline world best airline according...
4,"Grab lays off over 1,000 employees",Singapore-based ride-hailing and food delivery...,singaporebased ridehailing food delivery app g...,singaporebas ridehail food deliveri app grab l...,singaporebased ridehailing food delivery app g...
...,...,...,...,...,...
95,Richard Gere attends Yoga event led by PM Modi...,Actor Richard Gere was in attendance at the Yo...,actor richard gere attendance yoga event led p...,actor richard gere attend yoga event led pm na...,actor richard gere attendance yoga event led p...
96,"Parents said 'Even cats, dogs are on TV, when ...","Nawazuddin Siddiqui, while talking about his r...",nawazuddin siddiqui talking role junior artist...,nawazuddin siddiqui talk role junior artist ti...,nawazuddin siddiqui talking role junior artist...
97,"Thought kids at school would laugh at me, said...",Jugal Hansraj said that he had initially rejec...,jugal hansraj said initially rejected shekhar ...,jugal hansraj said initi reject shekhar kapur ...,jugal hansraj said initially rejected shekhar ...
98,"Asked Alia 'What is it that H'wood has', she s...",Filmmaker Mahesh Bhatt said he once asked Alia...,filmmaker mahesh bhatt said asked alia bhatt h...,filmmak mahesh bhatt said ask alia bhatt holly...,filmmaker mahesh bhatt said asked alia bhatt h...


In [34]:
# Start from the title and raw content, then derive each prepared column.
codeup_df_nlp = pd.DataFrame({
    'title': codeup_df['title'],
    'original': codeup_df['content'],
})
codeup_df_nlp['clean'] = codeup_df_nlp['original'].apply(
    lambda text: remove_stopwords(tokenize(basic_clean(text)))
)
codeup_df_nlp['stemmed'] = codeup_df_nlp['clean'].apply(stem)
codeup_df_nlp['lemmatized'] = codeup_df_nlp['clean'].apply(lemmatize)

codeup_df_nlp

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American ...,may traditionally known asian american pacific...,may tradit known asian american pacif island a...,may traditionally known asian american pacific...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena...,women tech panelist spotlight magdalena rahn c...,women tech panelist spotlight magdalena rahn c...,woman tech panelist spotlight magdalena rahn c...
2,Black excellence in tech: Panelist Spotlight -...,Black excellence in tech: Panelist Spotlight ...,black excellence tech panelist spotlight steph...,black excel tech panelist spotlight stephani j...,black excellence tech panelist spotlight steph...
3,Codeup Among Top 58 Best Coding Bootcamps of 2...,Codeup is pleased to announce we have been ra...,codeup pleased announce ranked among 58 best c...,codeup pleas announc rank among 58 best code b...,codeup pleased announce ranked among 58 best c...
4,Hiring Tech Talent Around the Holidays - Codeup,Are you a hiring manager having trouble filli...,hiring manager trouble filling position around...,hire manag troubl fill posit around holiday co...,hiring manager trouble filling position around...


9) Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? Lemmatized text.
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text? Lemmatized text.
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? Stemmed text. Only at this scale, given the sheer volume of data, would I use stemming.