In [2]:
import re
import unicodedata
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import pandas as pd

import nlp_acquire

In [3]:
# Load the Codeup blog posts and the Inshorts news articles through the
# project-local nlp_acquire helpers.
# NOTE(review): the "Found File" output suggests a cached copy is read — confirm in nlp_acquire.
scrape_codeup_content = nlp_acquire.read_url_or_file_codeup()
scrape_inshort_content = nlp_acquire.read_url_or_file_inshort()

Found File
Found File


In [4]:
# Wrap the raw records in DataFrames for inspection; only the Codeup frame
# is displayed as this cell's output.
df_codeup = pd.DataFrame(scrape_codeup_content)
df_inshort = pd.DataFrame(scrape_inshort_content)
df_codeup

Unnamed: 0,title,content
0,Is a Career in Tech Recession-Proof?,"Given the current economic climate, many econo..."
1,Codeup X Superhero Car Show & Comic Con,Codeup had a blast at the San Antonio Superher...
2,What Jobs Can You Get After a Coding Bootcamp?...,If you’re considering a career in web developm...
3,Codeup’s New Dallas Campus,Codeup’s Dallas campus has a new location! For...
4,Codeup TV Commercial,Codeup has officially made its TV debut! Our c...
5,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...


### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.

In [5]:
def make_lower(content_list):
    """Return new records with the 'title' and 'content' text lowercased.

    Args:
        content_list: iterable of mappings that each provide string values
            under the 'title' and 'content' keys.

    Returns:
        A new list of dicts holding only 'title' and 'content'; any other
        keys on the input records are not carried over, and the input
        records themselves are not modified.
    """
    return [
        {field: record[field].lower() for field in ('title', 'content')}
        for record in content_list
    ]

def make_no_special_chars(content_list):
    """Return new records with special characters stripped from the text.

    Every character that is not an ASCII letter, a digit, whitespace, or a
    single quote is removed from the 'title' and 'content' values. Single
    quotes are kept so contractions such as "don't" survive cleaning.

    Args:
        content_list: iterable of mappings that each provide string values
            under the 'title' and 'content' keys.

    Returns:
        A new list of dicts holding only the cleaned 'title' and 'content'.
    """
    # The apostrophe is part of the allowed set; everything outside it is dropped.
    disallowed = re.compile(r"[^a-zA-Z0-9'\s]")
    return [
        {
            'title': disallowed.sub('', record['title']),
            'content': disallowed.sub('', record['content']),
        }
        for record in content_list
    ]

def make_normal(content_list):
    """Return new records with 'title' and 'content' reduced to ASCII.

    Text is NFKD-normalized, then encoded to ASCII with unencodable
    characters ignored, so accented letters keep their base letter and
    characters with no ASCII form are dropped.

    Args:
        content_list: iterable of mappings that each provide string values
            under the 'title' and 'content' keys.

    Returns:
        A new list of dicts holding only the normalized 'title' and 'content'.
    """
    def _to_ascii(text):
        decomposed = unicodedata.normalize('NFKD', text)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    normalized = []
    for record in content_list:
        normalized.append({
            'title': _to_ascii(record['title']),
            'content': _to_ascii(record['content']),
        })
    return normalized

def basic_clean(content_list):
    """Apply the basic cleaning steps to every record's title and content.

    The steps run in this order: ASCII normalization, special-character
    removal, lowercasing. Normalization goes first so accented letters are
    reduced to their ASCII base letters before the ASCII-only filter runs.

    Args:
        content_list: iterable of mappings with 'title' and 'content' strings.

    Returns:
        A new list of dicts containing the cleaned 'title' and 'content'.
    """
    return make_lower(make_no_special_chars(make_normal(content_list)))

In [6]:
# Rebind both names to their cleaned versions. basic_clean builds new dicts
# that keep only the 'title' and 'content' keys.
scrape_codeup_content = basic_clean(scrape_codeup_content)
scrape_inshort_content = basic_clean(scrape_inshort_content)

#### make a function to tokenize; take in a string and tokenize the words

In [7]:
def tokenize(s):
    """Tokenize the string `s` with NLTK's ToktokTokenizer.

    Args:
        s: the text to tokenize.

    Returns:
        Whatever ToktokTokenizer.tokenize returns for `s` (a list of token
        strings, as the example cell in this notebook shows).
    """
    return ToktokTokenizer().tokenize(s)

def mass_tokenize(content_list):
    """Tokenize each record's 'content' and store the tokens under 'clean'.

    The records are modified in place; the same `content_list` object is
    returned so the call can be chained.
    """
    for record in content_list:
        tokens = tokenize(record['content'])
        record['clean'] = tokens
    return content_list


In [8]:
# Quick sanity check of tokenize on a simple space-separated string.
tokenize('one two three string')

['one', 'two', 'three', 'string']

In [9]:
# Add a 'clean' token list to every record (mass_tokenize edits the records in place).
scrape_codeup_content = mass_tokenize(scrape_codeup_content)
scrape_inshort_content = mass_tokenize(scrape_inshort_content)

#### make a function to stem; accept string/text and return text after stemming the words

In [10]:
def stem(s):
    """Return the Porter stem of every item in `s`.

    Args:
        s: iterable of words. Note that passing a plain string would
            iterate over its characters, so pass a list of tokens.

    Returns:
        A list with one stemmed entry per input item, in order.
    """
    stemmer = nltk.porter.PorterStemmer()
    return list(map(stemmer.stem, s))

def mass_stem(content_list):
    """Stem each record's 'clean' tokens and store the result under 'stemmed'.

    The records are modified in place; the same `content_list` object is
    returned.
    """
    for record in content_list:
        stems = stem(record['clean'])
        record['stemmed'] = stems
    return content_list

In [11]:
# Add a 'stemmed' list, derived from each record's 'clean' tokens.
scrape_codeup_content = mass_stem(scrape_codeup_content)
scrape_inshort_content = mass_stem(scrape_inshort_content)

#### make a function to lemmatize; take a text and return lemmatized text

In [12]:
# One-time setup, left commented out: uncomment and run once if the NLTK
# data used below (WordNet lemmatizer, stopwords corpus) is not installed.
# import nltk
# nltk.download('all')

In [13]:
def lemmatize(s):
    """Return the WordNet lemma of every item in `s`.

    Args:
        s: iterable of words. Note that passing a plain string would
            iterate over its characters, so pass a list of tokens.

    Returns:
        A list with one lemmatized entry per input item, in order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, s))

def mass_lemmatize(content_list):
    """Lemmatize each record's 'clean' tokens and store them under 'lemmatized'.

    The records are modified in place; the same `content_list` object is
    returned.
    """
    for record in content_list:
        lemmas = lemmatize(record['clean'])
        record['lemmatized'] = lemmas
    return content_list

In [14]:
# Add a 'lemmatized' list, derived from each record's 'clean' tokens.
scrape_codeup_content = mass_lemmatize(scrape_codeup_content)
scrape_inshort_content = mass_lemmatize(scrape_inshort_content)

#### Make a function to remove stopwords: accept text and return it with the stopwords removed. Use optional parameters to add extra words that should be treated as stopwords and to exclude words that should not be removed.

In [15]:
def remove_stopwords(s, extra_words = ['codeup'], exclude_words = ['shan']):
    """Return the items of `s` that are not stopwords.

    The stopword set starts from NLTK's English list, gains `extra_words`,
    and then loses `exclude_words`. A word present in both lists is
    therefore kept in the output.

    Args:
        s: iterable of tokens (strings) to filter.
        extra_words: additional words to treat as stopwords.
        exclude_words: words to take out of the stopword set so that they
            are never removed from `s`.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Build a fresh set on every call so the default-argument lists are never mutated.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    stopword_set.update(extra_words)
    # Previously exclude_words was accepted but ignored; apply it here.
    stopword_set.difference_update(exclude_words)
    return [word for word in s if word not in stopword_set]

def mass_remove_stopwords(content_list):
    """Filter stopwords out of each record's 'clean' tokens.

    Each record's 'clean' entry is overwritten in place with the filtered
    list (using remove_stopwords' default word lists); the same
    `content_list` object is returned.
    """
    for record in content_list:
        kept = remove_stopwords(record['clean'])
        record['clean'] = kept
    return content_list

In [16]:
# Drop stopwords from 'clean'. In this step-by-step walkthrough the
# 'stemmed' and 'lemmatized' lists were built earlier from the unfiltered
# tokens, so only 'clean' is affected by this cell.
scrape_codeup_content = mass_remove_stopwords(scrape_codeup_content)
scrape_inshort_content = mass_remove_stopwords(scrape_inshort_content)

#### Use the data from the acquire module to produce a DataFrame of news articles

In [17]:
# Raw (uncleaned) Inshorts articles as a DataFrame, loaded again via nlp_acquire.
news_df = pd.DataFrame(nlp_acquire.read_url_or_file_inshort())
news_df

Found File


Unnamed: 0,title,content,category
0,ED arrests former NSE CEO Ravi Narain in money...,The Enforcement Directorate has arrested Ravi ...,business
1,Musk's lawyer seeks to delay Twitter trial to ...,Tesla CEO Elon Musk's lawyer urged that the tr...,business
2,No sense to buy Twitter if we're heading into ...,"Amid their lawsuit, Twitter's lawyer revealed ...",business
3,ByteDance fires hundreds of employees from vid...,TikTok's parent company ByteDance has reported...,business
4,Are you kidding me: EaseMyTrip Co-founder shar...,EaseMyTrip Co-founder Prashant Pitti took to T...,business
...,...,...,...
93,"Didn't know much about Vicky, was won over whe...","Actress Katrina Kaif, during her appearance on...",entertainment
94,"Superhero films worldwide have high budgets, w...","Filmmaker Rakesh Roshan, speaking about 'Krris...",entertainment
95,Parents rejected films after 'Iqbal' for me to...,"Actress Shweta Basu Prasad, who featured as a ...",entertainment
96,Was planning to do an album with Bamba Bakya: ...,Composer-singer AR Rahman remembered late play...,entertainment


#### make a df with the blogposts

In [18]:
# Raw (uncleaned) Codeup blog posts as a DataFrame, loaded again via nlp_acquire.
codeup_df = pd.DataFrame(nlp_acquire.read_url_or_file_codeup())
codeup_df

Found File


Unnamed: 0,title,content
0,Is a Career in Tech Recession-Proof?,"Given the current economic climate, many econo..."
1,Codeup X Superhero Car Show & Comic Con,Codeup had a blast at the San Antonio Superher...
2,What Jobs Can You Get After a Coding Bootcamp?...,If you’re considering a career in web developm...
3,Codeup’s New Dallas Campus,Codeup’s Dallas campus has a new location! For...
4,Codeup TV Commercial,Codeup has officially made its TV debut! Our c...
5,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...


#### For each dataframe, produce the following columns:

1. `title` to hold the title, and `original` to hold the original article/post content.
2. clean to hold the normalized and tokenized original with the stopwords removed. 
3. stemmed to hold the stemmed version of the cleaned data. 
4. lemmatized to hold the lemmatized version of the cleaned data.

In [19]:
def make_dataframe(text_dict):
    """Run the full preparation pipeline and return the result as a DataFrame.

    The stages run in order: basic cleaning, tokenizing into 'clean',
    stopword removal on 'clean', then stemming and lemmatizing from the
    filtered 'clean' tokens.

    Args:
        text_dict: iterable of records with 'title' and 'content' strings.

    Returns:
        A pandas DataFrame built from the processed records.
    """
    pipeline = (
        basic_clean,
        mass_tokenize,
        mass_remove_stopwords,
        mass_stem,
        mass_lemmatize,
    )
    records = text_dict
    for stage in pipeline:
        records = stage(records)
    return pd.DataFrame(records)

In [20]:
# Build the prepared Codeup frame: cleaned title/content plus the clean,
# stemmed and lemmatized token columns.
codeup_df = make_dataframe(nlp_acquire.read_url_or_file_codeup())
codeup_df

Found File


Unnamed: 0,title,content,clean,stemmed,lemmatized
0,is a career in tech recessionproof,given the current economic climate many econom...,"[given, current, economic, climate, many, econ...","[given, current, econom, climat, mani, economi...","[given, current, economic, climate, many, econ..."
1,codeup x superhero car show comic con,codeup had a blast at the san antonio superher...,"[blast, san, antonio, superhero, car, show, co...","[blast, san, antonio, superhero, car, show, co...","[blast, san, antonio, superhero, car, show, co..."
2,what jobs can you get after a coding bootcamp ...,if youre considering a career in web developme...,"[youre, considering, career, web, development,...","[your, consid, career, web, develop, dont, kno...","[youre, considering, career, web, development,..."
3,codeups new dallas campus,codeups dallas campus has a new location for m...,"[codeups, dallas, campus, new, location, two, ...","[codeup, dalla, campu, new, locat, two, year, ...","[codeups, dallas, campus, new, location, two, ..."
4,codeup tv commercial,codeup has officially made its tv debut our co...,"[officially, made, tv, debut, community, stude...","[offici, made, tv, debut, commun, student, sta...","[officially, made, tv, debut, community, stude..."
5,what jobs can you get after a coding bootcamp ...,have you been considering a career in cloud ad...,"[considering, career, cloud, administration, i...","[consid, career, cloud, administr, idea, job, ...","[considering, career, cloud, administration, i..."


In [None]:
# Build the prepared news frame. The acquire helpers live in the module
# imported as `nlp_acquire`; there is no `acquire` name in this notebook.
news_df = make_dataframe(nlp_acquire.read_url_or_file_inshort())
news_df