<a href="https://colab.research.google.com/github/tcarlon94/Cap_3_News_Categorization/blob/main/Cap3_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News Category Automation NLP

#Preprocessing

# Import Modules

In [1]:
import numpy as np
import pandas as pd
import textblob as TextBlob
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Load Data

In [2]:
file_path = 'news_data.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,link,category,authors,date,description
0,https://www.huffpost.com/entry/covid-boosters-...,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,U.S. NEWS,Mary Papenfuss,2022-09-23,"American Airlines Flyer Charged, Banned For Li..."
2,https://www.huffpost.com/entry/funniest-tweets...,COMEDY,Elyse Wanshel,2022-09-23,23 Of The Funniest Tweets About Cats And Dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,PARENTING,Caroline Bologna,2022-09-23,The Funniest Tweets From Parents This Week (Se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,U.S. NEWS,Nina Golgowski,2022-09-22,Woman Who Called Cops On Black Bird-Watcher Lo...


In [3]:
#examine data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189802 entries, 0 to 189801
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   link         189802 non-null  object
 1   category     189802 non-null  object
 2   authors      156860 non-null  object
 3   date         189802 non-null  object
 4   description  189802 non-null  object
dtypes: object(5)
memory usage: 7.2+ MB


In [4]:
#confirm missing description
data[data['description'] == '']

Unnamed: 0,link,category,authors,date,description


# Lower Case Text

In [5]:
#lowercase description column
data['description'] = data['description'].str.lower()
data.head()

Unnamed: 0,link,category,authors,date,description
0,https://www.huffpost.com/entry/covid-boosters-...,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,over 4 million americans roll up sleeves for o...
1,https://www.huffpost.com/entry/american-airlin...,U.S. NEWS,Mary Papenfuss,2022-09-23,"american airlines flyer charged, banned for li..."
2,https://www.huffpost.com/entry/funniest-tweets...,COMEDY,Elyse Wanshel,2022-09-23,23 of the funniest tweets about cats and dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,PARENTING,Caroline Bologna,2022-09-23,the funniest tweets from parents this week (se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,U.S. NEWS,Nina Golgowski,2022-09-22,woman who called cops on black bird-watcher lo...


This will ensure consistency in words and reduce vocab size to help our model.

# Check for URLs/HTML tags

In [6]:
import re

# Check for HTML tags
html_tags = data['description'].str.contains(r'<.*?>', regex=True)
html_tags[html_tags == True]

Unnamed: 0,description


In [7]:
# Check for URL
url = data['description'].str.contains(r'http\S+|www.\S+', regex=True)
url[url == True]

Unnamed: 0,description
14384,True
16033,True
21359,True
23083,True
25327,True
...,...
187426,True
187769,True
187982,True
188384,True


In [8]:
# Remove URLs
data['description'] = data['description'].str.replace(r'http\S+|www.\S+', '', regex=True)
data['description'].str.contains(r'http\S+|www.\S+', regex=True).any()

np.False_

Now we have no html tags or urls in our text data

# Remove Punctuation

In [9]:
# Create punctuation variable from string
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Remove punctuation from description
data['description'] = data['description'].str.translate(str.maketrans('', '', punc))
data.head()

Unnamed: 0,link,category,authors,date,description
0,https://www.huffpost.com/entry/covid-boosters-...,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,over 4 million americans roll up sleeves for o...
1,https://www.huffpost.com/entry/american-airlin...,U.S. NEWS,Mary Papenfuss,2022-09-23,american airlines flyer charged banned for lif...
2,https://www.huffpost.com/entry/funniest-tweets...,COMEDY,Elyse Wanshel,2022-09-23,23 of the funniest tweets about cats and dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,PARENTING,Caroline Bologna,2022-09-23,the funniest tweets from parents this week sep...
4,https://www.huffpost.com/entry/amy-cooper-lose...,U.S. NEWS,Nina Golgowski,2022-09-22,woman who called cops on black birdwatcher los...


In [11]:
# Double check for punctuation
punc_pattern = r'[{}]'.format(punc)
data['description'].str.contains(punc_pattern).any()

np.False_

We've confirmed we removed punctuation from our text data. This will remove noise from our text data and make it cleaner for the model

# Handle ChatWords & StopWords

ChatWords would be internet slang (EX: LOL, TMI, etc.). This shouldn't be very prevalent in our data set as it is a news site but these may come in to play in some categories such as comedy, weird news, and others.

In [12]:
# Common ChatWords found in github repository https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [13]:
# Convert chat words to text:
def chat_word_conversion(text):
    """Convert chat words to text"""
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [14]:
# Convert chat words in description column
data['description'] = data['description'].apply(lambda x: chat_word_conversion(x))
data.head()

Unnamed: 0,link,category,authors,date,description
0,https://www.huffpost.com/entry/covid-boosters-...,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,over 4 million americans roll up sleeves for o...
1,https://www.huffpost.com/entry/american-airlin...,U.S. NEWS,Mary Papenfuss,2022-09-23,american airlines flyer charged banned for lif...
2,https://www.huffpost.com/entry/funniest-tweets...,COMEDY,Elyse Wanshel,2022-09-23,23 of the funniest tweets about cats and dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,PARENTING,Caroline Bologna,2022-09-23,the funniest tweets from parents this week sep...
4,https://www.huffpost.com/entry/amy-cooper-lose...,U.S. NEWS,Nina Golgowski,2022-09-22,woman who called cops on black birdwatcher los...


Now we will also handle StopWords like 'the', 'is', 'and', etc. These carry little meaning and removing them will reduce noise for our model

In [15]:
#download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

First, lets check for spanish text in our headlines

In [16]:
pip install langdetect



In [17]:
pip install pandarallel



In [18]:
from pandarallel import pandarallel
# Initialize paralell processing
pandarallel.initialize(progress_bar=True)

# Check for Spanish text in Latino Voices
from langdetect import detect
data['language'] = data['description'].parallel_apply(lambda x: detect(x))
spanish = data[data['language'] == 'es']

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=189802), Label(value='0 / 189802')…

In [19]:
# Check spanish descriptions
pd.set_option('display.max_colwidth', None)
spanish

Unnamed: 0,link,category,authors,date,description,language
11152,https://www.huffingtonpost.com/entry/venezuela-prison-fire_us_5accf59ae4b0e3074f651cb9,LATINO VOICES,"The Conversation US, Editorial Partner",2018-04-10,behind the scenes of venezuela’s deadly prison fire by rebecca hanson university of florida and leonard gómez núñez universidad nacional experimental de seguridad unes venezuela,es
41853,https://www.huffingtonpost.com/entry/as-us-closes-borders-thousands-of-haitian-refugees_us_58b58a79e4b0e5fdf61976e2,THE WORLDPOST,"The Conversation Global, ContributorThe Conversation is a collaboration between editors and academ...",2017-02-28,as us closes borders thousands of haitian refugees trapped in mexico lose hope ariadna estévez universidad nacional autónoma de méxico unam a united states federal court has blocked president donald,es
49893,https://www.huffingtonpost.com/entry/fidel-castro-life-in-photos_us_5839a286e4b000af95ee4e34,THE WORLDPOST,Jesselyn Cook,2016-11-26,explore cuban leader fidel castros controversial life in photos cubas longtime ruler leaves a divisive legacy,es
51313,https://www.huffingtonpost.com/entry/marco-rubio-florida-senate_us_5820dc23e4b0e80b02cbcee9,POLITICS,Zach Carter,2016-11-09,marco rubio survives in florida senate race hes not done just yet,es
52653,https://www.huffingtonpost.com/entry/youll-go-gaga-over-james-cordens-next-carpool-karaoke-guest_us_580efd18e4b02444efa509c0,ENTERTAINMENT,Rebecca Shapiro,2016-10-25,youll go gaga for james cordens latest carpool karaoke promo applause,es
54255,https://www.huffingtonpost.com/entry/2016-nobel-peace-prize_us_57f7649ce4b068ecb5dd997d,THE WORLDPOST,,2016-10-07,2016 nobel peace prize awarded to colombian president juan manuel santos he is the 2nd colombianborn nobel laureate after writer gabriel garcía márquez,es
64520,https://www.huffingtonpost.com/entry/logo-queer-trailblazer-honors_us_5758632ce4b0e39a28ac3fed,QUEER VOICES,James Michael Nichols,2016-06-08,logo to honor queer trailblazers happy pride,es
66972,https://www.huffingtonpost.com/entry/gluten-free-bread_us_5730da14e4b096e9f09230a4,TASTE,Michelle Persad,2016-05-10,8 glutenfree bread recipes you have to try no gluten no problem,es
75502,https://www.huffingtonpost.com/entry/barack-obama-serenades-hillarys-america-in-madame-president_us_56afb37be4b0b8d7c2301d3f,COMEDY,"Nadya Agrawal, Guest Writer",2016-02-01,barack obama serenades hillary clinton in parody endorsement video heres to you madame president,es
75552,https://www.huffingtonpost.com/entry/leonardo-dicaprio-vape-pen-sag-awards_us_56ae192ee4b00b033aaf6cf0,ENTERTAINMENT,Cole Delbyck,2016-01-31,meet your new favorite celebrity couple leonardo dicaprio and his vape pen leonardo divaprio,es


Looks like most of the spanish language is names of people and places. We'll translate this

In [20]:
pip install googletrans==4.0.0-rc1



In [21]:
from googletrans import Translator
translator = Translator()

In [22]:
# Translate spanish text
spanish['description'] = spanish['description'].parallel_apply(lambda x: translator.translate(x, dest='en').text)
spanish

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16), Label(value='0 / 16'))),))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spanish['description'] = spanish['description'].parallel_apply(lambda x: translator.translate(x, dest='en').text)


Unnamed: 0,link,category,authors,date,description,language
11152,https://www.huffingtonpost.com/entry/venezuela-prison-fire_us_5accf59ae4b0e3074f651cb9,LATINO VOICES,"The Conversation US, Editorial Partner",2018-04-10,Behind the Scenes of Venezuela’s Deadly Prison Fire By Rebecca Hanson University of Florida and Leonard Gómez Núñez National Experimental University of Security UNES Venezuela,es
41853,https://www.huffingtonpost.com/entry/as-us-closes-borders-thousands-of-haitian-refugees_us_58b58a79e4b0e5fdf61976e2,THE WORLDPOST,"The Conversation Global, ContributorThe Conversation is a collaboration between editors and academ...",2017-02-28,as us shorts borders those thousand of haitian refugees trapped in Mexico Lose Ariadna Esévez University of Nacional Autónoma de México Unam of the United States Federal Court Has Blocked President Donald,es
49893,https://www.huffingtonpost.com/entry/fidel-castro-life-in-photos_us_5839a286e4b000af95ee4e34,THE WORLDPOST,Jesselyn Cook,2016-11-26,explore cuban leader fidel castros controversial life in photos cubas longtime ruler leaves a divisive legacy,es
51313,https://www.huffingtonpost.com/entry/marco-rubio-florida-senate_us_5820dc23e4b0e80b02cbcee9,POLITICS,Zach Carter,2016-11-09,marco rubio survives in florida senate race hes not done just yet,es
52653,https://www.huffingtonpost.com/entry/youll-go-gaga-over-james-cordens-next-carpool-karaoke-guest_us_580efd18e4b02444efa509c0,ENTERTAINMENT,Rebecca Shapiro,2016-10-25,youll go gaga for james cordens latest carpool karaoke promo applause,es
54255,https://www.huffingtonpost.com/entry/2016-nobel-peace-prize_us_57f7649ce4b068ecb5dd997d,THE WORLDPOST,,2016-10-07,2016 nobel peace prize awarded to colombian president juan manuel santos he is the 2nd colombianborn nobel laureate after writer gabriel garcía márquez,es
64520,https://www.huffingtonpost.com/entry/logo-queer-trailblazer-honors_us_5758632ce4b0e39a28ac3fed,QUEER VOICES,James Michael Nichols,2016-06-08,logo to honor queer trailblazers happy pride,es
66972,https://www.huffingtonpost.com/entry/gluten-free-bread_us_5730da14e4b096e9f09230a4,TASTE,Michelle Persad,2016-05-10,8 glutenfree bread recipes you have to try no gluten no problem,es
75502,https://www.huffingtonpost.com/entry/barack-obama-serenades-hillarys-america-in-madame-president_us_56afb37be4b0b8d7c2301d3f,COMEDY,"Nadya Agrawal, Guest Writer",2016-02-01,barack obama serenades hillary clinton in parody endorsement video heres to you madame president,es
75552,https://www.huffingtonpost.com/entry/leonardo-dicaprio-vape-pen-sag-awards_us_56ae192ee4b00b033aaf6cf0,ENTERTAINMENT,Cole Delbyck,2016-01-31,meet your new favorite celebrity couple leonardo dicaprio and his vape pen leonardo divaprio,es


In [23]:
# Lower case the translated language
spanish['description'] = spanish['description'].str.lower()
spanish.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spanish['description'] = spanish['description'].str.lower()


Unnamed: 0,link,category,authors,date,description,language
11152,https://www.huffingtonpost.com/entry/venezuela-prison-fire_us_5accf59ae4b0e3074f651cb9,LATINO VOICES,"The Conversation US, Editorial Partner",2018-04-10,behind the scenes of venezuela’s deadly prison fire by rebecca hanson university of florida and leonard gómez núñez national experimental university of security unes venezuela,es
41853,https://www.huffingtonpost.com/entry/as-us-closes-borders-thousands-of-haitian-refugees_us_58b58a79e4b0e5fdf61976e2,THE WORLDPOST,"The Conversation Global, ContributorThe Conversation is a collaboration between editors and academ...",2017-02-28,as us shorts borders those thousand of haitian refugees trapped in mexico lose ariadna esévez university of nacional autónoma de méxico unam of the united states federal court has blocked president donald,es
49893,https://www.huffingtonpost.com/entry/fidel-castro-life-in-photos_us_5839a286e4b000af95ee4e34,THE WORLDPOST,Jesselyn Cook,2016-11-26,explore cuban leader fidel castros controversial life in photos cubas longtime ruler leaves a divisive legacy,es
51313,https://www.huffingtonpost.com/entry/marco-rubio-florida-senate_us_5820dc23e4b0e80b02cbcee9,POLITICS,Zach Carter,2016-11-09,marco rubio survives in florida senate race hes not done just yet,es
52653,https://www.huffingtonpost.com/entry/youll-go-gaga-over-james-cordens-next-carpool-karaoke-guest_us_580efd18e4b02444efa509c0,ENTERTAINMENT,Rebecca Shapiro,2016-10-25,youll go gaga for james cordens latest carpool karaoke promo applause,es


Now the spanish text looks better so we can add it back into the original dataframe. I also notice there is some emojis here so we will take care of that after the StopWords

In [24]:
# Add translated text back to original df
data = pd.concat([data, spanish]).drop_duplicates()
data[data['language'] == 'es']

Unnamed: 0,link,category,authors,date,description,language
11152,https://www.huffingtonpost.com/entry/venezuela-prison-fire_us_5accf59ae4b0e3074f651cb9,LATINO VOICES,"The Conversation US, Editorial Partner",2018-04-10,behind the scenes of venezuela’s deadly prison fire by rebecca hanson university of florida and leonard gómez núñez universidad nacional experimental de seguridad unes venezuela,es
41853,https://www.huffingtonpost.com/entry/as-us-closes-borders-thousands-of-haitian-refugees_us_58b58a79e4b0e5fdf61976e2,THE WORLDPOST,"The Conversation Global, ContributorThe Conversation is a collaboration between editors and academ...",2017-02-28,as us closes borders thousands of haitian refugees trapped in mexico lose hope ariadna estévez universidad nacional autónoma de méxico unam a united states federal court has blocked president donald,es
49893,https://www.huffingtonpost.com/entry/fidel-castro-life-in-photos_us_5839a286e4b000af95ee4e34,THE WORLDPOST,Jesselyn Cook,2016-11-26,explore cuban leader fidel castros controversial life in photos cubas longtime ruler leaves a divisive legacy,es
51313,https://www.huffingtonpost.com/entry/marco-rubio-florida-senate_us_5820dc23e4b0e80b02cbcee9,POLITICS,Zach Carter,2016-11-09,marco rubio survives in florida senate race hes not done just yet,es
52653,https://www.huffingtonpost.com/entry/youll-go-gaga-over-james-cordens-next-carpool-karaoke-guest_us_580efd18e4b02444efa509c0,ENTERTAINMENT,Rebecca Shapiro,2016-10-25,youll go gaga for james cordens latest carpool karaoke promo applause,es
54255,https://www.huffingtonpost.com/entry/2016-nobel-peace-prize_us_57f7649ce4b068ecb5dd997d,THE WORLDPOST,,2016-10-07,2016 nobel peace prize awarded to colombian president juan manuel santos he is the 2nd colombianborn nobel laureate after writer gabriel garcía márquez,es
64520,https://www.huffingtonpost.com/entry/logo-queer-trailblazer-honors_us_5758632ce4b0e39a28ac3fed,QUEER VOICES,James Michael Nichols,2016-06-08,logo to honor queer trailblazers happy pride,es
66972,https://www.huffingtonpost.com/entry/gluten-free-bread_us_5730da14e4b096e9f09230a4,TASTE,Michelle Persad,2016-05-10,8 glutenfree bread recipes you have to try no gluten no problem,es
75502,https://www.huffingtonpost.com/entry/barack-obama-serenades-hillarys-america-in-madame-president_us_56afb37be4b0b8d7c2301d3f,COMEDY,"Nadya Agrawal, Guest Writer",2016-02-01,barack obama serenades hillary clinton in parody endorsement video heres to you madame president,es
75552,https://www.huffingtonpost.com/entry/leonardo-dicaprio-vape-pen-sag-awards_us_56ae192ee4b00b033aaf6cf0,ENTERTAINMENT,Cole Delbyck,2016-01-31,meet your new favorite celebrity couple leonardo dicaprio and his vape pen leonardo divaprio,es


Now we'll take care of the StopWords

In [25]:
# Create variable for english stop words
stopword = stopwords.words('english')

In [26]:
# Remove stopwords
data['description'] = data['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopword)]))
data.head()

Unnamed: 0,link,category,authors,date,description,language
0,https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,4 million americans roll sleeves omicrontargeted covid boosters health experts said early predict whether demand would match 171 million doses new boosters us ordered fall,en
1,https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe,U.S. NEWS,Mary Papenfuss,2022-09-23,american airlines flyer charged banned life punching flight attendant video subdued passengers crew fled back aircraft confrontation according us attorneys office los angeles,en
2,https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02,COMEDY,Elyse Wanshel,2022-09-23,23 funniest tweets cats dogs week sept 1723 dog dont understand could eaten,en
3,https://www.huffpost.com/entry/funniest-parenting-tweets_l_632d7d15e4b0d12b5403e479,PARENTING,Caroline Bologna,2022-09-23,funniest tweets parents week sept 1723 accidentally put grownup toothpaste toddler’s toothbrush screamed like cleaning teeth carolina reaper dipped tabasco sauce,en
4,https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e,U.S. NEWS,Nina Golgowski,2022-09-22,woman called cops black birdwatcher loses lawsuit exemployer amy cooper accused investment firm franklin templeton unfairly firing branding racist video central park encounter went viral,en


# Handle Emojis

In [27]:
pip install emoji



In [28]:
import emoji

# Remove emojis from description
data['description'] = data['description'].apply(lambda x: emoji.replace_emoji(x, replace=''))
data.iloc[36074]

Unnamed: 0,36074
link,https://www.huffingtonpost.com/entry/mike-pence-cinco-de-mayo-latino-trump_us_590c90a7e4b0104c734e6e8e
category,LATINO VOICES
authors,Carolina Moreno
date,2017-05-05
description,mike pence uses cinco de mayo party claim latinos priority trump
language,en


#Tokenization

Tokenization will break down the text into managable words for processing and standardize the words

In [29]:
# Tokenize the description column
data['description'] = data['description'].parallel_apply(lambda x: word_tokenize(x))
data.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=189805), Label(value='0 / 189805')…

Unnamed: 0,link,category,authors,date,description,language
0,https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,"[4, million, americans, roll, sleeves, omicrontargeted, covid, boosters, health, experts, said, early, predict, whether, demand, would, match, 171, million, doses, new, boosters, us, ordered, fall]",en
1,https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe,U.S. NEWS,Mary Papenfuss,2022-09-23,"[american, airlines, flyer, charged, banned, life, punching, flight, attendant, video, subdued, passengers, crew, fled, back, aircraft, confrontation, according, us, attorneys, office, los, angeles]",en
2,https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02,COMEDY,Elyse Wanshel,2022-09-23,"[23, funniest, tweets, cats, dogs, week, sept, 1723, dog, dont, understand, could, eaten]",en
3,https://www.huffpost.com/entry/funniest-parenting-tweets_l_632d7d15e4b0d12b5403e479,PARENTING,Caroline Bologna,2022-09-23,"[funniest, tweets, parents, week, sept, 1723, accidentally, put, grownup, toothpaste, toddler, ’, s, toothbrush, screamed, like, cleaning, teeth, carolina, reaper, dipped, tabasco, sauce]",en
4,https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e,U.S. NEWS,Nina Golgowski,2022-09-22,"[woman, called, cops, black, birdwatcher, loses, lawsuit, exemployer, amy, cooper, accused, investment, firm, franklin, templeton, unfairly, firing, branding, racist, video, central, park, encounter, went, viral]",en


# Stemming

Stemming will reduce the words to their root words (removing suffixes). This helps simplify the vocabulary.

In [30]:
from nltk.stem import PorterStemmer

# Instantiate stemmer
pst = PorterStemmer()

In [31]:
# Use stemmer on description column
data['description'] = data['description'].parallel_apply(lambda x: [pst.stem(word) for word in x])
data.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=189805), Label(value='0 / 189805')…

Unnamed: 0,link,category,authors,date,description,language
0,https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,"[4, million, american, roll, sleev, omicrontarget, covid, booster, health, expert, said, earli, predict, whether, demand, would, match, 171, million, dose, new, booster, us, order, fall]",en
1,https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe,U.S. NEWS,Mary Papenfuss,2022-09-23,"[american, airlin, flyer, charg, ban, life, punch, flight, attend, video, subdu, passeng, crew, fled, back, aircraft, confront, accord, us, attorney, offic, lo, angel]",en
2,https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02,COMEDY,Elyse Wanshel,2022-09-23,"[23, funniest, tweet, cat, dog, week, sept, 1723, dog, dont, understand, could, eaten]",en
3,https://www.huffpost.com/entry/funniest-parenting-tweets_l_632d7d15e4b0d12b5403e479,PARENTING,Caroline Bologna,2022-09-23,"[funniest, tweet, parent, week, sept, 1723, accident, put, grownup, toothpast, toddler, ’, s, toothbrush, scream, like, clean, teeth, carolina, reaper, dip, tabasco, sauc]",en
4,https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e,U.S. NEWS,Nina Golgowski,2022-09-22,"[woman, call, cop, black, birdwatch, lose, lawsuit, exemploy, ami, cooper, accus, invest, firm, franklin, templeton, unfairli, fire, brand, racist, video, central, park, encount, went, viral]",en


# Lemmatization

This will reduce words to their base form to enhance consistency. Lemmatization ensures words are transformed to their canonical form

In [32]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
# Instantiate Lemmatizer
lemm = WordNetLemmatizer()

In [34]:
# Lemmatize description column
data['description'] = data['description'].parallel_apply(lambda x: [lemm.lemmatize(word) for word in x])
data.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=189805), Label(value='0 / 189805')…

Unnamed: 0,link,category,authors,date,description,language
0,https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9,U.S. NEWS,"Carla K. Johnson, AP",2022-09-23,"[4, million, american, roll, sleev, omicrontarget, covid, booster, health, expert, said, earli, predict, whether, demand, would, match, 171, million, dose, new, booster, u, order, fall]",en
1,https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe,U.S. NEWS,Mary Papenfuss,2022-09-23,"[american, airlin, flyer, charg, ban, life, punch, flight, attend, video, subdu, passeng, crew, fled, back, aircraft, confront, accord, u, attorney, offic, lo, angel]",en
2,https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02,COMEDY,Elyse Wanshel,2022-09-23,"[23, funniest, tweet, cat, dog, week, sept, 1723, dog, dont, understand, could, eaten]",en
3,https://www.huffpost.com/entry/funniest-parenting-tweets_l_632d7d15e4b0d12b5403e479,PARENTING,Caroline Bologna,2022-09-23,"[funniest, tweet, parent, week, sept, 1723, accident, put, grownup, toothpast, toddler, ’, s, toothbrush, scream, like, clean, teeth, carolina, reaper, dip, tabasco, sauc]",en
4,https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e,U.S. NEWS,Nina Golgowski,2022-09-22,"[woman, call, cop, black, birdwatch, lose, lawsuit, exemploy, ami, cooper, accus, invest, firm, franklin, templeton, unfairli, fire, brand, racist, video, central, park, encount, went, viral]",en


# Export data

In [35]:
data.to_csv('news_data_processed.csv', index=False)