In [1]:
import pandas as pd
import emoji
import regex as re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

## Functions

In [2]:
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched on its leading letter only: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        treebank_tag: POS tag string (e.g. the tag half of an
            ``nltk.pos_tag`` result).

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag does
        not start with one of the four recognised letters.
    """
    # (tag prefix, name of the attribute on `wordnet`), checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

In [3]:
def _row_to_text(row, start_col, end_col, is_reddit):
    """Join the non-null cells at positions start_col..end_col (inclusive) into one lower-cased string."""
    # .iloc makes the positional intent explicit instead of relying on
    # integer slicing through Series.__getitem__.
    cells = [str(value) for value in row.iloc[start_col:end_col + 1].dropna()]

    if is_reddit:
        # Drop placeholder cells *before* joining. Checking the joined string
        # only worked when the placeholder was the sole non-null cell, so
        # "some title" + "[removed]" used to leak "removed" into the output.
        cells = [cell for cell in cells if cell.lower() not in ('nan', '[removed]')]

    text = ' '.join(cells).lower()

    if is_reddit:
        # Turns the HTML entity "&amp;" back into a bare "&".
        text = text.replace('amp;', '')

    return text


def _lemmatize_tokens(tokens, lemmatizer):
    """Lemmatize tokens, using POS tags computed once over the whole token sequence."""
    lemmas = []
    # Tag the full sequence in one call: tagging words one at a time throws
    # away the neighbouring-word context and costs one tagger call per token.
    for word, tag in nltk.pos_tag(tokens):
        pos = get_wordnet_pos(tag)
        if pos is not None:
            lemmas.append(lemmatizer.lemmatize(word, pos))
        else:
            # No WordNet equivalent for this tag: use the lemmatizer's default POS.
            lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


def clean_text(filename, start_col, end_col, clean_usa=False, deemojize=False, clean_punctuation=False, remove_stopwords=False, lemmatize=False, stemming=False):
    """Read an Excel file and add a ``cleaned_text`` column built from several text columns.

    For every row, the non-null cells in column positions ``start_col`` to
    ``end_col`` (inclusive) are joined with spaces, lower-cased and tokenized
    with ``word_tokenize``; the optional steps below are applied in the order
    listed, and the tokens are joined back with single spaces.

    Args:
        filename: Path passed to ``pd.read_excel``. When it contains
            ``'reddit'``, cells equal to ``'nan'`` / ``'[removed]'`` are
            skipped and ``'amp;'`` is stripped from the text.
        start_col: Position of the first text column, counted after columns
            whose name starts with ``Unnamed`` have been dropped.
        end_col: Position of the last text column (inclusive).
        clean_usa: Replace ``u.s.a.`` / ``u.s.`` with ``united states``.
        deemojize: Run ``emoji.demojize`` on the text.
        clean_punctuation: Remove every character that is neither a word
            character nor whitespace.
        remove_stopwords: Drop NLTK English stop words.
        lemmatize: Lemmatize tokens with ``WordNetLemmatizer`` guided by POS tags.
        stemming: Stem tokens with ``PorterStemmer``.

    Returns:
        The loaded DataFrame (without ``Unnamed`` columns) with an added
        ``cleaned_text`` column.
    """
    df = pd.read_excel(filename)
    # .copy() so that adding a column below writes to an independent frame.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()

    # A set gives O(1) membership tests; the distinct name avoids shadowing
    # the `stopwords` module imported at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    is_reddit = 'reddit' in str(filename)
    cleaned_text_list = []

    # Count rows positionally so progress output does not depend on the index labels.
    for row_number, (_, row) in enumerate(df.iterrows()):
        text = _row_to_text(row, start_col, end_col, is_reddit)

        if clean_usa:
            # Longest pattern first: replacing 'u.s.' first would turn
            # 'u.s.a.' into 'united statesa.' and the second replace would
            # never match.
            text = text.replace('u.s.a.', 'united states').replace('u.s.', 'united states')

        if deemojize:
            text = emoji.demojize(text)

        if clean_punctuation:
            text = re.sub(r'[^\w\s]', '', text)

        tokens = word_tokenize(text)

        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

        if lemmatize:
            tokens = _lemmatize_tokens(tokens, lemmatizer)

        if stemming:
            tokens = [ps.stem(word) for word in tokens]

        cleaned_text_list.append(" ".join(tokens))

        if row_number % 1000 == 0:
            print(f'---- {row_number} DONE ----')

    df['cleaned_text'] = cleaned_text_list

    return df

## New York Times

In [4]:
# Clean the New York Times export: text is taken from column positions 2-4
# (inclusive), with every cleaning step enabled except stemming.
nyt_df = clean_text(
    '../data/raw/nyt_2016_2022_final.xlsx',
    start_col=2,
    end_col=4,
    clean_usa=True,
    deemojize=True,
    clean_punctuation=True,
    remove_stopwords=True,
    lemmatize=True,
    stemming=False,
)

---- 0 DONE ----
---- 1000 DONE ----
---- 2000 DONE ----
---- 3000 DONE ----
---- 4000 DONE ----
---- 5000 DONE ----
---- 6000 DONE ----
---- 7000 DONE ----
---- 8000 DONE ----
---- 9000 DONE ----
---- 10000 DONE ----
---- 11000 DONE ----
---- 12000 DONE ----
---- 13000 DONE ----
---- 14000 DONE ----
---- 15000 DONE ----
---- 16000 DONE ----
---- 17000 DONE ----
---- 18000 DONE ----


In [5]:
# Preview the first five rows, including the new cleaned_text column.
nyt_df.head(n=5)

Unnamed: 0,date,keyword,headline,abstract,lead_paragraph,section,hits,word_count,cleaned_text
0,2016-01-01,"['Microsoft', 'Tesla']",Looking Beyond the Internet of Things,"Adam Bosworth, a tech pioneer, sees the future...",SAN FRANCISCO — If you have sent email on Goog...,Technology,1259,1180,look beyond internet thing adam bosworth tech ...
1,2016-01-01,"[""McDonald's""]",No More Statutes of Limitations for Rape,Bill Cosby came close to escaping sexual assau...,"THIS week, Bill Cosby was charged with three c...",Opinion,1514,914,statute limitation rape bill cosby come close ...
2,2016-01-01,['Visa'],U.S. Doesn’t Know How Many Foreign Visitors Ov...,After two decades of failed attempts to track ...,WASHINGTON — The question from the congressman...,U.S.,2394,1189,united state doesnt know many foreign visitor ...
3,2016-01-02,['Fed'],Making And Using Models,It’s about self-discipline.,"Larry Summers, Brad DeLong, and yours truly ar...",Opinion,3998,576,make use model selfdiscipline larry summer bra...
4,2016-01-02,['Amazon'],Cutting the Cord and Feeling Good About It,Canceling cable has meant becoming more intent...,"Nearly three years ago, when I first thought a...",Opinion,6215,440,cut cord feel good cancel cable meant become i...


In [6]:
# Persist the cleaned New York Times frame as an Excel workbook.
nyt_df.to_excel(excel_writer='../data/cleaned/nyt_2016_2022_cleaned_1710.xlsx')

## Reddit

In [7]:
# Clean the Reddit export: text is taken from column positions 5-6
# (inclusive), with every cleaning step enabled except stemming.
reddit_df = clean_text(
    '../data/raw/reddit_2016_2022_final.xlsx',
    start_col=5,
    end_col=6,
    clean_usa=True,
    deemojize=True,
    clean_punctuation=True,
    remove_stopwords=True,
    lemmatize=True,
    stemming=False,
)

---- 0 DONE ----
---- 1000 DONE ----
---- 2000 DONE ----
---- 3000 DONE ----
---- 4000 DONE ----
---- 5000 DONE ----
---- 6000 DONE ----
---- 7000 DONE ----
---- 8000 DONE ----
---- 9000 DONE ----
---- 10000 DONE ----
---- 11000 DONE ----
---- 12000 DONE ----
---- 13000 DONE ----
---- 14000 DONE ----
---- 15000 DONE ----
---- 16000 DONE ----
---- 17000 DONE ----
---- 18000 DONE ----
---- 19000 DONE ----
---- 20000 DONE ----
---- 21000 DONE ----
---- 22000 DONE ----
---- 23000 DONE ----
---- 24000 DONE ----
---- 25000 DONE ----
---- 26000 DONE ----
---- 27000 DONE ----
---- 28000 DONE ----
---- 29000 DONE ----
---- 30000 DONE ----
---- 31000 DONE ----
---- 32000 DONE ----
---- 33000 DONE ----
---- 34000 DONE ----
---- 35000 DONE ----
---- 36000 DONE ----
---- 37000 DONE ----
---- 38000 DONE ----
---- 39000 DONE ----
---- 40000 DONE ----
---- 41000 DONE ----
---- 42000 DONE ----
---- 43000 DONE ----
---- 44000 DONE ----
---- 45000 DONE ----
---- 46000 DONE ----
---- 47000 DONE ----
---- 

In [None]:
# Persist the cleaned Reddit frame as an Excel workbook.
reddit_df.to_excel(excel_writer='../data/cleaned/reddit_2016_2022_cleaned_1710.xlsx')