# Preprocessing Stage

This notebook preprocesses the text of the scraped CNN and CNBC articles. It builds three tables — LIB, CORPUS, and VOCAB — and writes them to CSV files at the end.

In [1]:
import os
import re
import warnings

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# NOTE(review): this suppresses every warning category for the rest of the
# session, so any deprecation or chained-assignment warnings raised by later
# cells will not be shown.
warnings.filterwarnings("ignore")

In [2]:
# Folder layout, relative to the notebook's working directory.
directory = "./"
data_files = directory + "files/"  # input CSVs are read from here
data_out = directory + "data/"     # the LIB / CORPUS / VOCAB CSVs are written here

### Setting Up The CNN Library

In [3]:
# Load the two CNN input tables from the input folder.
cnn, cnn_out = (
    pd.read_csv(data_files + filename)
    for filename in ("cnn.csv", "cnn_out.csv")
)

In [4]:
# Keep only the join key and the column that later becomes the article title.
titles = cnn.loc[:, ['id', 'url_dir_pr']]

In [5]:
# Attach each scraped row's `url_dir_pr` value via a left join on `id`, and
# discard the "Unnamed: 0" column.
# NOTE(review): if `titles` contains repeated ids this join multiplies rows —
# worth confirming, since duplicate rows do show up in LIB later on.
cnn_structure = cnn_out.merge(titles, how="left", on="id").drop(columns="Unnamed: 0")

# Take an explicit copy: later cells add and rename columns on cnn_lib, which
# would otherwise be assignments on a slice of cnn_structure.
cnn_lib = cnn_structure[['url', 'url_dir_pr', 'scraped']].copy()

In [6]:
# Parallel lists: the i-th label in `curr` is renamed to the i-th label in `new_names`.
curr, new_names = ['url_dir_pr', 'scraped'], ['title', 'text']

In [7]:
# Rename by exact label. (`Index.str.replace` substitutes substrings inside
# every column label, so it could also alter unrelated columns.)
# rename() returns a new frame, so no slice is modified in place.
cnn_lib = cnn_lib.rename(columns=dict(zip(curr, new_names)))

In [8]:
# Tag every row with the outlet it came from.
cnn_lib = cnn_lib.assign(source="CNN")

In [9]:
# The date is read from the URL path: pieces 3-5 of the "/"-split URL
# (e.g. https://<host>/2020/11/22/... -> "2020/11/22"). The .str accessors
# leave a missing URL as NaN instead of raising.
cnn_date = cnn_lib['url'].str.split("/").str[3:6].str.join("/")

# Keep the value only when it is a complete YYYY/MM/DD string; URLs that have
# something else in that position get NaN.
cnn_is_date = cnn_date.str.match(r'^\d{4}/\d{2}/\d{2}$', na=False)
cnn_lib = cnn_lib.assign(date=cnn_date.where(cnn_is_date))

In [10]:
# Put the columns into the layout used for LIB (the same list is reused for
# the CNBC table).
good_order = ['source', 'date', 'url', 'title', 'text']
cnn_lib = cnn_lib.loc[:, good_order]

### Setting Up The CNBC Library

In [14]:
# Load the two CNBC input tables from the input folder.
cnbc, cnbc_out = (
    pd.read_csv(data_files + filename)
    for filename in ("cnbc.csv", "cnbc_out.csv")
)

In [15]:
# Join key plus the column that later becomes the article title.
titles_cnbc = cnbc[['id', 'url_dir_page_pr']]

# Left join on `id`, then discard the "Unnamed: 0" column.
# NOTE(review): repeated ids in `titles_cnbc` would multiply rows here —
# worth confirming, since duplicate rows do show up in LIB later on.
cnbc_structure = cnbc_out.merge(titles_cnbc, how="left", on="id").drop(columns="Unnamed: 0")

# Take an explicit copy: later cells add and rename columns on cnbc_lib, which
# would otherwise be assignments on a slice of cnbc_structure.
cnbc_lib = cnbc_structure[['url', 'url_dir_page_pr', 'scraped']].copy()

In [16]:
# Parallel lists: the i-th label in `curr` is renamed to the i-th label in `new_names`.
curr, new_names = ['url_dir_page_pr', 'scraped'], ['title', 'text']

In [17]:
# Rename by exact label. (`Index.str.replace` substitutes substrings inside
# every column label, so it could also alter unrelated columns.)
# rename() returns a new frame, so no slice is modified in place.
cnbc_lib = cnbc_lib.rename(columns=dict(zip(curr, new_names)))

In [18]:
# Tag every row with the outlet it came from.
cnbc_lib = cnbc_lib.assign(source="CNBC")

In [19]:
# Quick look at the first rows after the rename and the source tag.
cnbc_lib.head()

Unnamed: 0,url,title,text,source
0,https://www.cnbc.com/2020/09/23/coronavirus-is...,coronavirus safe trick treat pandemic cdc,+++Related Stories+++Feeling spooked about wha...,CNBC
1,https://www.cnbc.com/2020/11/22/chris-christie...,chris christie tell trump time end legal fight,WASHINGTON — President Donald Trumps confidant...,CNBC
2,https://www.cnbc.com/2020/11/07/america-reacts...,america react joe biden win presidential,Americans poured into the streets Saturday aft...,CNBC
3,https://www.cnbc.com/2020/09/17/social-securit...,social security benefit money cola,The Social Security Administration wont announ...,CNBC
4,https://www.cnbc.com/2020/12/04/cdcs-new-guida...,cdcs new guidance short covid quarantines catch,The Centers for Disease Control and Prevention...,CNBC


In [20]:
# The date is read from the URL path: pieces 3-5 of the "/"-split URL
# (e.g. https://www.cnbc.com/2020/09/23/... -> "2020/09/23"). The .str
# accessors leave a missing URL as NaN instead of raising.
cnbc_date = cnbc_lib['url'].str.split("/").str[3:6].str.join("/")

# As in the CNN section, a value that is not a date is replaced by NaN; here
# the check requires a complete YYYY/MM/DD string.
cnbc_is_date = cnbc_date.str.match(r'^\d{4}/\d{2}/\d{2}$', na=False)
cnbc_lib = cnbc_lib.assign(date=cnbc_date.where(cnbc_is_date))

In [21]:
# Reorder to the same column layout that was applied to the CNN table.
cnbc_lib = cnbc_lib.loc[:, good_order]

### Create LIB

In [29]:
# Stack the CNN rows on top of the CNBC rows and renumber the index 0..n-1.
LIB = pd.concat([cnn_lib, cnbc_lib], axis=0, ignore_index=True)

In [33]:
# Inspect the last rows of the combined library.
LIB.tail(10)

Unnamed: 0,source,date,url,title,text
980,CNBC,2020/12/25,https://www.cnbc.com/2020/12/25/the-plant-base...,plant base meat industry rise challenge,SINGAPORE — Demand for meat alternatives has g...
981,CNBC,2020/12/22,https://www.cnbc.com/2020/12/22/cdc-says-new-c...,cdc says new covid strain uk circulating unde...,The new coronavirus strain that was first dete...
982,CNBC,2020/11/06,https://www.cnbc.com/2020/11/06/dr-scott-gottl...,dr scott gottlieb daily covid case half,Dr. Scott Gottlieb on Friday offered a dismal ...
983,CNBC,2020/11/23,https://www.cnbc.com/2020/11/23/ghislaine-maxw...,ghislaine maxwell quarantine possible coronav...,"Ghislaine Maxwell, the British socialite charg..."
984,CNBC,2020/11/19,https://www.cnbc.com/2020/11/19/nfl-visa-to-ho...,nfl visa host cashless super bowl lv tampa,In this article+++For the first time in 55 yea...
985,CNBC,2020/02/11,https://www.cnbc.com/2020/02/11/3-perfect-resu...,perfect resume example base experience level ...,+++Related Stories+++There is no one-size-fits...
986,CNBC,2020/12/01,https://www.cnbc.com/2020/12/01/nasdaq-proposa...,nasdaq proposal require company great diversity,In this article+++Nasdaq is pushing for greate...
987,CNBC,2020/09/15,https://www.cnbc.com/2020/09/15/social-securit...,social security benefit base,About 88% of Americans over the age of 65 will...
988,CNBC,2020/09/15,https://www.cnbc.com/2020/09/15/stock-market-l...,stock market live update dow future big tech ...,This is CNBCs markets live blog that will be u...
989,CNBC,2020/11/24,https://www.cnbc.com/2020/11/24/bidens-foreign...,biden foreign policy team lay national securi...,"WASHINGTON — ""Diplomacy is back,"" and America ..."


In [34]:
# Row count before and after removing exact duplicate rows.
n_rows = len(LIB)
n_unique_rows = len(LIB.drop_duplicates())
print(n_rows, n_unique_rows)

990 760


Only 760 of the 990 rows are unique, so the dataset contains 230 exact duplicate rows. We keep them for the purposes of our analysis.

### Separating Paragraphs

In [35]:
# Paragraphs are delimited by a literal "+++" in the scraped text. The pattern
# is a raw string so the backslashes reach the regex engine as written (in a
# plain string "\+" is an invalid escape sequence).
para_pieces = LIB['text'].str.split(r"\+\+\+", expand=True)

# stack() turns the wide split result into one row per piece, indexed by
# (row of LIB, position of the piece within that text).
PARAS = para_pieces.stack().to_frame('para_str').sort_index()
PARAS.index.names = ['text_num', 'para_num']

# Flatten line breaks and trim, then drop the pieces that are empty.
PARAS['para_str'] = (
    PARAS['para_str']
    .str.replace(r'\n', ' ', regex=True)
    .str.strip()
)
PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

In [36]:
# Show the paragraph table (one row per text_num / para_num pair).
PARAS

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
text_num,para_num,Unnamed: 2_level_1
0,0,Two more infants have contracted the herpes vi...
0,1,"In the ritual, known as metzitzah b’peh, after..."
0,2,Most adults are infected with the herpes simpl...
0,3,“While HSV-1 in adults can cause the common co...
0,4,"Since 2000, there have been 13 reports in New ..."
...,...,...
989,13,"Mayorkas: ""The Department of Homeland Security..."
989,14,"Thomas-Greenfield: ""My fellow career diplomats..."
989,15,"Haines: ""Mr. President-elect, you know that Iv..."
989,16,"Sullivan: Mr. President-elect, ""You have also ..."


### Breaking down into sentences

In [37]:
def _paragraph_to_sentences(paragraph):
    """Return the sentences of one paragraph as a Series (position -> sentence)."""
    return pd.Series(nltk.sent_tokenize(paragraph))


# apply() yields one column per sentence position; stack() then reshapes that
# into one row per sentence, adding the position as a third index level.
SENTS = (
    PARAS['para_str']
    .apply(_paragraph_to_sentences)
    .stack()
    .to_frame('sent_str')
)
SENTS.index.names = ['text_num', 'para_num', 'sent_num']

In [38]:
# Show the sentence table before normalisation.
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
text_num,para_num,sent_num,Unnamed: 3_level_1
0,0,0,Two more infants have contracted the herpes vi...
0,1,0,"In the ritual, known as metzitzah b’peh, after..."
0,1,1,Antibacterial ointment is applied and the woun...
0,1,2,The health department says the procedure is da...
0,2,0,Most adults are infected with the herpes simpl...
...,...,...,...
989,16,0,"Sullivan: Mr. President-elect, ""You have also ..."
989,16,1,"You have told us the alliances we rebuild, the..."
989,17,0,"Kerry: ""The road ahead is exciting."
989,17,1,It means creating millions of middle-class job...


In [39]:
# Replace every non-word character with a space and lower-case the text.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, in which case r'\W' would never match and nothing is removed.
SENTS['sent_str'] = SENTS['sent_str'].str.replace(r'\W', ' ', regex=True).str.lower()

In [40]:
# Show the sentence table again, now lower-cased with non-word characters blanked.
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
text_num,para_num,sent_num,Unnamed: 3_level_1
0,0,0,two more infants have contracted the herpes vi...
0,1,0,in the ritual known as metzitzah b peh after...
0,1,1,antibacterial ointment is applied and the woun...
0,1,2,the health department says the procedure is da...
0,2,0,most adults are infected with the herpes simpl...
...,...,...,...
989,16,0,sullivan mr president elect you have also ...
989,16,1,you have told us the alliances we rebuild the...
989,17,0,kerry the road ahead is exciting
989,17,1,it means creating millions of middle class job...


### Getting Tokens

In [41]:
# Selects the tokenizer used by the tokenizing cell: True -> nltk.word_tokenize,
# False -> nltk.WhitespaceTokenizer.
# NOTE(review): the name reads as if True should pick the whitespace tokenizer —
# confirm which behaviour is intended before changing either cell.
keep_whitespace = True

In [42]:
# Breaking into tokens
# The flag only decides which tokenizer is used; the rest of the pipeline is
# shared. Note the mapping: keep_whitespace=True selects nltk.word_tokenize,
# keep_whitespace=False selects the whitespace tokenizer.
if keep_whitespace:
    _tokenize = nltk.word_tokenize
else:
    _tokenize = nltk.WhitespaceTokenizer().tokenize


def _tag_sentence(sentence):
    """Tokenize one sentence and return its (token, POS tag) pairs as a Series."""
    return pd.Series(nltk.pos_tag(_tokenize(sentence)))


# One column per token position, stacked into one row per token.
TOKENS = (
    SENTS['sent_str']
    .apply(_tag_sentence)
    .stack()
    .to_frame('pos_tuple')
)
TOKENS.index.names = ["text_num","para_num","sent_num","token_num"]

In [43]:
# Show the token table: one (token, POS tag) tuple per row.
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple
text_num,para_num,sent_num,token_num,Unnamed: 4_level_1
0,0,0,0,"(two, CD)"
0,0,0,1,"(more, JJR)"
0,0,0,2,"(infants, NNS)"
0,0,0,3,"(have, VBP)"
0,0,0,4,"(contracted, VBN)"
...,...,...,...,...
989,17,2,19,"(healing, NN)"
989,17,2,20,"(planet, NN)"
989,17,2,21,"(to, TO)"
989,17,2,22,"(future, JJ)"


### Create CORPUS

In [44]:
# Work on a copy so that adding columns here does not also modify TOKENS
# through a shared reference.
CORPUS = TOKENS.copy()

# Split each (token, tag) tuple into its own columns.
CORPUS['pos'] = [pair[1] for pair in CORPUS['pos_tuple']]
CORPUS['token_str'] = [pair[0] for pair in CORPUS['pos_tuple']]

# term_str is the lower-cased token; it is the key used for the vocabulary.
CORPUS['term_str'] = CORPUS['token_str'].str.lower()

In [45]:
# Show the corpus table with the tuple split into pos / token_str / term_str.
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str
text_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,0,"(two, CD)",CD,two,two
0,0,0,1,"(more, JJR)",JJR,more,more
0,0,0,2,"(infants, NNS)",NNS,infants,infants
0,0,0,3,"(have, VBP)",VBP,have,have
0,0,0,4,"(contracted, VBN)",VBN,contracted,contracted
...,...,...,...,...,...,...,...
989,17,2,19,"(healing, NN)",NN,healing,healing
989,17,2,20,"(planet, NN)",NN,planet,planet
989,17,2,21,"(to, TO)",TO,to,to
989,17,2,22,"(future, JJ)",JJ,future,future


### Extracting VOCAB

In [46]:
# --- Term frequencies ---------------------------------------------------------
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()   # share of all tokens that are this term
VOCAB['i'] = -np.log2(VOCAB.p)         # -log2(p)

# --- Part-of-speech summary ---------------------------------------------------
# term x tag count table, built once and reused for both columns below.
pos_counts = CORPUS[['term_str', 'pos']].value_counts().unstack(fill_value=0)
VOCAB['max_pos'] = pos_counts.idxmax(axis=1)     # tag with the highest count
VOCAB['n_pos'] = (pos_counts > 0).sum(axis=1)    # number of distinct tags seen

# --- Stop-word flag (1 if the term is in NLTK's English stop-word list) -------
sw = pd.DataFrame(
    {'dummy': 1},
    index=pd.Index(nltk.corpus.stopwords.words('english'), name='term_str'),
)
VOCAB['stop'] = VOCAB.index.isin(sw.index).astype('int')

# --- Stems from three different stemmers, applied to the term itself ----------
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = [stemmer1.stem(term) for term in VOCAB.index]

stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = [stemmer2.stem(term) for term in VOCAB.index]

stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = [stemmer3.stem(term) for term in VOCAB.index]

# Ten most frequent terms.
VOCAB.sort_values('p', ascending=False).head(10)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
the,45744,3,0.056716,4.140108,DT,1,1,the,the,the
to,22702,2,0.028147,5.150871,TO,1,1,to,to,to
of,19297,2,0.023925,5.385314,IN,1,1,of,of,of
and,18537,3,0.022983,5.443282,CC,1,1,and,and,and
a,18140,1,0.022491,5.474516,DT,1,1,a,a,a
in,15926,2,0.019746,5.662306,IN,1,1,in,in,in
that,9787,4,0.012134,6.364752,IN,4,1,that,that,that
for,8271,3,0.010255,6.607557,IN,1,1,for,for,for
on,7263,2,0.009005,6.795053,IN,1,1,on,on,on
is,6956,2,0.008624,6.85736,VBZ,1,1,is,is,is


In [47]:
# Show the vocabulary table, ordered by term.
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,201,1,0.000249,11.970351,CD,1,0,0,0,0
00,17,2,0.000021,15.533940,CD,1,0,00,00,00
000,898,3,0.001113,9.810831,CD,1,0,000,000,000
0000,1,4,0.000001,19.621403,CD,1,0,0000,0000,0000
007,6,3,0.000007,17.036440,CD,1,0,007,007,007
...,...,...,...,...,...,...,...,...,...,...
zung,4,4,0.000005,17.621403,NN,1,0,zung,zung,zung
zurich,2,6,0.000002,18.621403,CD,2,0,zurich,zurich,zurich
zwiebel,1,7,0.000001,19.621403,NNP,1,0,zwiebel,zwiebel,zwiebel
zwilling,2,8,0.000002,18.621403,VBG,1,0,zwill,zwill,zwil


### Export LIB, CORPUS, VOCAB

In [48]:
# Create the output folder if needed so the writes below do not fail just
# because it is missing.
os.makedirs(data_out, exist_ok=True)

# LIB has a plain 0..n-1 index, so it is left out of the file; CORPUS and VOCAB
# are written with their indexes (the position keys and the term).
LIB.to_csv(data_out + "LIB.csv", index=False)
CORPUS.to_csv(data_out + "CORPUS.csv")
VOCAB.to_csv(data_out + "VOCAB.csv")