# Preprocessing Stage

This notebook preprocesses the text of the scraped CNN and CNBC articles. By the end we obtain the LIB, CORPUS, and VOCAB tables, each of which is written to a CSV file.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
import re
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings such as
# SettingWithCopyWarning and FutureWarning — consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Locations used by the notebook, all relative to the working directory.
directory = "./"  # not referenced again in the cells shown here
data_files = "./files/"  # the input CSVs are read from this folder
data_out = "./data/"  # LIB.csv, CORPUS.csv and VOCAB.csv are written here

### Setting Up The CNN Library

In [3]:
# Load the two CNN input tables from the input folder.
cnn, cnn_out = (
    pd.read_csv(data_files + file_name)
    for file_name in ("cnn.csv", "cnn_out.csv")
)

In [4]:
# Keep only the article id and the column that later becomes `title`.
titles = cnn.loc[:, ['id', 'url_dir_pr']]

In [5]:
# Attach each article's `url_dir_pr` to the scraper output by `id`; the left
# join keeps every row of cnn_out. The "Unnamed: 0" column is dropped.
cnn_structure = pd.merge(cnn_out, titles, how="left", on="id").drop(columns="Unnamed: 0")
# Take an explicit copy so that the column assignments made in later cells act
# on an independent frame rather than on a slice of cnn_structure.
cnn_lib = cnn_structure[['url', 'url_dir_pr', 'scraped']].copy()

In [6]:
# Current column names and the names they are renamed to in the next cell;
# the two lists are paired by position.
curr = ['url_dir_pr', 'scraped']
new_names = ['title', 'text']

In [7]:
# Rename by exact column label. `rename` only touches columns whose full name
# matches, so it cannot rewrite a substring of another column name and does
# not depend on how the installed pandas interprets `str.replace` patterns.
cnn_lib = cnn_lib.rename(columns=dict(zip(curr, new_names)))

In [24]:
# Tag every row of this library with its source.
cnn_lib = cnn_lib.assign(source_id="CNN")

In [25]:
# The date is taken from URL path segments 3-5 (the three segments after the
# host), re-joined with "/". The vectorised .str methods propagate a missing
# URL as NaN instead of raising inside a Python-level lambda.
cnn_lib['date'] = cnn_lib['url'].str.split("/").str[3:6].str.join("/")
# Keep only values that start with a digit; anything else becomes NaN.
# na=False makes missing values count as "no match" so the mask stays boolean.
cnn_lib['date'] = cnn_lib['date'].where(cnn_lib['date'].str.match(r'^\d', na=False), np.nan)

In [26]:
# Column order shared by every source library (reused for CNBC below).
good_order = ['source_id', 'date', 'url', 'title', 'text']
cnn_lib = cnn_lib.loc[:, good_order]

### Setting Up The CNBC Library

In [27]:
# Load the two CNBC input tables from the input folder.
cnbc, cnbc_out = (
    pd.read_csv(data_files + file_name)
    for file_name in ("cnbc.csv", "cnbc_out.csv")
)

In [28]:
# Keep only the article id and the column that later becomes `title`.
titles_cnbc = cnbc[['id', 'url_dir_page_pr']]
# Attach that column to the scraper output by `id`; the left join keeps every
# row of cnbc_out. The "Unnamed: 0" column is dropped.
cnbc_structure = pd.merge(cnbc_out, titles_cnbc, how="left", on="id").drop(columns="Unnamed: 0")
# Take an explicit copy so that the column assignments made in later cells act
# on an independent frame rather than on a slice of cnbc_structure.
cnbc_lib = cnbc_structure[['url', 'url_dir_page_pr', 'scraped']].copy()

In [29]:
# Current CNBC column names and the names they are renamed to in the next
# cell; the two lists are paired by position.
curr = ['url_dir_page_pr', 'scraped']
new_names = ['title', 'text']

In [30]:
# Rename by exact column label. `rename` only touches columns whose full name
# matches, so it cannot rewrite a substring of another column name and does
# not depend on how the installed pandas interprets `str.replace` patterns.
cnbc_lib = cnbc_lib.rename(columns=dict(zip(curr, new_names)))

In [31]:
# Tag every row of this library with its source.
cnbc_lib = cnbc_lib.assign(source_id="CNBC")

In [32]:
# The date is taken from URL path segments 3-5 (the three segments after the
# host), re-joined with "/". The vectorised .str methods propagate a missing
# URL as NaN instead of raising inside a Python-level lambda.
cnbc_lib['date'] = cnbc_lib['url'].str.split("/").str[3:6].str.join("/")
# Same validity rule as the CNN library: values that do not start with a digit
# are not dates and become NaN. na=False keeps the mask strictly boolean.
cnbc_lib['date'] = cnbc_lib['date'].where(cnbc_lib['date'].str.match(r'^\d', na=False), np.nan)

In [33]:
# Apply the shared column order so both libraries line up when concatenated.
cnbc_lib = cnbc_lib.loc[:, good_order]

### Create LIB

In [34]:
# Stack the CNN rows on top of the CNBC rows and renumber the rows 0..n-1.
LIB = pd.concat([cnn_lib, cnbc_lib], axis=0, ignore_index=True)

In [35]:
# Spot-check ten random rows of LIB. No random_state is passed, so the rows
# shown differ from run to run.
LIB.sample(10)

Unnamed: 0,source_id,date,url,title,text
817,CNBC,2020/12/30,https://www.cnbc.com/2020/12/30/california-has...,california identify case new covid strain newsom,California health officials have identified th...
669,CNBC,2020/10/05,https://www.cnbc.com/2020/10/05/trump-press-se...,trump press secretary kayleigh mcenany test p...,White House press secretary Kayleigh McEnany a...
951,CNBC,2020/10/08,https://www.cnbc.com/2020/10/08/ibm-shares-sur...,ibm share surge plan spin unit separate publi...,IBM said Thursday it would spin off its IT inf...
772,CNBC,2020/10/20,https://www.cnbc.com/2020/10/20/cramer-doj-goo...,cramer doj google suit take alphabet buy strong,CNBCs Jim Cramer said Tuesday that Alphabet wo...
27,CNN,2020/09/12,https://www.cnn.com/2020/09/12/us/top-chef-aar...,chef aaron grissom dead trnd,Former “Top Chef” contestant Aaron Grissom has...
273,CNN,2020/11/13,https://www.cnn.com/2020/11/13/us/505-year-dru...,year drug war sentence seresi free invs,In a stunning reprieve for a man sentenced to ...
137,CNN,2020/11/27,https://www.cnn.com/2020/11/27/cnn-underscored...,cnn underscore black friday sale,Our editors scour the internet each and every ...
254,CNN,2020/10/04,https://www.cnn.com/2020/10/04/politics/nichol...,politic nicholas luna body man trump positive ...,One of the White House aides who works closest...
499,CNBC,2020/11/18,https://www.cnbc.com/2020/11/18/cuomo-offers-f...,cuomo offer free online skill training unempl...,New York Gov. Andrew Cuomo announced on Wednes...
263,CNN,2020/10/05,https://www.cnn.com/2020/10/05/us/breonna-tayl...,breonna taylor second grand juror,A Louisville activist says he’s been contacted...


In [36]:
### Extract LIB file
# Written without the index column, so a row's position in the file is the
# only record of the row number that later cells use as `text_num`.
LIB.to_csv(data_out+"LIB.csv", index=False)

In [97]:
# Compare the total row count with the count left after removing rows that
# are identical in every column.
print(len(LIB), len(LIB.drop_duplicates()))

990 990


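The two counts are equal, so `LIB` contains no rows that are identical in every column. Articles that repeat only partially (for example, the same text under a different URL) are not removed; we keep them for the purposes of our analysis.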

### Separating Paragraphs

In [37]:
# Split every article on the literal "+++" paragraph delimiter. The pattern is
# a raw string so the backslashes reach the regex engine unchanged, without
# relying on Python tolerating the invalid "\+" string escape.
PARAS = (
    LIB['text']
    .str.split(r"\+\+\+", expand=True)
    .stack()                      # one row per (article, paragraph position)
    .to_frame('para_str')
    .sort_index()
)
PARAS.index.names = ['text_num', 'para_num']
# Replace newlines with spaces and trim surrounding whitespace.
PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
PARAS['para_str'] = PARAS['para_str'].str.strip()
# Drop paragraphs that are empty or whitespace-only; para_num is not
# renumbered afterwards, so it can contain gaps.
PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

In [38]:
# Display the paragraph table: one row per non-empty paragraph, indexed by
# (text_num, para_num).
PARAS

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
text_num,para_num,Unnamed: 2_level_1
0,0,Two more infants have contracted the herpes vi...
0,1,"In the ritual, known as metzitzah b’peh, after..."
0,2,Most adults are infected with the herpes simpl...
0,3,“While HSV-1 in adults can cause the common co...
0,4,"Since 2000, there have been 13 reports in New ..."
...,...,...
989,13,"Mayorkas: ""The Department of Homeland Security..."
989,14,"Thomas-Greenfield: ""My fellow career diplomats..."
989,15,"Haines: ""Mr. President-elect, you know that Iv..."
989,16,"Sullivan: Mr. President-elect, ""You have also ..."


### Breaking down into sentences

In [39]:
# Sentence-tokenise each paragraph with NLTK; stacking the per-paragraph
# Series adds the sentence position as a third index level.
SENTS = (
    PARAS['para_str']
    .apply(lambda para: pd.Series(nltk.sent_tokenize(para)))
    .stack()
    .to_frame('sent_str')
)
SENTS.index = SENTS.index.set_names(['text_num', 'para_num', 'sent_num'])

In [40]:
# Getting rid of characters and standardizing the text: every non-word
# character becomes a space, then the sentence is lower-cased.
# regex=True is required: without it, pandas versions whose default is a
# literal replacement search for the two characters "\W" and change nothing.
SENTS['sent_str'] = SENTS['sent_str'].str.replace(r'\W', ' ', regex=True).str.lower()

In [41]:
# Display the sentence table: one row per sentence, indexed by
# (text_num, para_num, sent_num).
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
text_num,para_num,sent_num,Unnamed: 3_level_1
0,0,0,two more infants have contracted the herpes vi...
0,1,0,in the ritual known as metzitzah b peh after...
0,1,1,antibacterial ointment is applied and the woun...
0,1,2,the health department says the procedure is da...
0,2,0,most adults are infected with the herpes simpl...
...,...,...,...
989,16,0,sullivan mr president elect you have also ...
989,16,1,you have told us the alliances we rebuild the...
989,17,0,kerry the road ahead is exciting
989,17,1,it means creating millions of middle class job...


### Getting Tokens

In [42]:
# Tokenizer switch read by the next cell: True selects nltk.word_tokenize,
# False selects nltk.WhitespaceTokenizer.
keep_whitespace = True

In [43]:
# Breaking into tokens
# Choose the tokenizer once, then run a single tokenise -> POS-tag pipeline.
if keep_whitespace:
    _tokenize = nltk.word_tokenize
else:
    _tokenize = nltk.WhitespaceTokenizer().tokenize

# Each sentence becomes a Series of (token, POS tag) tuples; stacking adds the
# token position as a further index level.
TOKENS = (
    SENTS['sent_str']
    .apply(lambda sent: pd.Series(nltk.pos_tag(_tokenize(sent))))
    .stack()
    .to_frame('pos_tuple')
)

In [46]:
# Name the four index levels, from article down to token.
TOKENS.index = TOKENS.index.set_names(['text_num', 'para_num', "sent_num", "token_num"])

In [47]:
# Display the token table: one (token, POS tag) tuple per row, indexed by
# (text_num, para_num, sent_num, token_num).
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple
text_num,para_num,sent_num,token_num,Unnamed: 4_level_1
0,0,0,0,"(two, CD)"
0,0,0,1,"(more, JJR)"
0,0,0,2,"(infants, NNS)"
0,0,0,3,"(have, VBP)"
0,0,0,4,"(contracted, VBN)"
...,...,...,...,...
989,17,2,19,"(healing, NN)"
989,17,2,20,"(planet, NN)"
989,17,2,21,"(to, TO)"
989,17,2,22,"(future, JJ)"


### Create CORPUS

In [48]:
# Work on a copy: a plain `CORPUS = TOKENS` would make both names refer to the
# same frame, so every column added here would also appear on TOKENS.
CORPUS = TOKENS.copy()
# Unpack the (token, tag) tuples into separate columns.
CORPUS['pos'] = CORPUS['pos_tuple'].str[1]
CORPUS['token_str'] = CORPUS['pos_tuple'].str[0]
# term_str is the lower-cased form of the token.
CORPUS['term_str'] = CORPUS['token_str'].str.lower()

In [49]:
# Display CORPUS: the token table plus the pos, token_str and term_str columns.
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str
text_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,0,"(two, CD)",CD,two,two
0,0,0,1,"(more, JJR)",JJR,more,more
0,0,0,2,"(infants, NNS)",NNS,infants,infants
0,0,0,3,"(have, VBP)",VBP,have,have
0,0,0,4,"(contracted, VBN)",VBN,contracted,contracted
...,...,...,...,...,...,...,...
989,17,2,19,"(healing, NN)",NN,healing,healing
989,17,2,20,"(planet, NN)",NN,planet,planet
989,17,2,21,"(to, TO)",TO,to,to
989,17,2,22,"(future, JJ)",JJ,future,future


In [50]:
# Move the index levels (text_num, para_num, sent_num, token_num) into
# ordinary columns. Rebinding the name instead of resetting in place leaves
# any other reference to the same frame untouched.
CORPUS = CORPUS.reset_index()

In [51]:
# Look the source up in LIB rather than relying on a hard-coded row boundary:
# text_num is LIB's row label, so mapping it through LIB['source_id'] stays
# correct if the number of articles per source changes.
CORPUS['source'] = CORPUS['text_num'].map(LIB['source_id'])

In [52]:
# Preview CORPUS indexed by source, text, paragraph, sentence and token.
# NOTE(review): set_index returns a new frame and the result is not assigned,
# so CORPUS itself keeps its integer index (and is written to CSV that way in
# the next cell) — confirm whether that is intended.
CORPUS.set_index(['source', 'text_num', 'para_num', 'sent_num', 'token_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
source,text_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CNN,0,0,0,0,"(two, CD)",CD,two,two
CNN,0,0,0,1,"(more, JJR)",JJR,more,more
CNN,0,0,0,2,"(infants, NNS)",NNS,infants,infants
CNN,0,0,0,3,"(have, VBP)",VBP,have,have
CNN,0,0,0,4,"(contracted, VBN)",VBN,contracted,contracted
...,...,...,...,...,...,...,...,...
CNBC,989,17,2,19,"(healing, NN)",NN,healing,healing
CNBC,989,17,2,20,"(planet, NN)",NN,planet,planet
CNBC,989,17,2,21,"(to, TO)",TO,to,to
CNBC,989,17,2,22,"(future, JJ)",JJ,future,future


In [53]:
### Extracting CORPUS
# No `index` argument is given, so the frame's current index is written as the
# first column of the file (compare LIB, which is written with index=False).
CORPUS.to_csv(data_out+"CORPUS.csv")

### Extracting VOCAB

In [54]:
# Term frequencies: one row per distinct term, sorted alphabetically.
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()   # relative frequency of the term
VOCAB['i'] = -np.log2(VOCAB.p)         # negative log2 of p

# Term x POS count table, built once and reused for both POS features.
pos_counts = CORPUS[['term_str', 'pos']].value_counts().unstack(fill_value=0)
VOCAB['max_pos'] = pos_counts.idxmax(axis=1)    # most frequent tag for the term
VOCAB['n_pos'] = (pos_counts > 0).sum(axis=1)   # number of distinct tags seen

# Flag terms that appear in NLTK's English stop-word list (1 = stop word).
stop_words = set(nltk.corpus.stopwords.words('english'))
VOCAB['stop'] = VOCAB.index.isin(stop_words).astype('int')

# Stem every term with three different stemmers. Mapping over the index
# stems each term directly instead of going through a row-wise DataFrame.apply.
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.index.map(stemmer1.stem)

stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.index.map(stemmer2.stem)

stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.index.map(stemmer3.stem)

# Show the ten most frequent terms.
VOCAB.sort_values('p', ascending=False).head(10)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
the,45744,3,0.056716,4.140108,DT,1,1,the,the,the
to,22702,2,0.028147,5.150871,TO,1,1,to,to,to
of,19297,2,0.023925,5.385314,IN,1,1,of,of,of
and,18537,3,0.022983,5.443282,CC,1,1,and,and,and
a,18140,1,0.022491,5.474516,DT,1,1,a,a,a
in,15926,2,0.019746,5.662306,IN,1,1,in,in,in
that,9787,4,0.012134,6.364752,IN,4,1,that,that,that
for,8271,3,0.010255,6.607557,IN,1,1,for,for,for
on,7263,2,0.009005,6.795053,IN,1,1,on,on,on
is,6956,2,0.008624,6.85736,VBZ,1,1,is,is,is


In [55]:
# Display the vocabulary table in alphabetical term order.
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,201,1,0.000249,11.970351,CD,1,0,0,0,0
00,17,2,0.000021,15.533940,CD,1,0,00,00,00
000,898,3,0.001113,9.810831,CD,1,0,000,000,000
0000,1,4,0.000001,19.621403,CD,1,0,0000,0000,0000
007,6,3,0.000007,17.036440,CD,1,0,007,007,007
...,...,...,...,...,...,...,...,...,...,...
zung,4,4,0.000005,17.621403,NN,1,0,zung,zung,zung
zurich,2,6,0.000002,18.621403,CD,2,0,zurich,zurich,zurich
zwiebel,1,7,0.000001,19.621403,NNP,1,0,zwiebel,zwiebel,zwiebel
zwilling,2,8,0.000002,18.621403,VBG,1,0,zwill,zwill,zwil


In [35]:
### Extracting VOCAB
# The index (term_str) is written as the first column of the file.
VOCAB.to_csv(data_out+"VOCAB.csv")