## Importing Libraries

In [3]:
import os
import re
import warnings

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import vaderSentiment
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
warnings.filterwarnings("ignore")

## Importing LIB

In [4]:
# Location of the article library CSV. The LIB_CSV environment variable overrides the
# default, so the notebook can run on machines other than the one it was authored on.
LIB_PATH = os.environ.get(
    "LIB_CSV",
    "C:/Users/Student/Desktop/UVA/UVA '23 Spring/DS 5001/Final Project/LIB.csv",
)
LIB = pd.read_csv(LIB_PATH)

In [5]:
# Preview ten random articles; the fixed random_state makes the preview reproducible
# across Restart & Run All.
LIB.sample(10, random_state=42)

Unnamed: 0,source_id,date,url,title,text
148,CNN,2020/11/19,https://www.cnn.com/2020/11/19/politics/mitt-r...,politic mitt romney lame duck transition conse...,Republican Sen. Mitt Romney is warning that th...
543,CNBC,2020/11/04,https://www.cnbc.com/2020/11/04/fords-new-ceo-...,ford new ceo promise investor transparency re...,In this article+++Ford Motor will release its ...
105,CNN,,https://www.cnn.com/travel/article/covid-hawai...,travel article covid hawaii couple arrest,A couple from Hawaii was arrested over the wee...
207,CNN,2020/10/28,https://www.cnn.com/2020/10/28/politics/anonym...,politic anonymous new york time ope writer,The anonymous senior Trump administration offi...
899,CNBC,2020/09/01,https://www.cnbc.com/2020/09/01/fauci-debunks-...,fauci debunk theory low cdc coronavirus death...,White House coronavirus advisor Dr. Anthony Fa...
650,CNBC,2020/11/11,https://www.cnbc.com/2020/11/11/covid-19-new-c...,new coronavirus case trump white house election,Two more people who attended an Election Night...
295,CNN,2020/11/23,https://www.cnn.com/2020/11/23/politics/joe-bi...,politic joe biden foreign policy national secu...,President-elect Joe Biden on Monday unveiled a...
426,CNN,2020/09/04,https://www.cnn.com/2020/09/04/business/big-lo...,business big lot stores retail,Retail bankruptcies are piling up during the p...
74,CNN,2020/09/07,https://www.cnn.com/2020/09/07/politics/trump-...,politic trump attack military leadership,President Donald Trump launched an unprecedent...
272,CNN,2020/12/30,https://www.cnn.com/2020/12/30/politics/trump-...,politic trump return washington early,President Donald Trump and first lady Melania ...


## Sorting into Paragraphs

In [6]:
# Split each article's text on the literal "+++" delimiter, giving one row per
# (text_num, para_num). The pattern is a raw string so the backslashes reach the regex
# engine unchanged instead of being parsed as (invalid) Python string escapes.
PARAS = (
    LIB['text']
    .str.split(r"\+\+\+", expand=True)
    .stack()
    .to_frame('para_str')
    .sort_index()
)
PARAS.index.names = ['text_num', 'para_num']

# Turn newlines into spaces, trim surrounding whitespace, then drop paragraphs that
# are empty or whitespace-only.
PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

In [7]:
# Inspect the paragraph table, indexed by (text_num, para_num).
PARAS

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
text_num,para_num,Unnamed: 2_level_1
0,0,Two more infants have contracted the herpes vi...
0,1,"In the ritual, known as metzitzah b’peh, after..."
0,2,Most adults are infected with the herpes simpl...
0,3,“While HSV-1 in adults can cause the common co...
0,4,"Since 2000, there have been 13 reports in New ..."
...,...,...
989,13,"Mayorkas: ""The Department of Homeland Security..."
989,14,"Thomas-Greenfield: ""My fellow career diplomats..."
989,15,"Haines: ""Mr. President-elect, you know that Iv..."
989,16,"Sullivan: Mr. President-elect, ""You have also ..."


## Sorting into Sentences

In [8]:
# Break every paragraph into sentences with nltk.sent_tokenize; stacking the per-paragraph
# Series adds a third index level that numbers the sentences within each paragraph.
SENTS = (
    PARAS['para_str']
    .apply(lambda paragraph: pd.Series(nltk.sent_tokenize(paragraph)))
    .stack()
    .to_frame('sent_str')
)
SENTS.index.names = ['text_num', 'para_num', 'sent_num']

In [9]:
# standardizing text
# Replace every non-word character with a space, then lowercase.
# regex=True is passed explicitly: pandas 2.0 changed the default of Series.str.replace
# to regex=False, under which r'\W' is matched as a literal two-character string and
# no punctuation would be removed.
SENTS['sent_str'] = SENTS['sent_str'].str.replace(r'\W', ' ', regex=True).str.lower()
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
text_num,para_num,sent_num,Unnamed: 3_level_1
0,0,0,two more infants have contracted the herpes vi...
0,1,0,in the ritual known as metzitzah b peh after...
0,1,1,antibacterial ointment is applied and the woun...
0,1,2,the health department says the procedure is da...
0,2,0,most adults are infected with the herpes simpl...
...,...,...,...
989,16,0,sullivan mr president elect you have also ...
989,16,1,you have told us the alliances we rebuild the...
989,17,0,kerry the road ahead is exciting
989,17,1,it means creating millions of middle class job...


## Getting Tokens

In [10]:
# Choose the tokenizer, then tokenize + POS-tag every sentence in a single pipeline.
# As wired here, keep_whitespace=True selects nltk.word_tokenize and False selects
# nltk.WhitespaceTokenizer.
# NOTE(review): the flag name suggests the opposite mapping — confirm which is intended.
keep_whitespace = True
_tokenize = nltk.word_tokenize if keep_whitespace else nltk.WhitespaceTokenizer().tokenize

# Each sentence becomes a Series of (token, POS tag) tuples; stacking yields one row per token.
TOKENS = (
    SENTS['sent_str']
    .apply(lambda sentence: pd.Series(nltk.pos_tag(_tokenize(sentence))))
    .stack()
    .to_frame('pos_tuple')
)

In [11]:
# Name the four index levels: article, paragraph, sentence, and token position.
token_index_levels = ['text_num', 'para_num', 'sent_num', 'token_num']
TOKENS.index.names = token_index_levels
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple
text_num,para_num,sent_num,token_num,Unnamed: 4_level_1
0,0,0,0,"(two, CD)"
0,0,0,1,"(more, JJR)"
0,0,0,2,"(infants, NNS)"
0,0,0,3,"(have, VBP)"
0,0,0,4,"(contracted, VBN)"
...,...,...,...,...
989,17,2,19,"(healing, NN)"
989,17,2,20,"(planet, NN)"
989,17,2,21,"(to, TO)"
989,17,2,22,"(future, JJ)"


## Making the Corpus

In [12]:
# CORPUS is bound to the same object as TOKENS (no copy is made), so the columns added
# below appear on TOKENS as well.
CORPUS = TOKENS

# Unpack each (token, POS tag) tuple into its own column.
CORPUS['pos'] = [tag for _, tag in CORPUS['pos_tuple']]
CORPUS['token_str'] = [token for token, _ in CORPUS['pos_tuple']]

# Lowercased form of the token string.
CORPUS['term_str'] = CORPUS['token_str'].str.lower()

In [13]:
# Inspect CORPUS with its pos, token_str and term_str columns.
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str
text_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,0,"(two, CD)",CD,two,two
0,0,0,1,"(more, JJR)",JJR,more,more
0,0,0,2,"(infants, NNS)",NNS,infants,infants
0,0,0,3,"(have, VBP)",VBP,have,have
0,0,0,4,"(contracted, VBN)",VBN,contracted,contracted
...,...,...,...,...,...,...,...
989,17,2,19,"(healing, NN)",NN,healing,healing
989,17,2,20,"(planet, NN)",NN,planet,planet
989,17,2,21,"(to, TO)",TO,to,to
989,17,2,22,"(future, JJ)",JJ,future,future


In [21]:
# Move the index levels (text_num, para_num, sent_num, token_num) into ordinary columns
# so text_num can be used as a column.
# Guarded so that re-running this cell does not insert a stray 'index' column.
if 'text_num' not in CORPUS.columns:
    CORPUS.reset_index(inplace=True)

In [22]:
# Label every token with its article's outlet by looking text_num up in LIB['source_id'].
# text_num is LIB's row index, carried through the paragraph/sentence/token splits, so
# this replaces the hard-coded row cutoff (489) and stays correct if LIB's rows change.
CORPUS['source'] = CORPUS['text_num'].map(LIB['source_id'])

In [23]:
# Show CORPUS with a hierarchical index led by source.
# set_index returns a new frame that is only displayed here; the result is not assigned,
# so CORPUS itself keeps its flat columns.
corpus_index_columns = ['source', 'text_num', 'para_num', 'sent_num', 'token_num']
CORPUS.set_index(corpus_index_columns)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
source,text_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CNN,0,0,0,0,"(two, CD)",CD,two,two
CNN,0,0,0,1,"(more, JJR)",JJR,more,more
CNN,0,0,0,2,"(infants, NNS)",NNS,infants,infants
CNN,0,0,0,3,"(have, VBP)",VBP,have,have
CNN,0,0,0,4,"(contracted, VBN)",VBN,contracted,contracted
...,...,...,...,...,...,...,...,...
CNBC,989,17,2,19,"(healing, NN)",NN,healing,healing
CNBC,989,17,2,20,"(planet, NN)",NN,planet,planet
CNBC,989,17,2,21,"(to, TO)",TO,to,to
CNBC,989,17,2,22,"(future, JJ)",JJ,future,future


In [28]:
# Write the token-level corpus table to CORPUS.csv (path is relative to the working directory).
CORPUS.to_csv("CORPUS.csv")

## Extracting VOCAB

In [26]:
# Display the vocabulary table.
# NOTE(review): no cell visible in this notebook assigns VOCAB — unless the defining
# cell(s) exist elsewhere, this fails on a fresh kernel. TODO confirm where VOCAB is built.
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,201,1,0.000249,11.970351,CD,1,0,0,0,0
00,17,2,0.000021,15.533940,CD,1,0,00,00,00
000,898,3,0.001113,9.810831,CD,1,0,000,000,000
0000,1,4,0.000001,19.621403,CD,1,0,0000,0000,0000
007,6,3,0.000007,17.036440,CD,1,0,007,007,007
...,...,...,...,...,...,...,...,...,...,...
zung,4,4,0.000005,17.621403,NN,1,0,zung,zung,zung
zurich,2,6,0.000002,18.621403,CD,2,0,zurich,zurich,zurich
zwiebel,1,7,0.000001,19.621403,NNP,1,0,zwiebel,zwiebel,zwiebel
zwilling,2,8,0.000002,18.621403,VBG,1,0,zwill,zwill,zwil


In [27]:
# Write the vocabulary table to VOCAB.csv (path is relative to the working directory).
VOCAB.to_csv("VOCAB.csv")

## Sentiment Analysis

In [16]:
# Create the VADER sentiment analyser once for reuse; the vaderSentiment imports live in
# the notebook's import cell so all dependencies are declared in one place.
analyser = SentimentIntensityAnalyzer()

TypeError: can only concatenate str (not "tuple") to str