## HK Protests vs. US Protests

In [63]:
# Standard
import numpy as np
import pandas as pd
import re
import pickle
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing and NLP
import spacy
from nltk.tokenize import sent_tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

# Sentiment
from textblob import TextBlob

In [50]:
# Load the pickled US-protest articles. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('us_protests.p', 'rb') as f:
    df_us = pickle.load(f)

In [51]:
# Load the pickled HK-protest articles. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('hk_protests.p', 'rb') as f:
    df_hk = pickle.load(f)

In [52]:
# Tag every row with its source corpus so the two sets can still be told
# apart once they are combined into a single frame.
df_us['topic'] = 'US'
df_hk['topic'] = 'HK'

In [110]:
# Stack the US and HK frames into one. With the default ignore_index=False
# each frame keeps its own row labels, so labels may repeat in the result.
df = pd.concat([df_us, df_hk])

## Cleaning Articles
- Clean the body of escape characters and convert date strings to dates
- Merge CNN articles on HK protests and US protests into 1 DataFrame
- Remove money, investing, and transport articles (filtered by URL path)
- 186 US protest articles (59%) and 129 HK protest articles (41%) remain

In [111]:
# Cleaning functions
def string_to_date(s):
    """Parse the date that follows the first comma-plus-whitespace in ``s``.

    Everything up to and including that separator is discarded. The rest is
    parsed with ``'%a %B %d, %Y'`` (e.g. ``'Wed June 10, 2020'``) and, if that
    fails, with ``'%B %d, %Y'`` (e.g. ``'June 10, 2020'``).

    Parameters
    ----------
    s : str or None
        Raw date text.

    Returns
    -------
    datetime.datetime or None
        The parsed date, or ``None`` when ``s`` is ``''`` or ``None``.

    Raises
    ------
    AttributeError
        If ``s`` contains no comma followed by whitespace.
    ValueError
        If the text after the separator matches neither layout.
    """
    if s is None or s == '':
        return None
    match = re.search(r'(?<=,\s)(.+$)', s).group(0)
    try:
        return datetime.datetime.strptime(match, '%a %B %d, %Y')
    except ValueError:
        # No weekday prefix: fall back to the plain "Month day, Year" layout.
        return datetime.datetime.strptime(match, '%B %d, %Y')

def clean_body(s):
    """Clean an article body string.

    Steps, in order:

    1. If ``s`` contains a ")", keep only the rest of that line after the
       first ")" (presumably to drop a leading "(CNN)"-style dateline; confirm).
       Otherwise keep ``s`` unchanged.
    2. Remove every apostrophe.
    3. Remove each remaining parenthesised group that opens and closes on the
       same line.

    Parameters
    ----------
    s : str
        Raw article body.

    Returns
    -------
    str
        The cleaned text.
    """
    # Test the match explicitly instead of relying on a bare ``except`` to
    # detect "no closing parenthesis found".
    found = re.search(r'(?<=\))(.*)', s)
    text = s if found is None else found.group(0)

    text = text.replace('\'', '')
    return re.sub(r'\(.*?\)', '', text)
    
def replace_words(s):
    """Rewrite common dotted abbreviations into dot-free tokens.

    "U.S.", "U.S.A." and a standalone "US" all become "USA"; "U.K." becomes
    "UK"; "Gov." becomes "governor"; "Mr.", "Mrs." and "Ms." become "MR",
    "MRS" and "MS"; "Co." becomes "Co"; a "..." ellipsis is removed.
    Presumably this keeps the periods from being read as sentence
    boundaries; confirm against the caller.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The text with the substitutions applied.
    """
    s = s.replace('Gov.', 'governor')
    # Longest form first: rewriting "U.S." before "U.S.A." left a stray "A."
    # behind, which the US -> USA step then turned into "USAA.".
    s = re.sub(r'U\.S\.A\.', 'USA', s)
    s = re.sub(r'U\.S\.', 'US', s)
    # Whole-word match only, so an existing "USA" (or e.g. "USDA") is not
    # expanded a second time.
    s = re.sub(r'\bUS\b', 'USA', s)
    s = re.sub(r'U\.K\.', 'UK', s)
    s = re.sub(r'Mr\.', 'MR', s)
    s = re.sub(r'Mrs\.', 'MRS', s)
    s = re.sub(r'Ms\.', 'MS', s)
    s = re.sub(r'\.\.\.', '', s)
    # Dot escaped so only the literal "U.S-China" matches.
    # NOTE(review): this yields "US-China" while "U.S.-China" yields
    # "USA-China"; confirm which spelling is wanted.
    s = re.sub(r'U\.S-China', 'US-China', s)
    s = s.replace('Co.', 'Co')

    return s

In [112]:
# Convert the raw date strings to datetimes and clean each article body.
# Not safe to re-run on its own: string_to_date runs a regex over its input,
# so it needs the raw strings rather than already-parsed dates.
df['date'] = df['date'].map(string_to_date)
df['body'] = df['body'].map(clean_body)

In [113]:
# Represent empty bodies as missing (None) instead of the empty string.
df['body'] = np.where(df['body'] == '', None, df['body'])

In [114]:
# Drop every article whose URL contains /money/, /investing/ or /transport/.
df = df.loc[~df['url'].str.contains(r'/money/|/investing/|/transport/')]

In [115]:
# Renumber the rows 0..n-1 after filtering. drop=True discards the old labels
# directly instead of inserting them as an 'index' column that then has to be
# dropped, and rebinding avoids in-place mutation of a filtered slice.
df = df.reset_index(drop=True)

In [116]:
# Class balance after filtering: absolute counts, then proportions.
print(df['topic'].value_counts())
print(df['topic'].value_counts(normalize=True))

US    186
HK    129
Name: topic, dtype: int64
US    0.590476
HK    0.409524
Name: topic, dtype: float64


In [119]:
def preprocess_body(text, stopwords):
    """Tokenise an article body and drop stopwords.

    Parameters
    ----------
    text : str or None
        Article body. Any non-string value (e.g. ``None`` for a missing body)
        yields an empty list instead of raising inside the tokenizer.
    stopwords : container of str
        Tokens to exclude, tested with ``in``.

    Returns
    -------
    list of str
        Tokens from gensim's ``simple_preprocess`` that are not stopwords.
    """
    if not isinstance(text, str):
        return []
    return [word for word in simple_preprocess(text) if word not in stopwords]

def preprocess_sent(texts, stopwords):
    """Tokenise each sentence, drop stopwords, and discard empty results.

    Parameters
    ----------
    texts : iterable of str
        Sentences to process.
    stopwords : container of str
        Tokens to exclude, tested with ``in``.

    Returns
    -------
    list of list of str
        One token list per sentence that still has at least one token left
        after stopword removal, in the original order.
    """
    tokenized = (
        [tok for tok in simple_preprocess(sentence) if tok not in stopwords]
        for sentence in texts
    )
    # Sentences made up only of stopwords collapse to [] and are skipped.
    return [tokens for tokens in tokenized if tokens]

In [120]:
# Stopword collection passed to the preprocess_* helpers (gensim's STOPWORDS).
stopwords = STOPWORDS
# spaCy pipeline loaded from the "en_core_web_sm" model.
nlp = spacy.load("en_core_web_sm")

In [121]:
# Display the merged, filtered frame for a visual sanity check.
df

Unnamed: 0,headline,body,date,url,topic
0,New York passes a police reform bill package t...,New York legislators passed a package of bills...,2020-06-10,https://www.cnn.com/2020/06/10/us/new-york-pas...,US
1,3 recordings. 3 cries of 'I can't breathe.' 3 ...,In the days since George Floyds cries that he ...,2020-06-10,https://www.cnn.com/2020/06/10/us/cant-breathe...,US
2,The PGA Tour will leave the 8:46 tee time open...,The PGA Tour plans to honor George Floyd and t...,2020-06-10,https://www.cnn.com/2020/06/10/us/pga-tour-tee...,US
3,Tennessee police chief tells officers they now...,"Police officers in Chattanooga, Tennessee, now...",2020-06-10,https://www.cnn.com/2020/06/10/us/tn-police-of...,US
4,Oakland police investigate state patrolmen's f...,Investigators are looking into the California ...,2020-06-10,https://www.cnn.com/2020/06/10/us/oakland-poli...,US
...,...,...,...,...,...
310,"5 things to know for June 12: Hong Kong, campa...",If youre into shutting down lemonade stands ru...,2019-06-12,https://www.cnn.com/2019/06/12/us/five-things-...,HK
311,,,NaT,https://www.cnn.com/2019/06/11/asia/hong-kong-...,HK
312,"5 things to know for June 10: David Ortiz, Mex...",Check out the building in Barcelona that final...,2019-06-10,https://www.cnn.com/2019/06/10/us/five-things-...,HK
313,Fully occupied? Hong Kong's protest hotel,The face of Hong Kongs chief executive covers ...,2015-01-29,https://www.cnn.com/2015/01/29/world/hong-kong...,HK


In [123]:
# Run the spaCy pipeline on one article body (positional row 314) so its
# tokens can be inspected.
doc = nlp(df['body'].iloc[314])

In [135]:
# For every token that is not a single space, print its text, POS, tag and
# stopword flag on separate lines, followed by a blank line.
for token in doc:
    if token.text == ' ':
        continue
    print(f"{token.text} \n {token.pos_} \n {token.tag_} \n {token.is_stop} \n")

search 
 NOUN 
 NN 
 False 

engine 
 NOUN 
 NN 
 False 

Baidu.com 
 PROPN 
 NNP 
 False 

revealed 
 VERB 
 VBD 
 False 

a 
 DET 
 DT 
 True 

list 
 NOUN 
 NN 
 False 

of 
 ADP 
 IN 
 True 

mostly 
 ADV 
 RB 
 True 

positive 
 ADJ 
 JJ 
 False 

articles 
 NOUN 
 NNS 
 False 

about 
 ADP 
 IN 
 True 

the 
 DET 
 DT 
 True 

DPRK 
 NOUN 
 NN 
 False 

. 
 PUNCT 
 . 
 False 

A 
 DET 
 DT 
 True 

Baidu 
 PROPN 
 NNP 
 False 

search 
 NOUN 
 NN 
 False 

for 
 ADP 
 IN 
 True 

" 
 PUNCT 
 `` 
 False 

North 
 PROPN 
 NNP 
 False 

Korea 
 PROPN 
 NNP 
 False 

hack 
 NOUN 
 NN 
 False 

" 
 PUNCT 
 '' 
 False 

in 
 ADP 
 IN 
 True 

English 
 PROPN 
 NNP 
 False 

revealed 
 VERB 
 VBD 
 False 

just 
 ADV 
 RB 
 True 

one 
 NUM 
 CD 
 True 

nearly 
 ADV 
 RB 
 False 

two 
 NUM 
 CD 
 True 

- 
 PUNCT 
 HYPH 
 False 

week 
 NOUN 
 NN 
 False 

- 
 PUNCT 
 HYPH 
 False 

old 
 ADJ 
 JJ 
 False 

article 
 NOUN 
 NN 
 False 

naming 
 VERB 
 VBG 
 False 

the 
 DET 
 DT 
 T

 CD 
 True 

thing 
 NOUN 
 NN 
 False 

is 
 AUX 
 VBZ 
 True 

certain 
 ADJ 
 JJ 
 False 

: 
 PUNCT 
 : 
 False 

International 
 ADJ 
 JJ 
 False 

news 
 NOUN 
 NN 
 False 

organizations 
 NOUN 
 NNS 
 False 

such 
 ADJ 
 JJ 
 True 

as 
 SCONJ 
 IN 
 True 

CNN 
 PROPN 
 NNP 
 False 

will 
 VERB 
 MD 
 True 

continue 
 VERB 
 VB 
 False 

with 
 ADP 
 IN 
 True 

extensive 
 ADJ 
 JJ 
 False 

coverage 
 NOUN 
 NN 
 False 

. 
 PUNCT 
 . 
 False 

And 
 CCONJ 
 CC 
 True 

, 
 PUNCT 
 , 
 False 

thanks 
 NOUN 
 NNS 
 False 

to 
 ADP 
 IN 
 True 

heavy 
 ADJ 
 JJ 
 False 

- 
 PUNCT 
 HYPH 
 False 

handed 
 ADJ 
 JJ 
 False 

government 
 NOUN 
 NN 
 False 

censors 
 NOUN 
 NNS 
 False 

, 
 PUNCT 
 , 
 False 

most 
 ADJ 
 JJS 
 True 

citizens 
 NOUN 
 NNS 
 False 

of 
 ADP 
 IN 
 True 

China 
 PROPN 
 NNP 
 False 

will 
 VERB 
 MD 
 True 

continue 
 VERB 
 VB 
 False 

get 
 AUX 
 VB 
 True 

only 
 ADV 
 RB 
 True 

the 
 DET 
 DT 
 True 

news 
 NOUN 
 NN 
 Fals

In [None]:
# Split each body into sentences. sent_tokenize only accepts strings, so a
# missing body (None/NaN) gets an empty sentence list instead of raising.
df['sentences'] = df['body'].map(
    lambda body: sent_tokenize(body) if isinstance(body, str) else []
)

In [None]:
# Article-level tokens: each body tokenised with stopwords removed.
df['tokens'] = df['body'].map(lambda x: preprocess_body(x, stopwords))

In [None]:
# Sentence-level tokens: one stopword-filtered token list per non-empty sentence.
df['sent_tokens'] = df['sentences'].map(lambda x: preprocess_sent(x, stopwords))

In [None]:
# Article length measured in stopword-filtered tokens and in sentences.
df['word_length'] = df['tokens'].map(len)
df['sent_length'] = df['sentences'].map(len)

In [None]:
# Spot-check the sentence split on the first article.
df['sentences'].iloc[0]

In [None]:
# Run the spaCy pipeline on the first article body.
doc = nlp(df['body'].iloc[0])

## Early EDA
- Classes and mean lengths:
    - HK Articles - 94 (mean 365 words and 30 sentences)
    - US Articles - 183 (mean 367 words and 35 sentences)
- Dropped articles with word length > 1000 (6 articles, 2 HK and 3 US — TODO: this breakdown sums to 5, not 6; verify the counts); all seem to be updates on older articles or long-form pieces

In [None]:
# Keep only articles with at most 1000 tokens.
df = df[df['word_length'].le(1000)]

In [None]:
# Mean article length per topic, in tokens and in sentences.
print(df.groupby('topic')['word_length'].mean())
print(df.groupby('topic')['sent_length'].mean())