### NLP with ChannelNewsAsia Articles 

In [1]:
# Build the English spaCy pipeline once; `nlp` is reused further down to
# parse each article and read its named entities.
# NOTE(review): `spacy.en` looks like the spaCy 1.x import path — confirm the
# pinned spaCy version before re-running.
from spacy.en import English
nlp = English()

### BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [2]:
from bs4 import BeautifulSoup
import urllib2

In [3]:
def getSoupFromUrl(url):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with a ``"Chrome"`` User-Agent header and the response
    body is parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        BeautifulSoup: The parsed document.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network or HTTP errors;
        they are not caught here.
    """
    req = urllib2.Request(url, headers={'User-Agent': "Chrome"})
    con = urllib2.urlopen(req)
    try:
        # Read inside try/finally so the connection is released even when the
        # download fails part-way through.
        page = con.read()
    finally:
        con.close()
    # Parsing works on the already-downloaded bytes, so it can happen after
    # the connection has been closed.
    return BeautifulSoup(page, "lxml")

In [None]:
# Disabled cell: the scraping code is wrapped in a string literal, so nothing
# below is executed. It is kept as a record of how the articles were collected.
'''
# Sample code for scraping used
base_url = 'http://www.channelnewsasia.com'
articles = []
num_pages = 2
for i in range(num_pages):
    url = 'http://www.channelnewsasia.com/archives/8396078/news?pageNum={}&channelId=7469254'.format(i)
    soup = getSoupFromUrl(url)
    print("..{}".format(i))
    for link in soup.find_all("div", attrs={"class": "c-result-section--default"})[0].find_all("a", attrs={"class":"teaser__title"}):
        #print(base_url + link.get("href"))
        soup = getSoupFromUrl(base_url + link.get("href"))
        paras = soup.find_all("div", attrs={"class":"c-rte--article"})[0].find_all("p")
        content = " ".join([para.getText() for para in paras])
        articles.append(content)
'''

In [54]:
# Disabled cell (string literal, not executed): writes each article to
# data/news.txt, whitespace-collapsed, one article per line.
# NOTE(review): if this is re-enabled it needs `re`, which no visible cell
# imports, and `codecs`, which is only imported in a later cell.
''' # Saving Articles
filename = 'data/news.txt'
file = codecs.open(filename, "w", "utf-8")
for article in articles:
    file.write(re.sub( '\s+', ' ', article).strip() + '\n')
file.close()
''' 

In [2]:
import codecs

In [3]:
# Reading saved articles: data/news.txt holds one article per line.
filename = 'data/news.txt'
# The `with` block closes the file handle as soon as reading finishes (or
# fails); the original left it open for the lifetime of the kernel.
with codecs.open(filename, 'r', encoding="utf-8") as news_file:
    # Each entry keeps its trailing newline, exactly as before.
    articles = list(news_file)

In [4]:
# Number of articles loaded (one per line of the input file).
len(articles)

10000

### CountVectorizer

In [5]:
from sklearn.feature_extraction import text 
# Extra words to exclude on top of scikit-learn's built-in English stop-word set.
additional_stop_words = ["singapore", "said", "mr", "want", "say", "year"]
# Set union of the built-in frozenset and the extras; the result is a frozenset.
stop_words = text.ENGLISH_STOP_WORDS | frozenset(additional_stop_words)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts per article (binary=False), skipping stop words and any term
# that appears in fewer than 3 articles (min_df=3).
# `stop_words` is handed over as a list: recent scikit-learn releases validate
# this parameter and reject a frozenset, while older releases accept a list too.
# Sorting only makes the list order deterministic.
cv = CountVectorizer(binary=False, stop_words=sorted(stop_words), min_df=3)
articles_cv = cv.fit_transform(articles)

### Gensim 

In [7]:
# Column index -> term lookup for gensim.
# Built by inverting the fitted `vocabulary_` (term -> column index) instead of
# calling `get_feature_names()`, which newer scikit-learn releases no longer
# provide. `int(...)` normalises indices that may be numpy integers.
id2word = {int(index): term for term, index in cv.vocabulary_.items()}

In [8]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

# First we convert our word-matrix into gensim's format.
# documents_columns=False because rows of articles_cv are documents.
article_corpus = Sparse2Corpus(articles_cv, documents_columns = False)

num_topics = 20

# LDA training is randomised; fixing the seed makes the topics (and every
# per-document result derived from them) repeatable across runs.
LDA_SEED = 42

# Then we fit an LDA model
lda_model = LdaModel(corpus=article_corpus, id2word=id2word, num_topics=num_topics,
                     random_state=LDA_SEED)

In [9]:
num_words_per_topic = 20
# Print every topic as returned by gensim: an (id, weighted-word-string) pair.
topic_summaries = lda_model.show_topics(num_topics=num_topics, num_words=num_words_per_topic)
for position, summary in enumerate(topic_summaries):
    print("Topic: {} \n{}\n".format(position, summary))

Topic: 0 
(0, u'0.011*"transport" + 0.010*"bus" + 0.010*"train" + 0.009*"smrt" + 0.008*"station" + 0.007*"line" + 0.007*"lta" + 0.006*"public" + 0.005*"commuters" + 0.005*"services" + 0.005*"new" + 0.005*"time" + 0.005*"trains" + 0.004*"service" + 0.004*"east" + 0.004*"stations" + 0.004*"rail" + 0.004*"transit" + 0.004*"mrt" + 0.004*"added"')

Topic: 1 
(1, u'0.009*"minister" + 0.008*"home" + 0.007*"ministry" + 0.006*"community" + 0.006*"care" + 0.006*"added" + 0.005*"dr" + 0.005*"health" + 0.005*"security" + 0.005*"public" + 0.004*"patients" + 0.004*"residents" + 0.004*"social" + 0.004*"singaporeans" + 0.004*"shanmugam" + 0.004*"children" + 0.004*"support" + 0.004*"lee" + 0.003*"affairs" + 0.003*"family"')

Topic: 2 
(2, u'0.009*"team" + 0.006*"tan" + 0.005*"football" + 0.005*"games" + 0.005*"time" + 0.005*"league" + 0.004*"added" + 0.004*"scdf" + 0.004*"national" + 0.004*"told" + 0.004*"facebook" + 0.004*"president" + 0.003*"sports" + 0.003*"years" + 0.003*"court" + 0.003*"players" +

In [10]:
# Show the topic mixture of the first twelve documents (rows 0 through 11).
for row, mixture in enumerate(lda_model.get_document_topics(article_corpus)):
    print("Row: {} Topics: {}".format(row, mixture))
    if row >= 11:
        break

Row: 0 Topics: [(2, 0.94313617391685223), (16, 0.048125961859806464)]
Row: 1 Topics: [(0, 0.031871984987507693), (1, 0.23459731247521295), (8, 0.17779529384003226), (11, 0.039186833898108717), (15, 0.048638803702314042), (17, 0.020255652862940992), (19, 0.44557744087899009)]
Row: 2 Topics: [(1, 0.016597708647520384), (2, 0.042576130111019322), (3, 0.073125853536057425), (8, 0.27839679528899625), (16, 0.010802079899069719), (19, 0.54248207871863685)]
Row: 3 Topics: [(7, 0.45772551756543089), (17, 0.29473828177912054), (18, 0.086638222211567958), (19, 0.15732654980537428)]
Row: 4 Topics: [(1, 0.041990253567746397), (2, 0.10728189329430193), (13, 0.77212659601106581), (17, 0.073569810489947804)]
Row: 5 Topics: [(2, 0.6628202643727531), (7, 0.28464405726638531), (16, 0.039656890246462878)]
Row: 6 Topics: [(2, 0.42123228736234891), (4, 0.15820640109673589), (7, 0.025325549970156298), (13, 0.093951677288616878), (19, 0.29821031372634998)]
Row: 7 Topics: [(2, 0.026742459170311525), (3, 0.0687

In [11]:
# Spot-check the raw text of one article (row 9).
print(articles[9])

HAMBURG, Germany: Singapore and Japan have expressed their continued support to explore ways to continue with the Trans-Pacific Partnership (TPP), when the prime ministers of both countries met on the sidelines of the G20 Leaders' Summit on Saturday (Jul 8).  Prime Minister Lee Hsien Loong and his Japanese counterpart Shinzo Abe also exchanged views on developments on the Korean Peninsula. "PM Lee expressed Singapore’s grave concerns over the escalating tensions which could jeopardise the peace and stability of the region," said a statement from the Prime Minister's Office (PMO). Both leaders also looked forward to further enhancing bilateral cooperation and noted "with satisfaction" that Singapore and Japan successfully held events to celebrate 50 years of diplomatic relations last year. "Japan is ready to further develop our cooperative relationship with Singapore towards the next 50 years," said Mr Abe. The two leaders also discussed the Kuala Lumpur-Singapore High Speed Rail and PM

In [12]:
import numpy as np
# Dense (documents x topics) matrix, zero-initialised; filled from the LDA output later.
doc_topics_array = np.zeros((len(articles), num_topics))

In [13]:
# Inspect the freshly allocated matrix (all zeros until the fill loop runs).
doc_topics_array

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [14]:
# Scatter each document's sparse (topic_id, weight) pairs into its row of the
# dense matrix; topics gensim does not report for a document stay at 0.
for row, mixture in enumerate(lda_model.get_document_topics(article_corpus)):
    for topic_id, weight in mixture:
        doc_topics_array[row, topic_id] = weight

In [15]:
# Sanity-check dimensions: one row per article, one column per topic.
doc_topics_array.shape

(10000, 20)

In [16]:
from scipy import spatial
# Index the document-topic vectors so that articles with similar topic
# mixtures can be found with nearest-neighbour queries.
tree = spatial.KDTree(doc_topics_array)

In [17]:
# Five nearest neighbours of article 200; the result is (distances, row indices).
# The query point itself is part of the tree, so it comes back first at distance 0.
tree.query(doc_topics_array[200], k=5)

(array([ 0.        ,  0.13329878,  0.13538154,  0.13680232,  0.14843561]),
 array([ 200, 9474, 9993, 4131,  930]))

In [18]:
# Topic weights of article 200, the point used for the neighbour query.
doc_topics_array[200]

array([ 0.        ,  0.        ,  0.33021903,  0.18812824,  0.11019738,
        0.        ,  0.17764187,  0.        ,  0.        ,  0.01131917,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.18036016])

In [23]:
# Read article 930, one of the neighbours returned for article 200.
print(articles[930])




### Word2Vec 

In [24]:
# Whitespace-tokenise every article (no lower-casing or punctuation stripping).
# NOTE(review): this rebinds `text`, which an earlier cell imported as the
# sklearn.feature_extraction `text` module; the Word2Vec cell relies on this
# name, so a rename has to change both cells together.
text = [x.split() for x in articles]

In [25]:
from gensim.models import Word2Vec
# Train word embeddings on the tokenised articles: 100-dimensional vectors,
# context window of 5, ignoring words seen fewer than 5 times, 4 worker threads.
# NOTE(review): `size=` appears to be the pre-4.0 gensim keyword (later renamed
# `vector_size`) — confirm the pinned gensim version.
model = Word2Vec(text, size=100, window=5, min_count=5, workers=4)

In [30]:
# Words whose vectors are closest to 'PAP'. Similarity queries live on the
# model's keyed vectors (`model.wv`); calling them on the model is deprecated.
model.wv.most_similar(positive=['PAP'])

[(u'opposition', 0.786681592464447),
 (u'SDP', 0.7367699146270752),
 (u'Party', 0.7248833775520325),
 (u'FAS', 0.6891103386878967),
 (u'House', 0.6889873743057251),
 (u'WP', 0.6832637190818787),
 (u'Cabinet', 0.6769022941589355),
 (u'candidate', 0.6729015111923218),
 (u"Workers'", 0.6587272882461548),
 (u'President', 0.6578125357627869)]

In [28]:
# Words whose vectors are closest to 'HDB'. Similarity queries live on the
# model's keyed vectors (`model.wv`); calling them on the model is deprecated.
model.wv.most_similar(positive=['HDB'])

[(u'flats', 0.748914361000061),
 (u'2-room', 0.6920130848884583),
 (u'non-mature', 0.6850610971450806),
 (u'Flexi', 0.685046374797821),
 (u'BTO', 0.6770901083946228),
 (u'flats,', 0.6729371547698975),
 (u'resale', 0.6695283651351929),
 (u'two-room', 0.6662867069244385),
 (u'(BTO)', 0.6644257307052612),
 (u'rental', 0.6583526134490967)]

### spaCy

In [105]:
from collections import Counter

# Count how often each PERSON / ORG entity string occurs across all articles.
WANTED_ENTITY_LABELS = ("PERSON", "ORG")
c = Counter()
# nlp.pipe streams the articles through the pipeline in batches, which is the
# documented way to process many texts; the parsed documents are the same as
# calling nlp() on each article one at a time.
for parsed_article in nlp.pipe(articles):
    # Feed a generator straight into the counter; no intermediate list needed.
    c.update(ent.text for ent in parsed_article.ents if ent.label_ in WANTED_ENTITY_LABELS)

In [106]:
# The 50 most frequent entity strings with their occurrence counts.
c.most_common(50)

[(u'Mr Lee', 2294),
 (u'Channel NewsAsia', 2019),
 (u'Parliament', 1876),
 (u'Jan', 1338),
 (u'Jun', 1304),
 (u'Mar', 1286),
 (u'NEA', 1105),
 (u'LTA', 1064),
 (u'SMRT', 978),
 (u'HDB', 832),
 (u'MAS', 779),
 (u'Lee Hsien Loong', 768),
 (u'Lee', 762),
 (u'Bharati', 708),
 (u'SCDF', 704),
 (u'ASEAN', 612),
 (u'State', 589),
 (u'Mr Lim', 574),
 (u'Tan', 557),
 (u'AVA', 540),
 (u'Lim', 522),
 (u'MOH', 513),
 (u'Dr Tan', 498),
 (u'Yang', 496),
 (u'PM Lee', 457),
 (u'Mr Teo', 438),
 (u'Mr Tan', 435),
 (u'Bharati Jagdish', 433),
 (u'MFA', 432),
 (u'NUS', 423),
 (u'MOE', 422),
 (u'Dr Ng', 417),
 (u'MOM', 401),
 (u'SIA', 398),
 (u'Mr Wong', 393),
 (u'Mediacorp', 385),
 (u'Mr Ng', 377),
 (u'NTU', 373),
 (u'MP', 353),
 (u'Mr Ong', 351),
 (u'Facebook', 344),
 (u'Mr Shanmugam', 325),
 (u'NTUC', 325),
 (u'ICA', 321),
 (u'Budget', 320),
 (u'DPP', 319),
 (u'Dr Balakrishnan', 318),
 (u'MINDEF', 317),
 (u'Mr Nathan', 317),
 (u'SPF', 315)]