In [1]:
import pickle
from lib.utility import ProcessPipeline
from lib.nlp import entityPipeline,featurePipeline
import pandas as pd
import numpy as np

# Part 1: data engineering

In [3]:
### read pickle file
with open('outputs/step2_news_raw.pickle', 'rb') as handle:
    texts = pickle.load(handle)

### Input: list of string

In [4]:
len(texts)

2988

In [5]:
processPipe = ProcessPipeline(texts=texts,steps=["langdetection","summarization",'tokenization'],
                           tokenization_steps=['remove_digits','remove_punctuation',"remove_stopwords",'lemmatization','stemmization'],stopwordsWhiteList=["n't", "not", "no"])

In [6]:
tokens = processPipe.run(workers=6,return_str=False)

# Part 2: Topic Modeling

In [7]:
tokens[0][:10]

['forum',
 'address',
 'one',
 'press',
 'issu',
 'lifetim',
 'global',
 'energi',
 'climat',
 'chang']

In [9]:
from lib.nlp import LDA

In [10]:
%%capture
LDAmodel = LDA(texts=tokens,num_topics=3)

In [11]:
LDAmodel.plot()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [13]:
### Check each document and its topics
topicDic = {0:[],1:[],2:[]}
for i in range(len(LDAmodel.corpus)):
    allTopics = LDAmodel.get_doc_topics(docId=i,print_topics=False) ### eg: [(1, 0.58676654), (2, 0.4099236)]
    for topic in allTopics:
        topicID = topic[0]
        prob = topic[1]
        if prob > 0.6:
            topicDic[topicID] += [i]
            break

In [14]:
topicDic.keys()

dict_keys([0, 1, 2])

# Part 3: Sentiment analysis

In [18]:
summarizePipe = ProcessPipeline(texts=texts,steps=["langdetection","summarization"])
summarized = summarizePipe.run()

In [14]:
summarized[0]

'The forum addressed one of the most pressing issues of our lifetimes - global energy and climate change.\nIndia’s development will undoubtedly be fuelled by an increase in energy consumption, but this economic development belies a growing problem - climate change caused by CO2 emissions.\nThis includes increase in Earth’s mean surface temperature (also known as global warming), rise in sea level and acidification, extreme weather events, and so on.\nWhile the increase in global temperature and loss of polar ice has been strongly linked to anthropogenic activities (particularly CO2 emissions), there is no consensus among researchers about the link between extreme weather events such as forest fires, cyclones, droughts etc and anthropogenic causes.\nNow, concerning the timeline, it is expected that global temperatures will increase by over 2 degrees C by 2040 if emissions continue as before, well within our lifetimes for most of us reading this article.\nMillions of Indians live off the

In [19]:
featurePipe = featurePipeline(texts=summarized)
featureDic = featurePipe.run()

In [21]:
sentDF = pd.DataFrame(featureDic)
sentDF.head()

Unnamed: 0,compound,neg,neu,polarity,pos,subjectivity
0,0.9178,0.045,0.828,0.079527,0.127,0.403542
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.9468,0.0,0.778,0.466667,0.222,0.766667
4,0.9584,0.011,0.818,0.052548,0.172,0.52314


In [17]:
len(featureDic)

2988

In [18]:
posFeatures = sentDF['pos'].values
negFeatures = sentDF['neg'].values

In [19]:
featureDic = {}
for topicID in topicDic:
    newsIDs = [topicDic[topicID]]
    
    tmpDic = {'pos':posFeatures[newsIDs],'neg':negFeatures[newsIDs]}
    featureDic[topicID] = tmpDic

  """


In [21]:
top3Dic = {}
for topicID in featureDic:
    tmpPos = featureDic[topicID]['pos']
    tmpNeg = featureDic[topicID]['neg']
    
    top3pos = np.argsort(tmpPos)[-3:]
    top3neg =  np.argsort(tmpNeg)[-3:]
    
    tmpDic = {}
    tmpDic['pos'] = top3pos[::-1]
    tmpDic['neg'] = top3neg[::-1]
    
    top3Dic.update({topicID:tmpDic})

np.argsort(featureDic[1]['pos'])
np.sort(featureDic[1]['pos'])

In [22]:
top3Dic

{0: {'pos': array([931, 314, 105]), 'neg': array([916, 421, 169])},
 1: {'pos': array([110, 107,  46]), 'neg': array([  1, 110, 236])},
 2: {'pos': array([413, 128, 276]), 'neg': array([349, 347, 391])}}

In [23]:
for topicID in top3Dic:
    print(f"For topic {topicID}: three most positive articles are : {top3Dic[topicID]['pos']}, three most negative articles are : {top3Dic[topicID]['neg']}")

For topic 0: three most positive articles are : [931 314 105], three most negative articles are : [916 421 169]
For topic 1: three most positive articles are : [110 107  46], three most negative articles are : [  1 110 236]
For topic 2: three most positive articles are : [413 128 276], three most negative articles are : [349 347 391]


# Part 4: Keyword extraction

In [24]:
entityDic = {}

In [25]:
for topicID in topicDic:
    includedNewsIndex = topicDic[topicID]
    includedNews = np.array(summarized)[includedNewsIndex]
    print("Start to process of topic:",topicID)
    pipeline = entityPipeline(texts=includedNews.tolist(),targetPOSs=[]) ### only interested in entity
    entities = pipeline.run()
    entityDic[topicID] = entities
    print("Finish process of topic:",topicID)

Start to process of topic: 0
Finish process of topic: 0
Start to process of topic: 1
Finish process of topic: 1
Start to process of topic: 2
Finish process of topic: 2


In [26]:
entityDic[1]

[{'Trump': {'count': 1, 'label': 'PERSON'}},
 {},
 {'Washington': {'count': 1, 'label': 'GPE'},
  'Democrats': {'count': 3, 'label': 'NORP'},
  'Nancy Pelosi': {'count': 1, 'label': 'PERSON'},
  'House': {'count': 4, 'label': 'ORG'},
  '116th Congress': {'count': 1, 'label': 'ORG'},
  'Donald Trump': {'count': 1, 'label': 'PERSON'},
  'America': {'count': 1, 'label': 'GPE'},
  'Pelosi': {'count': 4, 'label': 'PERSON'},
  'first': {'count': 2, 'label': 'ORDINAL'},
  'Congress': {'count': 3, 'label': 'ORG'},
  'Americans': {'count': 1, 'label': 'NORP'},
  'Trump': {'count': 2, 'label': 'NORP'},
  'the White House': {'count': 2, 'label': 'ORG'},
  "Trump's": {'count': 2, 'label': 'ORG'},
  'U.S': {'count': 1, 'label': 'GPE'},
  'Mike Pence': {'count': 1, 'label': 'PERSON'},
  'Senate': {'count': 1, 'label': 'ORG'},
  'Republicans': {'count': 1, 'label': 'NORP'},
  'Mitch McConnell': {'count': 1, 'label': 'PERSON'},
  'New Yorker': {'count': 1, 'label': 'ORG'},
  'Ocasio-Cortez': {'count':