In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv, find_dotenv
from newspaper.article import ArticleException

from article_scraper import ArticleScraper
from pipeline.lda import LDABuilder
from pipeline.lda_similarity import LDASimilarity
from pipeline.preprocessing import Preprocessor

load_dotenv(find_dotenv())

def flatten(l):
    """Flatten exactly one level of nesting.

    Args:
        l: an iterable whose elements are themselves iterable
           (e.g. a list of lists, or ``dict.values()`` of lists).

    Returns:
        A new list with the inner items in their original order.
    """
    # A named def (instead of a lambda bound to a name) gives useful
    # tracebacks and a real __name__.
    return [item for sublist in l for item in sublist]

In [35]:
# Load the pre-built LDA artifacts and the text preprocessor (see the loader log below).
# NOTE: later cells rebind `n_topics` to 2 for a different purpose (number of top topics to keep).
n_topics = 1000
lda_builder = LDABuilder()
# from_scratch=False: presumably loads a saved dictionary instead of rebuilding it — confirm in LDABuilder
trigram_dictionary = lda_builder.get_corpus_dict(from_scratch=False)
similarity_model = LDASimilarity(lda_builder, n_topics, trigram_dictionary)
lda = similarity_model.model  # short alias; later cells use both `lda` and `similarity_model.model`
prep = Preprocessor(preload_models=True)

Loading trigram dict...
Loading LDA model (n_topics=1000)...
Loading index...
load spacy model
Loading bi-gram model...
Done!
Loading tri-gram model...
Done!


In [5]:
# Scrape one article and run it through the preprocessor; `parsed_doc` feeds every later cell.
# Alternative test article:
# url = 'https://www.nytimes.com/2017/02/16/us/politics/neil-gorsuch-supreme-court-senate-hearing.html'
url = 'https://www.nytimes.com/2018/09/12/climate/pruitt-coal-consulting.html'
try:
    title, text = ArticleScraper.scrape(url)
    # Title is prepended to the body, so the first tokens of parsed_doc come from the title.
    parsed_doc = prep.process_doc(title + ' ' + text)
except ArticleException as e:
    # If the download for some reason fails (ex. 404) we need to show an error msg and redirect to main
    # NOTE(review): on failure `parsed_doc` is never assigned, so the following cells raise NameError
    # (or silently reuse a stale value from an earlier run).
    print('SCRAPING FAILED!')
#     return redirect(url_for('.index', scraping_error=True)) # TODO pass this without adding to url params (ugly)

In [6]:
# Turn the preprocessed tokens into a bag-of-words with the trigram dictionary,
# then infer the document's topic mixture as (topic_id, probability) pairs.
bow = trigram_dictionary.doc2bow(parsed_doc)
# minimum_probability=0.05: presumably drops topics below 5% (all values shown below are >= 0.05)
doc_topics = similarity_model.model.get_document_topics(bow, minimum_probability=0.05)
doc_topics

[(353, 0.12589951),
 (368, 0.097925633),
 (388, 0.054451562),
 (412, 0.082356423),
 (609, 0.15325566),
 (785, 0.062666863),
 (861, 0.10649794)]

In [7]:
# Similarity of this document's topic vector to all other articles in the index;
# the result holds one score per indexed article (142,482 in the recorded run).
sims = similarity_model.similarity_index[doc_topics]
sims.shape

(142482,)

In [8]:
# For every topic of the document, record its id, its probability and the
# five highest-weighted words of that topic.  # TODO
topics = [
    {
        'id': topic_id,
        'p': topic_prob,
        'words': similarity_model.model.show_topic(topic_id, 5),
    }
    for topic_id, topic_prob in doc_topics
]
topics

[{'id': 353,
  'p': 0.12589951,
  'words': [('document', 0.043323945),
   ('government', 0.026519662),
   ('information', 0.022614015),
   ('report', 0.020168744),
   ('agency', 0.012710739)]},
 {'id': 368,
  'p': 0.097925633,
  'words': [('money', 0.067638882),
   ('pay', 0.066051938),
   ('million', 0.019178221),
   ('spend', 0.015551433),
   ('fund', 0.011786493)]},
 {'id': 388,
  'p': 0.054451562,
  'words': [('committee', 0.038875617),
   ('report', 0.030800708),
   ('intelligence', 0.020262659),
   ('surveillance', 0.018971667),
   ('claim', 0.017765481)]},
 {'id': 412,
  'p': 0.082356423,
  'words': [('office', 0.061038882),
   ('ethic', 0.03857825),
   ('conflict', 0.036110982),
   ('transparency', 0.016583975),
   ('public', 0.016452176)]},
 {'id': 609,
  'p': 0.15325566,
  'words': [('agency', 0.1179639),
   ('epa', 0.090001889),
   ('environmental', 0.036948301),
   ('pruitt', 0.029954851),
   ('environmental_protection_agency', 0.021240372)]},
 {'id': 785,
  'p': 0.06266686

In [126]:
# Re-infer the document's topics with a lower cutoff (0.02 instead of the 0.05 used for doc_topics).
# NOTE(review): this rebinds `topics`, which In [8] built as a list of dicts; with
# per_word_topics=False the result is presumably a list of (topic_id, probability) pairs — confirm.
topics = lda.get_document_topics(bow, minimum_probability=0.02, 
                                 minimum_phi_value=0.0001, 
                                 per_word_topics=False)

In [150]:
# Scratch cell: earlier experiments are kept commented out; only the np.sort line is live.
# set([t for t,p in topics])
# set(word_topics[0][1])
# [sum([p for t,p  in ww[1]]) for ww in phi_relevance]
# phi_relevance[0]
# from collections import defaultdict, Counter
# word_doc_prob = defaultdict(float)

# topics_i_care_about = [x[0] for x in topics]
# topics_i_care_about

# highest = np.argsort(lda.expElogbeta[topics_i_care_about].sum(axis=0))[-50:] # goes [topic][word] # gives the prob of each word per topic
# Summing over axis 0 collapses the topic axis, leaving one value per vocabulary term
# (100000 entries in the recorded output); sorting does not change the shape.
np.sort(lda.expElogbeta.sum(axis=0)).shape
# [lda.id2word[a] for a in highest]

(100000,)

In [137]:
# terms = sorted(flatten([lda.show_topic(t) for t in topics_i_care_about]), key=lambda x: x[0])
# [(w,c) for w,c in Counter(list(zip(*terms))[0]).items() if c > 1]

[('accord', 2), ('agency', 2), ('government', 3), ('official', 3)]

I can either take just the top m words from the top 1-2 topics and call it a day, or I can calculate each word's probability and take the top n words overall.

In [141]:
# Sanity check of the topic-term matrix dimensions: (n_topics, vocabulary size) = (1000, 100000) in the recorded run.
# lda.show_topic(top2topics[0])#, n_words_per_topic, formatted=True)
# lda.inference()
lda.get_topics().shape

(1000, 100000)

In [9]:
# Approach 1 - by top topics: take the n_words_per_topic best words from each
# of the n_topics most probable topics (n_topics * n_words_per_topic words in total).
n_topics = 2
n_words_per_topic = 5

top2topics = [tid for tid, _ in sorted(doc_topics, key=lambda pair: -pair[1])[:n_topics]]
# Concatenate the (word, probability) pairs of each selected topic, most probable topic first.
words = [
    word_prob
    for tid in top2topics
    for word_prob in similarity_model.model.show_topic(tid, n_words_per_topic)
]
top_topic_words = [word for word, _ in words]
top_topic_words

['agency',
 'epa',
 'environmental',
 'pruitt',
 'environmental_protection_agency',
 'document',
 'government',
 'information',
 'report',
 'agency']

In [163]:
# Approach 2 - by joint probability, P(topic,word): score each top word of every
# document topic as topic_prob * word_prob, divided by the sum of
# lda.expElogbeta over axis 0 for that word id.
# NOTE(review): the original remark says n_words_per_topic should be >= the overall
# number of top words (in case there is just one topic), yet 6 < n_top_words (20) — confirm.
n_words_per_topic = 6
n_top_words = 20

words_joint_proba = [
    [
        (word_id, word_prob * topic_prob / lda.expElogbeta[:, word_id].sum())
        for word_id, word_prob in similarity_model.model.get_topic_terms(tid, n_words_per_topic)
    ]
    for tid, topic_prob in doc_topics
]
# Highest score first.
words = sorted(flatten(words_joint_proba), key=lambda pair: -pair[1])
top_jp_words = [lda.id2word[word_id] for word_id, _ in words]
top_jp_words[:n_top_words]

['pruitt',
 'epa',
 'environmental_protection_agency',
 'node',
 'mr.',
 'environmental',
 'ethic',
 'agency',
 'document',
 'transparency',
 'environment',
 'money',
 'conflict',
 'pay',
 'information',
 'investigation',
 'dollar',
 'fund',
 'office',
 'agency']

In [11]:
# Approach 3 - mixture of the two approaches: keep only the n_topics most probable
# topics, then rank their top words by topic_prob * word_prob.
# NOTE: here top2topics holds (topic_id, probability) pairs, not bare topic ids.
n_topics = 2
n_words_per_topic = 5

top2topics = sorted(doc_topics, key=lambda pair: -pair[1])[:n_topics]
words_joint_proba = [
    [
        (word, word_prob * topic_prob)
        for word, word_prob in similarity_model.model.show_topic(tid, n_words_per_topic)
    ]
    for tid, topic_prob in top2topics
]
words = sorted(flatten(words_joint_proba), key=lambda pair: -pair[1])
[word for word, _ in words]

['agency',
 'epa',
 'environmental',
 'document',
 'pruitt',
 'government',
 'environmental_protection_agency',
 'information',
 'report',
 'agency']

Need to prioritize more specific words like `supreme_court` and `gorsuch` over generic politics words like `senate` and `vote`

In [12]:
def filter_by_title(words, doc=None, n_tokens=50):
    """Keep only the words that occur near the start of the document.

    The scraping cell prepends the title to the article text before
    preprocessing, so the first tokens approximate the title and lead.

    Args:
        words: candidate query words; n-grams are joined with '_'.
        doc: token list to match against. Defaults to the notebook-level
            ``parsed_doc``.
        n_tokens: how many leading tokens of ``doc`` to consider.

    Returns:
        The words found among the leading tokens, in their original order
        (duplicates are kept).
    """
    if doc is None:
        doc = parsed_doc  # notebook global set by the scraping cell
    head_tokens = set(doc[:n_tokens])
    # Match the token as-is first: document tokens are presumably underscore-joined
    # n-grams, which the space-joined form alone would never match. The space-joined
    # form is still accepted so previously matching inputs keep matching.
    return [
        w for w in words
        if w in head_tokens or w.replace('_', ' ') in head_tokens
    ]

def build_query(words):
    """Join words into a space-separated search query.

    Words containing '_' are treated as phrases: underscores become spaces
    and the phrase is wrapped in double quotes. Other words pass through unchanged.
    """
    terms = []
    for word in words:
        if '_' in word:
            terms.append('"' + word.replace('_', ' ') + '"')
        else:
            terms.append(word)
    return ' '.join(terms)

def resolve_source_id(source_name):
    """Turn a display name into a slug: lower-case, whitespace-separated words joined by '-'.

    A leading "the" is deliberately kept (e.g. 'The Hill' -> 'the-hill');
    stripping it was tried and left disabled.
    """
    return '-'.join(source_name.lower().split())

def get_sources_by_bias(bias, sources_filter):
    """Return a comma-separated string of source ids for the given bias label(s).

    Args:
        bias: one key of ``bias_sources_map`` or a list of such keys.
        sources_filter: container of allowed source ids; ids not in it are dropped.

    Returns:
        The surviving ids joined by ',', in the order they appear in
        ``bias_sources_map`` ('' when none survive).
    """
    labels = bias if isinstance(bias, list) else [bias]
    candidate_ids = (
        resolve_source_id(name)
        for label in labels
        for name in bias_sources_map[label]
    )
    return ','.join(sid for sid in candidate_ids if sid in sources_filter)

# Political-bias bucket -> display names of news sources in that bucket.
# Names are converted to id slugs with resolve_source_id(); not every name
# has a matching id in the API's source list (see the comparison cells below).
bias_sources_map = {
    'hyper-left':    ['Occupy Democrats','Daily Kos'],
    'left':          ['MSNBC','Buzzfeed','The Atlantic','Vox','The Huffington Post','Talking Points Memo'],
    'center-left':   ['The Guardian UK', 'Politico','The Washington Post','The New York Times','CNN', 'Business Insider'],
    'center':        ['Reuters','Associated Press', 'NPR'],
    'center-right':  ['The Wall Street Journal','The Hill'],
    'right':         ['National Review', 'New York Post','The Weekly Standard','Examiner', 'Washington Examiner'],
    'hyper-right':   ['Fox News','Breitbart News','The American Conservative'],
}

In [13]:
# NewsAPI connection settings.
# BASE_URL carries no trailing slash because every endpoint path starts with '/';
# with the slash, BASE_URL + ENDPOINT produced 'https://newsapi.org//v2/...'.
BASE_URL = 'https://newsapi.org'
EVERYTHING_ENDPOINT = '/v2/everything'
SOURCES_ENDPOINT = '/v2/sources'

# The API key is read from the environment (populated from .env by load_dotenv).
key_params = {
    'apiKey': os.getenv('NEWS_API_KEY'),
}
# Optional date window for the search; empty means no date restriction is sent.
date_params = {
#     'from': '2018-08-12',
#     'to': '2018-08-18'
}

In [14]:
# Fetch the sources known to the API and compare them with our own bias map.
# A timeout keeps the cell from hanging forever, and raise_for_status() surfaces
# HTTP errors (e.g. a bad API key) directly instead of a KeyError on 'sources'.
sources_response = requests.get(BASE_URL + SOURCES_ENDPOINT, key_params, timeout=30)
sources_response.raise_for_status()
sources = sources_response.json()['sources']
true_sources = [s['id'] for s in sources]  # ids as the API spells them
my_sources = [resolve_source_id(s) for s in flatten(bias_sources_map.values())]  # ids derived from our names

print('Only some of our sources are represented in the API:')
{key:get_sources_by_bias(key, set(true_sources)) for key in bias_sources_map.keys()}

Only some of our sources are represented in the API:


{'center': 'reuters,associated-press',
 'center-left': 'the-guardian-uk,politico,the-washington-post,the-new-york-times,cnn,business-insider',
 'center-right': 'the-wall-street-journal,the-hill',
 'hyper-left': '',
 'hyper-right': 'fox-news,breitbart-news,the-american-conservative',
 'left': 'msnbc,buzzfeed,the-huffington-post',
 'right': 'national-review'}

In [15]:

# Search parameters: query built from the top-topic words that also appear near the
# start of the article, restricted to 'center' sources that the API actually offers.
other_params = {
    'q': build_query(filter_by_title(top_topic_words)),
    'sources': get_sources_by_bias(['center'], set(true_sources)),# alt: ['left','center-left'] (the old one-argument call is stale; sources_filter is required)
    'language': 'en',
#     'pageSize': page_size,
#     'page': 1,
}

# NOTE(review): this rebinds `url`, which earlier held the article URL being scraped.
url = BASE_URL + EVERYTHING_ENDPOINT

In [16]:
# Merge all parameter groups and run the search; show the query and the hit count.
params = {**date_params, **other_params, **key_params}
print(params['q'])
# The timeout avoids an indefinite hang; raise_for_status() reports HTTP errors
# directly instead of a KeyError on 'totalResults'.
response = requests.get(url, params, timeout=30)
response.raise_for_status()
json_resp = response.json()
json_resp['totalResults']

agency pruitt agency


11

In [17]:
# Titles of the returned articles, in the order the API returned them.
[a['title'] for a in json_resp['articles']]

["Environmental agency's top watchdog retiring after Pruitt probes",
 'New U.S. data shows EPA expanded biofuel waiver program for 2017',
 'FEMA chief under scrutiny over government car use as storm approaches U.S.: Politico',
 'EPA may release new data on small refinery biofuel waivers: sources',
 'UPDATE 2-EPA details broad expansion of biofuel waiver program',
 "EPA watchdog faults agency for Pruitt's 24-7 security costs",
 "EPA's top watchdog quits amid probes of agency's leadership",
 'Ousted EPA head Pruitt denies getting improper gifts, income',
 'FEMA head denies intentionally misusing federal vehicles',
 "DHS secretary: FEMA chief misused cars, but won't lose job",
 'The Latest: Primary runoff for Oklahoma AG too close to call']

In [18]:
# Preview the first five articles: keep the text before the API's truncation marker ('…').
# `content` can be missing/null for some articles, so fall back to '' instead of
# raising AttributeError on None.
[(a['content'] or '').split('…')[0] + '...' for a in json_resp['articles'][:5]]

['(Reuters) - The U.S. Environmental Protection Agency’s top internal watchdog will retire next month to take a job outside the federal government, the agency said on Tuesday, after overseeing a slew of probes centered around the Trump administration’s former E...',
 'NEW YORK (Reuters) - The U.S. Environmental Protection Agency (EPA) gave 29 waivers exempting small oil refineries from a requirement to blend biofuels into gasoline and diesel last year, higher than in previous years, according to agency data released on Thu...',
 'WASHINGTON (Reuters) - The head of the U.S. Federal Emergency Management Agency is under investigation over his use of government vehicles, Politico reported on Thursday, as a massive hurricane approached the U.S. Southeast coast. The Department of Homeland S...',
 'NEW YORK (Reuters) - The U.S. Environmental Protection Agency (EPA) may release new data on Thursday related to its program to exempt small refineries from annual biofuels blending requirements, ac

In [19]:
# Tabular view of the first returned articles (one row per article, one column per JSON field).
pd.DataFrame(json_resp['articles']).head()

Unnamed: 0,author,content,description,publishedAt,source,title,url,urlToImage
0,Nichola Groom,(Reuters) - The U.S. Environmental Protection ...,The U.S. Environmental Protection Agency's top...,2018-09-18T20:05:59Z,"{'id': 'reuters', 'name': 'Reuters'}",Environmental agency's top watchdog retiring a...,https://www.reuters.com/article/us-usa-epa-ins...,https://s4.reutersmedia.net/resources/r/?m=02&...
1,Reuters Editorial,NEW YORK (Reuters) - The U.S. Environmental Pr...,The U.S. Environmental Protection Agency (EPA)...,2018-09-20T17:23:06Z,"{'id': 'reuters', 'name': 'Reuters'}",New U.S. data shows EPA expanded biofuel waive...,https://www.reuters.com/article/us-usa-epa-bio...,https://s4.reutersmedia.net/resources/r/?m=02&...
2,Reuters Editorial,WASHINGTON (Reuters) - The head of the U.S. Fe...,The head of the U.S. Federal Emergency Managem...,2018-09-13T12:00:33Z,"{'id': 'reuters', 'name': 'Reuters'}",FEMA chief under scrutiny over government car ...,https://www.reuters.com/article/us-storm-flore...,https://s4.reutersmedia.net/resources/r/?m=02&...
3,Reuters Editorial,NEW YORK (Reuters) - The U.S. Environmental Pr...,The U.S. Environmental Protection Agency (EPA)...,2018-09-20T15:48:43Z,"{'id': 'reuters', 'name': 'Reuters'}",EPA may release new data on small refinery bio...,https://www.reuters.com/article/us-usa-epa-bio...,https://s1.reutersmedia.net/resources/r/?m=02&...
4,Chris Prentice,NEW YORK (Reuters) - The U.S. Environmental Pr...,The U.S. Environmental Protection Agency (EPA)...,2018-09-20T18:34:38Z,"{'id': 'reuters', 'name': 'Reuters'}",UPDATE 2-EPA details broad expansion of biofue...,https://www.reuters.com/article/us-usa-epa-bio...,https://s4.reutersmedia.net/resources/r/?m=02&...


## Sources: which of our sources does the API actually offer?

In [20]:
def intersection(lst1, lst2):
    """Return the set of items that occur in both iterables."""
    common = set(lst1)
    common.intersection_update(lst2)
    return common

# Source ids that appear both in the API's list and in our bias map.
print('in both')
intersection(true_sources,my_sources)

in both


{'associated-press',
 'breitbart-news',
 'business-insider',
 'buzzfeed',
 'cnn',
 'fox-news',
 'msnbc',
 'national-review',
 'politico',
 'reuters',
 'the-american-conservative',
 'the-guardian-uk',
 'the-hill',
 'the-huffington-post',
 'the-new-york-times',
 'the-wall-street-journal',
 'the-washington-post'}

In [21]:
# Ids derived from our bias map that have no match in the API's source list.
print('only in my sources')
set(my_sources) - set(true_sources)

only in my sources


{'daily-kos',
 'examiner',
 'new-york-post',
 'npr',
 'occupy-democrats',
 'talking-points-memo',
 'the-atlantic',
 'the-weekly-standard',
 'vox',
 'washington-examiner'}

## Now test against the new `NewsAPI` class

In [22]:
from news_api import NewsAPI

In [23]:
# Build the NewsAPI helper around the loaded LDA model and recompute the
# document topics with the same 0.05 cutoff used earlier.
news_api = NewsAPI(similarity_model.model)
doc_topics = similarity_model.model.get_document_topics(bow, minimum_probability=0.05)

In [24]:
# Query center-left sources for this document. The printed output suggests the query
# is progressively shortened before results are returned — confirm in NewsAPI.query.
# NOTE(review): the call prints its request parameters, including the API key; clear the output before sharing.
rdf = news_api.query(doc_topics, [resolve_source_id(s) for s in bias_sources_map['center-left']], parsed_doc)

epa pruitt agency document
epa pruitt agency
17 {'q': 'epa pruitt', 'sources': 'the-guardian-uk,politico,the-washington-post,the-new-york-times,cnn,business-insider', 'language': 'en', 'apiKey': '<REDACTED>'}


In [25]:
# NOTE(review): this import belongs in the top import cell so a fresh run shows all dependencies up front.
import spacy
# Load spaCy's English pipeline for the tagging experiment below
# (the Preprocessor also logs "load spacy model", so this is presumably a second copy).
nlp = spacy.load('en')

In [26]:
# Tag a sample keyword string, presumably to see whether POS tags could help filter query terms.
# In the recorded output 'mr.' is split into 'mr' + '.', and 'pruitt' is tagged as a common noun (NNS).
n = nlp('mr. agency pruitt agency public')

# One line per token: text, coarse POS, fine-grained tag, dependency label.
for token in n:
    print(token.text, token.pos_, token.tag_, token.dep_)

mr NOUN NN ROOT
. PUNCT . punct
agency NOUN NN compound
pruitt NOUN NNS compound
agency NOUN NN compound
public ADJ JJ ROOT


In [27]:
# Titles of the first five rows of the query result.
list(rdf.title.head())

['Scott Pruitt wasted millions at EPA on security detail, report says',
 "EPA lacked justification, authority for Pruitt's 24/7 security detail, watchdog finds - Washington Post",
 'Pruitt faced mounting financial pressures as EPA chief, new documents show - Washington Post',
 'With a shrinking EPA, Trump delivers on his promise to cut government - Washington Post',
 "Watchdog: EPA hasn't justified Pruitt's security spending"]

In [33]:
# Inspect the word ranking produced by NewsAPI's private joint-probability helper.
# NOTE: this rebinds `top_topic_words`, which In [9] computed with the top-topics approach.
top_topic_words = news_api._topic_words_joint(doc_topics)
# top_topic_words = news_api._topic_words_top(doc_topics)
# Same call with include_prob=True to display (word, score) pairs.
news_api._topic_words_joint(doc_topics, include_prob=True)

[('mr.', 0.058592934),
 ('agency', 0.017974241),
 ('epa', 0.01371365),
 ('money', 0.0065978765),
 ('pay', 0.0064430768),
 ('environmental', 0.005629838),
 ('document', 0.005258665),
 ('pruitt', 0.0045642415),
 ('office', 0.0044843247),
 ('environmental_protection_agency', 0.0032364102),
 ('government', 0.0032189593),
 ('environment', 0.002932119),
 ('ethic', 0.0028342165),
 ('information', 0.0027448914),
 ('conflict', 0.0026529545),
 ('report', 0.0024480843),
 ('million', 0.0018707514),
 ('agency', 0.0015428308),
 ('spend', 0.0015169741),
 ('investigation', 0.0014671239),
 ('transparency', 0.0012183699),
 ('public', 0.0012086871),
 ('fund', 0.0011497206),
 ('dollar', 0.0010978124),
 ('government', 0.0010851874),
 ('trump', 0.00082963251),
 ('president', 0.00039815492),
 ('node', 0.00033913099),
 ('attack', 0.00032352866),
 ('want', 0.00022593714)]

In [29]:
# list(news_api._filter_query(top_topic_words, parsed_doc))

In [30]:
# set(parsed_doc[:50])

In [31]:
# [token.lemma_ for token in nlp(title) if not (token.is_punct or token.is_space)]

In [32]:
# Recompute the topics with a stricter cutoff (0.07) and list each topic's six top
# words, most probable topic first.
# NOTE: this rebinds `doc_topics`; the recorded probabilities differ slightly from the earlier run.
doc_topics = similarity_model.model.get_document_topics(bow, minimum_probability=0.07)
sorted([(p, news_api.lda.show_topic(tid, 6)) for tid, p in doc_topics], key=lambda x: -x[0])

[(0.15237069,
  [('agency', 0.1179639),
   ('epa', 0.090001889),
   ('environmental', 0.036948301),
   ('pruitt', 0.029954851),
   ('environmental_protection_agency', 0.021240372),
   ('environment', 0.019243326)]),
 (0.12138011,
  [('document', 0.043323945),
   ('government', 0.026519662),
   ('information', 0.022614015),
   ('report', 0.020168744),
   ('agency', 0.012710739),
   ('investigation', 0.012087021)]),
 (0.10647663,
  [('mr.', 0.55028915),
   ('trump', 0.0077916868),
   ('president', 0.0037393644),
   ('node', 0.0031850275),
   ('attack', 0.0030384946),
   ('want', 0.0021219412)]),
 (0.097545616,
  [('money', 0.067638882),
   ('pay', 0.066051938),
   ('million', 0.019178221),
   ('spend', 0.015551433),
   ('fund', 0.011786493),
   ('dollar', 0.01125435)]),
 (0.073466696,
  [('office', 0.061038882),
   ('ethic', 0.03857825),
   ('conflict', 0.036110982),
   ('transparency', 0.016583975),
   ('public', 0.016452176),
   ('government', 0.014771148)])]