In [325]:
import pickle
import calendar
import numpy as np

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


import pandas as pd
from dateparser import parse

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 1) Bring all data in-house

In [507]:
google_news_texts_only = pickle.load( open("google_news_texts_only.pickle", "rb" ))

# 3) Pre-processing

## Document Preprocessing
We'll need to generate a term-document matrix of word (token) counts for use in LDA.

We'll use `sklearn`'s `CountVectorizer` to generate our term-document matrix of counts. We'll make use of a few parameters to accomplish the following preprocessing of the text documents all within the `CountVectorizer`:
* `analyzer=word`: Tokenize by word
* `ngram_range=(1,2)`: Keep all 1 and 2-word grams
* `stop_words=english`: Remove all English stop words
* `token_pattern=\\b[a-z][a-z]+\\b`: Match all tokens with 2 or more (strictly) alphabet characters <-- only takes alphanumeric 

\b - this is a word boundary, so this just says begining of the word and end of the word

how count vectorizer works is that it defaults to "lowercase=True" and then applies any filter which is in the token_pattern.

## Count Vectorizer -- Documents/ Texts -> One-hot encoding for tokens in the vocabulary 

In [None]:
################ THis is what I'm Focusing on ################

count_vectorizer_unigram = CountVectorizer(analyzer="word",ngram_range=(1,1),stop_words="english", token_pattern="\\b[a-z][a-z]+\\b",
                                  min_df = 2, max_df = 0.2)


fitted_count_vectorizer_100k__FILL_ME_IN__ = count_vectorizer_unigram.fit(list_of_document_strings)
pickle.dump( fitted_count_vectorizer_100k__FILL_ME_IN__, open( "fitted_count_vectorizer_100k__FILL_ME_IN__.pickle", "wb" ) )

In [None]:
count_vectorizer_trigram = CountVectorizer(analyzer="word",ngram_range=(1,3),stop_words="english", token_pattern="\\b[a-z][a-z]+\\b",
                                  min_df = 2, max_df = 0.2)

In [348]:
my_strings = google_news_df_top_pubs_good_dates["text"][google_news_df_top_pubs_good_dates["date_cat"]==(2014,7)]

In [349]:
my_strings[0:5]

7      Health How Big Tobacco Went To War With A Tiny Country CREDIT: AP Images\n\nThe small South American nation of Uruguay might be forced to pay a heavy price for trying to curb smoking and avert a public health disaster. The country is currently embroiled in a high stakes legal battle with Phillips Morris, the world’s largest cigarette manufacturer. The industry giant, whose annual profits outsi...
329    SALVADOR, Brazil — The United States made two changes from its 1-0 loss to Germany ahead of Tuesday’s match against Belgium.\n\nGeoff Cameron, who started the first two games in central defense, replaced Kyle Beckerman in defensive midfield. Alejandro Bedoya, who also started the opening two games, was back in right midfield in place of Brad Davis.\n\nBelgium captain Vincent Kompany will start...
368    Greg Sargent points us to an interesting new CNN poll about Obamacare. It asks the usual question about favoring or opposing the law, with the usual results. The basic question sh

In [350]:
fitted_model = count_vectorizer_unigram.fit(my_strings)

In [353]:
num_topics = 10
n_topics = 10

In [362]:
my_model = text_to_lda_model(my_strings, fitted_model, n_topics=15)

In [365]:
my_model.print_topics()

[(7,
  u'0.005*border + 0.003*children + 0.003*bush + 0.003*family + 0.002*old + 0.002*white + 0.002*mr + 0.002*war + 0.002*texas + 0.002*thing'),
 (0,
  u'0.006*women + 0.004*brazil + 0.004*street + 0.003*mr + 0.003*retirement + 0.003*solar + 0.003*colbert + 0.003*miller + 0.003*black + 0.003*second'),
 (12,
  u'0.008*congress + 0.005*supreme + 0.005*constitution + 0.004*lawsuit + 0.004*boehner + 0.004*administration + 0.004*laws + 0.004*legal + 0.004*decision + 0.003*party'),
 (13,
  u'0.003*coal + 0.003*war + 0.003*power + 0.003*social + 0.003*climate + 0.002*change + 0.002*women + 0.002*media + 0.002*big + 0.002*politics'),
 (14,
  u'0.004*tax + 0.003*ukraine + 0.003*company + 0.002*russia + 0.002*business + 0.002*book + 0.002*school + 0.002*economic + 0.002*middle + 0.002*old'),
 (2,
  u'0.018*women + 0.011*religious + 0.009*lobby + 0.009*hobby + 0.008*abortion + 0.007*decision + 0.006*birth + 0.006*control + 0.006*supreme + 0.005*contraception'),
 (1,
  u'0.008*republicans + 0.00

In [372]:
my_model5 = text_to_lda_model(my_strings, fitted_model, n_topics=15)

In [375]:
my_model5

<gensim.models.ldamodel.LdaModel at 0x1642a6590>

In [374]:
my_model20.print_topics(num_topics =20, )

[(0,
  u'0.007*women + 0.005*party + 0.005*tea + 0.003*police + 0.003*gun + 0.002*republican + 0.002*war + 0.002*conservative + 0.002*man + 0.002*black'),
 (1,
  u'0.006*children + 0.005*information + 0.004*video + 0.004*data + 0.003*company + 0.003*police + 0.003*records + 0.002*department + 0.002*mr + 0.002*marijuana'),
 (2,
  u'0.006*mr + 0.005*cup + 0.004*family + 0.004*brazil + 0.003*soccer + 0.003*middle + 0.003*team + 0.003*class + 0.002*argentina + 0.002*free'),
 (3,
  u'0.015*religious + 0.014*women + 0.011*lobby + 0.011*hobby + 0.007*birth + 0.006*control + 0.006*decision + 0.006*contraception + 0.006*supreme + 0.004*freedom'),
 (4,
  u'0.006*ukraine + 0.004*russia + 0.004*russian + 0.003*flight + 0.003*border + 0.003*ap + 0.003*near + 0.003*plane + 0.003*airlines + 0.003*malaysia'),
 (5,
  u'0.012*abortion + 0.011*women + 0.004*conservatives + 0.003*conservative + 0.003*workers + 0.003*republicans + 0.003*access + 0.002*rights + 0.002*richards + 0.002*child'),
 (6,
  u'0.010

In [364]:
a[a["publisher"]== "theatlantic"]

Unnamed: 0,publisher,text,url,count,date_cat
7978,theatlantic,"Please consider disabling it for our site, or supporting our work in one of these ways\n\nWhy providing women more options to control pregnancies is in the interest of everyone\n\nIn a hotly contested decision this week, the Supreme Court ruled that for-profit employers can opt out of providing certain types of contraception coverage on religious grounds. Ostensibly, the holding is narrow: Onl...",http://www.theatlantic.com/health/archive/2014/07/the-broader-benefits-of-contraception/373856/,1,"(2014, 7)"
10504,theatlantic,"1. We can thank former President Bill Clinton for perfect clarity in his comments about the chaos and horror of Gaza. In an interview on Indian television, Clinton—who told us in his memoir that Palestinian self-destructiveness (in the form of Yasir Arafat’s various delusions and prevarications) undid his effort to bring about a two-state solution to the Middle East conflict—blames the Muslim ...",http://www.theatlantic.com/international/archive/2014/07/understanding-what-hamas-wants/374656/,1,"(2014, 7)"
11034,theatlantic,"Despite the news last week that America's healthcare spending will not be rising at the sky-high rate that was once predicted, the fact remains that the U.S. far outspends its peer nations when it comes to healthcare costs per capita. This year the United States will spend almost 18 percent of the gross domestic product (GDP) on healthcare—six percentage points more than the Netherlands, the n...",http://www.theatlantic.com/business/archive/2014/07/why-do-other-rich-nations-spend-so-much-less-on-healthcare/374576/,1,"(2014, 7)"
27354,theatlantic,"Let us know what we missed.\n\nJustin Bieber: A Case Study in Growing Up Cosseted and Feral\n\nVanessa Grigoriadis | Vulture\n\n""He sees himself as Brando, McQueen, Dean. We may see something different—a costume of machismo; a slip of a boy buffed up and doffing his shirt like a South Bronx stoopie in August; a white person fetishizing blackness with the laserlike focus of someone for whom 'be...",http://www.theatlantic.com/entertainment/archive/2014/07/pop-writing-july-1-2014/373758/,1,"(2014, 7)"
27405,theatlantic,"Researchers from the University of California San Diego and Brown University surveyed parents of first-time patients at a pediatric obesity clinic, assessing the families’ willingness to help their children lose weight. The patients, who ranged in age from 5 to 20, had all been classified as overweight or clinically obese, and most had been referred to the clinic for treatment by their regular...",http://www.theatlantic.com/health/archive/2014/07/study-many-parents-are-in-denial-about-their-kids-obesity/375023/,1,"(2014, 7)"
37094,theatlantic,"Louis C.K.'s regular-guy shtick permeates everything about his image: the plain black T-shirts, the self-deprecating humor, his Twitter bio (""I am a comedian and a person and a guy who is sitting here""), even his mass-emailing strategy. Louie—and it feels right to call him ""Louie"" precisely because of this guy-who-is-sitting-here image he's so expertly cultivated—is, of course, not an ordinary...",http://www.theatlantic.com/technology/archive/2014/07/the-brilliance-of-louis-cks-emails-he-writes-like-a-politician/374034/,1,"(2014, 7)"
44174,theatlantic,"The future of marriage, the future of Millennials: two topics the Internet loves to freak out about. Thanks to a new report from Pew, here the twain shall meet: Researchers asked people of all ages whether society is better off if people focus on getting married and having kids.\n\nAmerican Attitudes Toward Marriage and Kids\n\nLooking at this chart is a little like taking a Rorschach inkblot ...",http://www.theatlantic.com/national/archive/2014/07/millennials-society-will-be-just-fine-without-marriage/374060/,1,"(2014, 7)"
53009,theatlantic,"It is too surreal. I still feel like it was just yesterday when I picked up the report on the achievement gap in college. When we originally started working on PMP, we were thinking of national expansion—almost too early. We quickly learned how difficult it is to do our work and are still working to set reasonable expectations of growth. This summer we made changes to our business model. Now w...",http://www.theatlantic.com/education/archive/2014/07/meet-the-22-year-old-who-is-fixing-the-summer-achievement-gap/375024/,1,"(2014, 7)"
56869,theatlantic,"Please consider disabling it for our site, or supporting our work in one of these ways\n\nBefore Twitter existed, before Facebook allowed more than college students to join its network, two brothers made new media that went viral.\n\nTen years ago today, a small Flash animation firm based in Los Angeles uploaded a video to its website. The three-minute cartoon parodied the year’s presidential ...",http://www.theatlantic.com/technology/archive/2014/07/jibjabs-seminal-flash-parody-turns-10/374161/,1,"(2014, 7)"
60087,theatlantic,"Please consider disabling it for our site, or supporting our work in one of these ways\n\nWhen Team Snapchat announced the launch of Snapchat Stories last fall, it expanded the intimacy of the service so that users could “share your day with friends—or everyone.” More recently, though, the service is fixated on the concept of everywhere, with the addition of location-based filters and other ge...",http://www.theatlantic.com/technology/archive/2014/07/why-snapchat-cares-where-you-are/374733/,1,"(2014, 7)"


In [361]:
a = google_news_df_top_pubs_good_dates[google_news_df_top_pubs_good_dates["date_cat"]==(2014, 7)]
a[a["publisher"]== "npr"]

Unnamed: 0,publisher,text,url,count,date_cat
16734,npr,"The Great Blue Hope: Michelle Nunn Tries The Improbable In Ga.\n\ni toggle caption David Goldman/AP David Goldman/AP\n\nGeorgia has been considered safely red territory for more than a decade. But there's a new energy among Democrats in the state, where candidate Michelle Nunn represents the party's best chance of winning a Senate seat in years.\n\nThis is Nunn's first run for public office, b...",http://www.npr.org/2014/07/29/336356952/the-great-blue-hope-michelle-nunn-tries-the-improbable-in-ga,1,"(2014, 7)"
42802,npr,"'The True American' Reveals A Hopeful, Complicated Country\n\nAfter the 9/11 attacks, hate crimes against people who were thought to be Muslim caught the country's attention. In ""The True American,"" Anand Giridharadas follows the stories of one of those victims.\n\nMICHEL MARTIN, HOST:\n\nThis is TELL ME MORE from NPR News. I'm Michel Martin. Happy Fourth of July. For many of us, this is a day...",http://www.npr.org/2014/07/04/328207608/the-true-american-reveals-a-hopeful-complicated-country,1,"(2014, 7)"
50408,npr,"Patients With Low-Cost Insurance Struggle To Find Specialists\n\ni toggle caption Carrie Feibel for NPR Carrie Feibel for NPR\n\nThe Hope Clinic in southwest Houston is in the very heart of Asia Town, a part of the city where bland strip malls hide culinary treasures — Vietnamese pho, Malaysian noodles, Sichuan rabbit and bubble tea.\n\nInside the clinic, internist Charu Sawhney sees patients ...",http://www.npr.org/sections/health-shots/2014/07/16/331419293/patients-with-low-cost-insurance-struggle-to-find-specialists,1,"(2014, 7)"
56503,npr,"Rare Unanimity In Supreme Court Term, With Plenty Of Fireworks\n\ni toggle caption Mark Wilson/Getty Images Mark Wilson/Getty Images\n\nThe nation greets the coming of July each year with fireworks on the National Mall and, days earlier, explosive decisions at the U.S. Supreme Court.\n\nWhile the Mall fireworks dissipate within moments, the court's decisions will have repercussions for decades...",http://www.npr.org/2014/07/06/329235293/rare-unanimity-in-supreme-court-term-with-plenty-of-fireworks,1,"(2014, 7)"
71325,npr,"CBS Lost Appetite For Government Watchdog Stories, Attkisson Says\n\nWhen the investigative reporter Sharyl Attkisson left CBS this year, she did not do so quietly. She contends the network refused to run stories that might damage President Obama.\n\nRENEE MONTAGNE, HOST:\n\nWhen investigative reporter Sharyl Attkisson left CBS this year, she did not go quietly. She contends, the network refus...",http://www.npr.org/2014/07/07/329420338/cbs-lost-appetite-for-government-watchdog-stories-attkisson-says,1,"(2014, 7)"
77539,npr,"Congress' Latest Death Match Involves A Bank You've Never Heard Of\n\ni toggle caption Drew Perine/MCT/Landov Drew Perine/MCT/Landov\n\nIt sits in an imposing building just across Lafayette Square from the White House. Yet the Export-Import Bank, which has been offering credit to foreign purchasers of U.S. goods for 80 years, could start shutting down operations within a matter of weeks.\n\n""T...",http://www.npr.org/2014/07/10/330434582/congress-latest-death-match-involves-a-bank-youve-never-heard-of,1,"(2014, 7)"
89149,npr,"Liverpool Unloads 'The Biter,' Sending Suarez To Barca For $128 Million\n\ni toggle caption Eitan Abramovich/AFP/Getty Images Eitan Abramovich/AFP/Getty Images\n\nLuis Suarez, the Uruguayan striker who became headline news in the U.S. after biting an Italian player during the World Cup, is moving to a new club. He'll play for Barcelona, after the team reached terms with Liverpool in a transfer...",http://www.npr.org/sections/thetwo-way/2014/07/11/330662428/liverpool-unloads-the-biter-sending-suarez-to-barca-for-128-million,1,"(2014, 7)"
101332,npr,"As Supreme Court Term Ends, Journalist Examines Its Decisions\n\nThe Supreme Court term ended Monday. The New York Times correspondent and lawyer Adam Liptak talks with Fresh Air's Terry Gross about what the decisions reveal about the nine justices.\n\nTERRY GROSS, HOST:\n\nThis is FRESH AIR. I'm Terry Gross. The Supreme Court term ended Monday with a controversial decision - the Hobby Lobby c...",http://www.npr.org/2014/07/02/327764940/journalist-examines-end-of-term-supreme-court-decisions,1,"(2014, 7)"
116489,npr,"California Nurses Union Braces For Contract Battle\n\ntoggle caption April Dembosky/KQED\n\nGoing to a union meeting of nurses is a little bit like going to an evangelical church service.\n\n""We all have to stand up, and it's a struggle,"" says Veronica Cambra, a nurse reporting a grievance at Kaiser Hospital in Fremont, Calif., as though she's giving testimony. ""And we will overcome this, OK?""...",http://www.npr.org/sections/health-shots/2014/07/23/332547848/california-nurses-union-braces-for-contract-battle,1,"(2014, 7)"
120057,npr,"The GOP's New Plan To Tackle Poverty: Helpful Or Hurtful?\n\nMICHEL MARTIN, HOST:\n\nThis is TELL ME MORE from NPR. I'm Michel Martin. We're going to start today talking about politics. Summer is supposed to be a slow time for political news but that's certainly not the case this year. There were some major headlines this week. There were conflicting court rulings pertaining to the Affordable ...",http://www.npr.org/2014/07/25/335294461/the-gops-new-plan-to-tackle-poverty-helpful-or-hurtful,1,"(2014, 7)"


In [328]:
def text_to_lda_model(list_of_document_strings, fitted_count_vectorizer_object, n_topics = 20):
    """Inputs: list of document strings & a fitted count vectorizer files (from sklearn)"""
    
    ########################### Step 1 - Convert Text --> Sparse Scipy Matrix  ###########################################################################
    from sklearn.feature_extraction.text import CountVectorizer
    step_description = "Step 1: Converts the list of strings into lists of one-hot vectors, trained to whatever dictionery was uesd int he fitted vectorized object"
    print "Working on", step_description
    print "It does this by: using fitted_count_vectorizer_object.transform(list_of_strings).transpose()"
    document_strings_as_sparse_matrix = fitted_count_vectorizer_object.transform(list_of_document_strings).transpose()  # The count_vectorized_document_strings object will have be v x d where v = entire vocabulary , and d = # of documents  
    print "Above step completed"
    
    ########################### Step 2 - Sparse Scipy Matrix --> Gensim friendly "corpus" object ###########################################################################
    from gensim import matutils
    step_description = "Step 2: We need to convert our sparse `scipy` matrix to a `gensim`-friendly object called a Corpus:"
    print "Working on", step_description 
    print "It does this by: using gensimmatutils.Sparse2Corpus(sparse scipy matrix)"
    corpus_object = matutils.Sparse2Corpus(document_strings_as_sparse_matrix)    # For each document, it prints out: (Word_id , count) across that document
    print "Above step completed"
    
    ########################### Step 3 - (Need as reference) Take fitted_count_vectorizer_object -> Get vocabulary items -> create dictionary of them. ####################
    step_description =  "Step 3: Map matrix rows to words (tokens) -- We need to save a mapping (dict) of row id to word (token) for later use by gensim:"
    print "Working on", step_description 
    print "It does this by: dict((v,k) for k,v in fitted_count_vectorizer_object.vocabulary_.iteritems())"
    id2word = dict((v,k) for k,v in fitted_count_vectorizer_object.vocabulary_.iteritems())   # length matches up to documents
    print "Above step completed"
    
    ########################### Step 4 - Create LDA Model  ##############################################################################################################
    from gensim import models
    step_description =  "Step 4: Creating the LDA model -- direchet distributions etc..."
    print "Working on", step_description
    print "It does this by: gensim.models.models.LdaModel(corpus_object, id2word=id2word, num_topics=n_topics,passes=10) )"
    lda_model = models.LdaModel(corpus_object, id2word=id2word, num_topics=n_topics,passes=10)
    print "returning lda_model with", num_topics," total topics"
    return lda_model
    print "Above step completed"

In [13]:
################ THis is what I'm Focusing on ################

unigram_count_vectorized_50k = count_vectorizer_unigram.transform(list_of_document_strings).transpose()
unigram_count_vectorized_50k.shape ##53114 documents, with 97854 tokens

In [66]:
#ng_vegs = count_vectorizer.transform(list_of_document_strings).transpose() #<- old

In [67]:
# ng_vegs.shape

(39167, 10576)

# 4) LDA

##### Convert to gensim
We need to convert our sparse `scipy` matrix to a `gensim`-friendly object called a Corpus:

In [69]:
# Convert sparse matrix of counts to a gensim corpus
#corpus = matutils.Sparse2Corpus(ng_vegs)

In [15]:
corpus_unigram_50k = matutils.Sparse2Corpus(unigram_count_vectorized_50k)

In [122]:
corpus_unigram_20k = matutils.Sparse2Corpus(unigram_count_vectorized_20k)

##### Map matrix rows to words (tokens)
We need to save a mapping (dict) of row id to word (token) for later use by gensim:

In [72]:
id2word = dict((v,k) for k,v in count_vectorizer.vocabulary_.iteritems())

In [125]:
# id2word_unigram_20k = dict((v,k) for k,v in count_vectorizer_unigram.vocabulary_.iteritems()) <-- old model

In [18]:
id2word_unigram_50k = dict((v,k) for k,v in count_vectorizer_unigram.vocabulary_.iteritems())

## LDA
At this point we can simply plow ahead in creating an LDA model.  It requires our corpus of word counts, mapping of row ids to words, and the number of topics (3).

In [75]:
# # 10 K models...
# # Create lda model (equivalent to "fit" in sklearn)
# lda = models.LdaModel(corpus, id2word=id2word, num_topics=3,passes=10)
# lda_3 = models.LdaModel(corpus, id2word=id2word, num_topics=3,passes=10) #<- if i ever have to reload this...

In [20]:
################ THis is what I'm Focusing on ################

lda_20topics_50k = models.LdaModel(corpus_unigram_50k, id2word=id2word_unigram_50k, num_topics=20,passes=10)

In [21]:
pickle.dump( lda_20topics_50k, open( "lda_20topics_50k.pickle", "wb" ) )

### LDA = Takes a Corpus -> and Outputs a Topic Space.

So, given a new corpus, it will output it onto a new topic space. This is helpful if you get new documents that you want to add to the corpus...

Let's take a look at what happened.  Here are the 5 most important words for each of the 3 topics we found:

In [29]:
lda_20topics_50k.print_topics(num_words = 20,num_topics=20)

[(0,
  u'0.014*court + 0.007*legal + 0.007*case + 0.007*laws + 0.005*congress + 0.005*justice + 0.005*irs + 0.005*supreme + 0.004*rules + 0.004*agency + 0.004*general + 0.004*decision + 0.004*rule + 0.004*constitution + 0.004*department + 0.004*rights + 0.004*judge + 0.004*executive + 0.004*power + 0.004*attorney'),
 (1,
  u'0.029*patients + 0.029*hospital + 0.026*medical + 0.022*hospitals + 0.021*information + 0.018*patient + 0.013*doctors + 0.012*physicians + 0.010*mondaq + 0.009*physician + 0.009*providers + 0.007*users + 0.007*primary + 0.006*doctor + 0.006*site + 0.006*services + 0.006*com + 0.006*practice + 0.006*provide + 0.006*medicine'),
 (2,
  u'0.013*city + 0.007*community + 0.006*york + 0.006*center + 0.005*event + 0.005*st + 0.005*school + 0.005*group + 0.005*photo + 0.005*university + 0.004*south + 0.004*members + 0.004*director + 0.004*board + 0.004*mayor + 0.003*john + 0.003*county + 0.003*native + 0.003*hall + 0.003*member'),
 (3,
  u'0.007*food + 0.007*water + 0.006*c

In [175]:
lda_20_unigram_20k.print_topics(num_words = 10)

[(0,
  u'0.022*company + 0.017*business + 0.010*companies + 0.008*technology + 0.006*market + 0.006*industry + 0.006*online + 0.005*facebook + 0.005*data + 0.005*based'),
 (1,
  u'0.016*religious + 0.014*rights + 0.012*church + 0.011*marriage + 0.011*catholic + 0.009*god + 0.008*gay + 0.007*freedom + 0.007*control + 0.007*gun'),
 (8,
  u'0.034*women + 0.025*school + 0.023*students + 0.022*children + 0.014*education + 0.014*college + 0.012*family + 0.011*young + 0.011*university + 0.010*child'),
 (12,
  u'0.022*cruz + 0.013*clinton + 0.010*military + 0.009*trump + 0.009*putin + 0.008*white + 0.008*war + 0.008*immigration + 0.007*syria + 0.007*russia'),
 (18,
  u'0.019*patients + 0.015*medical + 0.015*hospital + 0.014*information + 0.011*patient + 0.009*use + 0.008*hospitals + 0.008*doctors + 0.008*services + 0.007*mondaq'),
 (16,
  u'0.015*city + 0.011*county + 0.008*police + 0.007*community + 0.006*local + 0.006*center + 0.005*home + 0.005*area + 0.004*st + 0.004*council'),
 (6,
  u'0.

In [171]:
lda_20_trigram_20k.print_topics(num_words=10)

[(18,
  u'0.036*afterward adds + 0.023*allow higher + 0.017*arrangements handled simpson + 0.016*arrant + 0.014*appealing options + 0.009*arranger + 0.009*adding house republicans + 0.009*appealing policy + 0.009*ask foreign + 0.007*americans pay year'),
 (16,
  u'0.010*amerikanischen + 0.009*anobody com + 0.007*approved federal government + 0.006*allegedly asked + 0.005*agencies spend + 0.005*anobody com john + 0.005*adobes ariz opened + 0.004*act applied + 0.004*able cheerleaders + 0.004*ashley river'),
 (9,
  u'0.011*appeal months federal + 0.004*asked define + 0.002*acres wetlands + 0.001*approved bush + 0.001*agreement mr + 0.001*amy walter + 0.001*arrived exactly + 0.001*applause strong continued + 0.001*access energy + 0.001*abortion birth'),
 (13,
  u'0.005*acquisition majority + 0.004*asked really think + 0.003*ahead congressional + 0.003*america attempts + 0.003*americans carry + 0.003*adjustment purposes + 0.003*affect property + 0.002*airmen + 0.002*adherents winteregg + 0.

In [76]:
lda.print_topics(num_topics=3,num_words=10)

[(0,
  u'0.003*life + 0.003*city + 0.002*court + 0.002*world + 0.002*school + 0.002*women + 0.002*says + 0.002*old + 0.002*family + 0.002*home'),
 (1,
  u'0.011*mr + 0.004*romney + 0.004*democrats + 0.004*senate + 0.004*campaign + 0.003*ve + 0.003*conservative + 0.003*tax + 0.003*cruz + 0.003*america'),
 (2,
  u'0.006*coverage + 0.004*plans + 0.004*healthcare + 0.004*medicaid + 0.003*services + 0.003*medical + 0.003*cost + 0.003*tax + 0.003*pay + 0.003*says')]

In [172]:
lda_10topics.print_topics(num_topics=10,num_words=7)

[(0,
  u'0.004*says + 0.004*school + 0.003*old + 0.003*city + 0.003*home + 0.003*life + 0.003*really'),
 (1,
  u'0.010*ve + 0.008*america + 0.007*let + 0.006*world + 0.006*applause + 0.005*trump + 0.005*ll'),
 (2,
  u'0.012*coverage + 0.010*plans + 0.009*healthcare + 0.008*medicaid + 0.007*patients + 0.006*exchange + 0.006*services'),
 (3,
  u'0.013*information + 0.006*content + 0.005*county + 0.005*report + 0.004*police + 0.004*security + 0.004*use'),
 (4,
  u'0.013*court + 0.006*supreme + 0.006*case + 0.004*power + 0.004*mandate + 0.004*conservative + 0.004*decision'),
 (5,
  u'0.016*women + 0.009*abortion + 0.008*religious + 0.008*life + 0.005*rights + 0.004*church + 0.004*control'),
 (6,
  u'0.032*mr + 0.013*romney + 0.010*campaign + 0.006*advertisement + 0.005*governor + 0.005*voters + 0.005*election'),
 (7,
  u'0.016*senate + 0.012*democrats + 0.009*cruz + 0.008*shutdown + 0.006*gop + 0.006*vote + 0.005*sen'),
 (8,
  u'0.046*internet + 0.039*browser + 0.035*longer + 0.032*support

In [96]:
#lda_50topics.num_topics
lda_50topics.print_topics(num_topics=50,num_words=4)

[(0, u'0.018*senate + 0.013*democrats + 0.009*legislation + 0.007*vote'),
 (1, u'0.108*ohio + 0.101*biden + 0.079*vice + 0.065*kasich'),
 (2, u'0.050*christmas + 0.043*http + 0.024*comedy + 0.016*stewart'),
 (3, u'0.085*south + 0.050*north + 0.046*gun + 0.046*carolina'),
 (4, u'0.048*business + 0.040*company + 0.022*small + 0.018*businesses'),
 (5, u'0.081*cgi + 0.034*aarp + 0.026*illinois + 0.021*logica'),
 (6, u'0.038*religious + 0.022*church + 0.020*catholic + 0.012*christian'),
 (7, u'0.026*patients + 0.022*medical + 0.018*hospital + 0.015*doctors'),
 (8, u'0.029*white + 0.028*enrollment + 0.022*sign + 0.018*jan'),
 (9, u'0.049*employees + 0.036*hours + 0.025*employers + 0.025*mandate'),
 (10, u'0.025*kennedy + 0.019*que + 0.015*davis + 0.014*nov'),
 (11, u'0.063*cancer + 0.045*disease + 0.025*heart + 0.023*brain'),
 (12, u'0.021*ve + 0.012*got + 0.011*ll + 0.011*lot'),
 (13, u'0.045*school + 0.035*students + 0.027*education + 0.019*college'),
 (14,
  u'0.082*immigration + 0.067*ex

### The step above is basically the topic space that's created -- for a given topic, it gives the distribution of words...

#### Topic Space
If we want to map our documents to the topic space we need to actually use the LdaModel transformer that we created above, like so:

In [78]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)

lda_corpus = lda[corpus]

In [None]:
i = 0
for x in lda_corpus:
    i += 1
    if i >3:
        break
    print x

In [None]:
# Store the documents' topic vectors in a list so we can take a peak

lda_docs = [doc for doc in lda_corpus]

Now we can take a look at the document vectors in the topic space, which are measures of the component of each document along each topic.  Thus, at most a document vector can have num_topics=3 nonzero components in the topic space, and most have far fewer.

In [None]:
# Check out the document vectors in the topic space for the first 5 documents
for doc in lda_docs[0:5]:
    print doc
    print

In [16]:
len(google_news_dictionary_article_text.keys())

163184

<hr>

# Creating a usuable dataframe...

In [76]:
google_news_dictionary_article_text = pickle.load( open("google_news_dictionary_article_text.pickle", "rb" ))

In [202]:
news_domains = pd.read_csv("news_domains.csv")

In [203]:
news_domains["domain_stripped"] = news_domains["Domain"]
for i in range(len(news_domains["domain_stripped"])):
    news_domains["domain_stripped"][i] = news_domains["domain_stripped"][i].replace(".com","")
    news_domains["domain_stripped"][i] = news_domains["domain_stripped"][i].replace(".org","")

In [212]:
important_domains

['cnn',
 'huffingtonpost',
 'time',
 'npr',
 'slate',
 'newsweek',
 'usnews',
 'politico',
 'salon',
 'indymedia',
 'democraticunderground',
 'theatlantic',
 'villagevoice',
 'dailykos',
 'eschatonblog',
 'newyorker',
 'thedailybeast',
 'alternet',
 'commondreams',
 'crooksandliars',
 'buzzflash',
 'talkingpointsmemo',
 'moveon',
 'motherjones',
 'amnesty',
 'counterpunch',
 'thenation',
 'antiwar',
 'thinkprogress',
 'rawstory',
 'tnr',
 'plannedparenthood',
 'informationclearinghouse.info',
 'whatreallyhappened',
 'opednews',
 'bad.eserver',
 'politicalwire',
 'wsws',
 'aclu',
 'cidh.oas',
 'mediamatters',
 'feministing',
 'truthout',
 'drudge',
 'prospect',
 'harpers',
 'firedoglake',
 'truthdig',
 'wonkette',
 'americablog',
 'fivethirtyeight',
 'washingtonmonthly',
 'michaelmoore',
 'bartcop',
 'airamerica',
 'oliverwillis',
 'mydd',
 'americanprogress',
 'brookings.edu',
 'zmag',
 'foxnews',
 'wsj',
 'drudgereport',
 'nypost',
 'worldnetdaily',
 'newsmax',
 'freerepublic',
 'wash

In [214]:
35/float(160)

0.21875

In [211]:
len(important_domains)

120

In [219]:
google_news_df.keys()

Index([u'publisher', u'date', u'text', u'url', u'date_cleaned'], dtype='object')

In [280]:
import lonprgging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.summarization import summarize
# 1 Article -> 10% of article
summeryA = summarize(text,ratio=0.1)
# 1 Article -> 100 words
summeryB = summarize(text,word_count=100)

In [284]:
summeryB

u'This week: The GOP\'s victory a Florida special election, Rand Paul\'s emergence as the big winner at CPAC, and conservative critiques of Obama\'s appearance on "Between Two Ferns."\nThe CPAC presidential straw poll results are also meaningless, but I remain convinced that Rand Paul is the man to beat in the Republican race, for all the reasons I have outlined previously, and also because he persists as an outlier in his party on foreign policy.'

In [300]:
TextBlob(summeryA).sentences


[Sentence("This week: The GOP's victory a Florida special election, Rand Paul's emergence as the big winner at CPAC, and conservative critiques of Obama's appearance on "Between Two Ferns.""),
 Sentence("The CPAC presidential straw poll results are also meaningless, but I remain convinced that Rand Paul is the man to beat in the Republican race, for all the reasons I have outlined previously, and also because he persists as an outlier in his party on foreign policy."),
 Sentence("Yes, the incumbent Republican Senator John Cornyn easily beat a crackpot far-right challenger, Steve Stockman, but in the most closely contested race for arguably the most powerful office in Texas, Lieutenant Governor, the Republican incumbent, the very conservative David Dewhurst, received only 28 percent of the vote, losing to the very far right Tea Party darling, State Senator Dan Patrick, who received 42 percent."),
 Sentence("A number of prominent conservatives have spoken out against President Obama's Af

In [302]:
from vaderSentiment import vaderSentiment
from textblob import TextBlob

for sentence in TextBlob(summeryA).sentences:
    print vaderSentiment.sentiment(str(sentence))

In [278]:
summeryB

u'This week: The GOP\'s victory a Florida special election, Rand Paul\'s emergence as the big winner at CPAC, and conservative critiques of Obama\'s appearance on "Between Two Ferns."\nThe CPAC presidential straw poll results are also meaningless, but I remain convinced that Rand Paul is the man to beat in the Republican race, for all the reasons I have outlined previously, and also because he persists as an outlier in his party on foreign policy.'

In [225]:
summarize(text,ratio=0.1)

u'This week: The GOP\'s victory a Florida special election, Rand Paul\'s emergence as the big winner at CPAC, and conservative critiques of Obama\'s appearance on "Between Two Ferns."\nThe CPAC presidential straw poll results are also meaningless, but I remain convinced that Rand Paul is the man to beat in the Republican race, for all the reasons I have outlined previously, and also because he persists as an outlier in his party on foreign policy.\nYes, the incumbent Republican Senator John Cornyn easily beat a crackpot far-right challenger, Steve Stockman, but in the most closely contested race for arguably the most powerful office in Texas, Lieutenant Governor, the Republican incumbent, the very conservative David Dewhurst, received only 28 percent of the vote, losing to the very far right Tea Party darling, State Senator Dan Patrick, who received 42 percent.\nA number of prominent conservatives have spoken out against President Obama\'s Affordable Care Act\u2013promoting appearance 

In [229]:
google_news_df.keys()

Index([u'publisher', u'date', u'text', u'url', u'date_cleaned'], dtype='object')

In [237]:
pd.set_option('max_colwidth',400)

In [238]:
df_of_scopes_dates[df_of_scopes_dates["scope"]==(2014, 3)]

Unnamed: 0,scope,scope_type,tfidf_top10,top1,top2,top3,top4,top5,google_titles_for_top1,google_summeries_for_top1,google_titles_for_top2,google_summeries_for_top2,google_titles_for_top3,google_summeries_for_top3,google_titles_for_top4,google_summeries_for_top4,google_titles_for_top5,google_summeries_for_top5
7,"(2014, 3)",date,"[liberals conservatives, military presence, comes home, military spending pentagon, military ukraine, comes future, millennial americans, comers, millennials 2014, millennials 2014 slogging]",liberals conservatives,military presence,comes home,military spending pentagon,military ukraine,"[Why is Obamacare so controversial? - BBC News - BBC.com, Poll: Only Republicans and conservatives say Obamacare is 'too liberal', Obamacare: conservative logic versus liberal facts - Liberal Bias, Paul Krugman: “Obamacare IS the conservative alternative” - PNHP's ..., ObamaCare - Conservapedia, The Real Numbers On 'The Obamacare Effect' Are In-Now Let The ..., Patient Protection and Affordabl...","[[<em>Democrats</em>, ' ... deadline for individuals to avoid penalties is pushed back six weeks to , <em>March 2014</em>, ... The party and a veritable industry of , <em>conservative</em>, think tanks and advocacy ...], [<em>Democrats</em>, ' ... deadline for individuals to avoid penalties is pushed back six weeks to , <em>March 2014</em>, ... The party and a veritable industry of , <em>co...","[Why is Obamacare so controversial? - BBC News - BBC.com, Obamacare Extension Ends: The Tax Penalty and You | Military.com, Here's What Happens If You Don't Sign Up For Obamacare - Business ..., Obamacare's Exchanges Fall Well Short of Enrollment Target, Don't get spun by Internet rumors. - FactCheck.org, Budget experts: Move Tricare beneficiaries to Obamacare, Obamacare Penalties: 3 Things to...","[[<em>Obamacare</em>, , is the largest overhaul of the US ... Oct 23: The deadline for individuals to avoid penalties is pushed back six weeks to , <em>March 2014</em>, .... Illegal immigrants in the US often get better care than the nation's , <em>military</em>, veterans, ... Drone at , <em>Base</em>, 101 in Niger ...], [<em>Obamacare</em>, , is the largest overhaul of the US ... Oct 23: T...","[Why is Obamacare so controversial? - BBC News - BBC.com, ObamaCare Facts: Facts on the Affordable Care Act, ObamaCare | Health Insurance Exchange - ObamaCare Facts, Patient Protection and Affordable Care Act - Wikipedia, the free ..., The House has voted 54 times in four years on Obamacare. Here's the ..., Decision To Waive Obamacare Rules On Small Business Health ..., Key Features of the Aff...","[[<em>Obamacare</em>, , is the largest ... for individuals to avoid penalties is pushed back six weeks to , <em>March 2014</em>, ... Meanwhile, the , <em>House</em>, of Representatives, controlled by the Republicans, <wbr>, has ...</wbr>], [<em>Obamacare</em>, , is the largest ... for individuals to avoid penalties is pushed back six weeks to , <em>March 2014</em>, ... Meanwhile, the , <em>...","[Budget experts: Move Tricare beneficiaries to Obamacare, Sequestration's Impact on Military Spending, 2013 – 2014, China's Military Budget Rises as Pentagon's Retreats | The Fiscal Times, With $8.5 Trillion Unaccounted for, Why Should Congress Increase the ..., pentagon Archives - Global Campaign on Military Spending, Articles about Military Spending - latimes, 2014 United States federal budg...","[[<em>budget</em>, experts suggested the , <em>Pentagon</em>, trim its health , <em>budget</em>, by ... By Patricia Kime, Staff writer 4:41 p.m. EST December 21, , <em>2014</em>, ... The , <em>Defense</em>, Department could slash its enormous health care , <em>budget</em>, by requiring Tricare .... National Guard Bureau picks Vermont soldier's , <em>marching</em>, song.], [<em>budget</e...","[Articles from March, 2014 | PunditFact - PolitiFact, March | 2014 | New York Analysis of Policy and Government, Putin ends army exercise, Russian markets rally - CNBC.com, Russian Jets Called 'More Aggressive' Than During Cold War, Volunteer Battalion Prepares to Defend Ukraine's Capital, Annexation of Crimea by the Russian Federation - Wikipedia, the free ..., Russian military intervention i...","[[<em>March</em>, 30th, , <em>2014</em>, at 6:20 p.m.. , <em>March</em>, 30 shows take on NSA reforms, , <em>Obamacare</em>, and , <em>Ukraine</em>, . Will Congress pick up on ... Russia's , <em>military</em>, intervention in , <em>Ukraine</em>, had pundits and politicians talking. They had their facts ...], [<em>March</em>, 30th, , <em>2014</em>, at 6:20 p.m.. , <em>March</em>, 30 sh..."


In [204]:
important_domains = list(news_domains["domain_stripped"])

In [20]:
import re
import ast

url_list = []
domain_list = []
text_list = []
date_list = []

for url,info in google_news_dictionary_article_text.items():
    ############# 1 Get the url      #######################################################
    url_list.append(url)
    
    ############# 2 Extract out the domain from the url   #######################################################
    domain = url.replace("http://www.","").replace("http://","")
    #Find end of domain...
    end = domain.find(".com")
    if end == -1:   #If it doesn't find the .com ending, it checks if it's a ".org"...
        end = domain.find(".org")
    start = 0
    domain = domain[start:end]
    
    if domain.find(".") != -1:  # If it contains a "period"...
        start = domain.find(".")+1
        domain = domain[start:end]

    domain_list.append(domain)

    ############# 3 Get out the text #######################################################
    text_list.append(info["article_text"])

    ############# 4 Get the date... the hardest part...  #######################################################
    metadata_keys = info["article_metadata"].keys()    
    date = "no_date_found!"

    if "article_date_original" in metadata_keys:
        date = info["article_metadata"]["article_date_original"]
            
    if "date"  in metadata_keys:  # Used for cnn
        date = info["article_metadata"]["date"]

    if "pubDate" in metadata_keys:  # 
        date = info["article_metadata"]["pubDate"]
    
    if "sailthru.date" in metadata_keys:  # 
        date = info["article_metadata"]["sailthru.date"]

    if "publishdate" in metadata_keys:  # 
        date = info["article_metadata"]["publishdate"]
        
    if "dc.date" in metadata_keys:  # Fox News
        date = info["article_metadata"]["dc.date"]
    
    if "pubdate" in metadata_keys:  # CNN News
        date = info["article_metadata"]["pubdate"]
    
    if "date_published" in metadata_keys:  # USNEWS
        date = info["article_metadata"]["date_published"]
        
    if "article.published" in metadata_keys:  # WSJ
        date = info["article_metadata"]["article.published"]
     
    if "article" in metadata_keys:  # thinkprogress
        if "published_time" in info["article_metadata"]["article"]:
            date = info["article_metadata"]["article"]["published_time"]            
        
    if domain == "salon": 
        date = url[21:20+11]
        
    if domain == "washingtontimes":
        year_loc = re.findall("(\/[0-9]{4}\/)", url)[0]        
        start = url.find(year_loc) + 1
        date = url[start:start+11]   ############ ..../2015/nov/17/.....
        
    if domain == "townhall" or domain == "lifenews" or domain == "realclearpolitics" or domain == "dailykos" or domain == "newsbusters" or domain == "rushlimbaugh": # or domain == "npr":
        year_loc = re.findall("(\/[0-9]{4}\/)", url)[0]        
        start = url.find(year_loc) + 1
        date = url[start:start+10]   ######### .../2015/01/05/...
        
    if domain == "politico" or domain == "motherjones" or domain == "rawstory" or domain == "theatlantic" or domain == "heritage" or domain == "americanthinker":
        year_loc = re.findall("(\/[0-9]{4}\/)", url)[0]
        start = url.find(year_loc) + 1
        date = url[start:start+7]   # year + mo

    if domain == "talkingpointsmemo":
        dict_as_string =  info["article_metadata"]["parsely-page"]
        dict_as_string = str(dict_as_string)
        try:
            dict_as_dict = ast.literal_eval(dict_as_string)
            date = dict_as_dict["pub_date"]
        except:
            pass
            
    if domain == "huffingtonpost" : # Huffpost as has two varieties-- this is the other one.
        try:
            year_loc = re.findall("(\/[0-9]{4}\/)", url)[0]        
            start = url.find(year_loc) + 1
            date = url[start:start+10]   ######### .../2015/01/05/...
        except:
            pass

    bad_domains = ["newswithviews","opednews","aclu","npr","antiwar","slate","usnews","newsmax","outsidethebeltway","cnn","huffingtonpost","talkingpointsmemo","spectator","cnsnews","vdare","time","wonkette","gopusa","wsj","prospect","americanprogress","newsweek","lifesitenews"]   #go back to "time" later...
    if date == "no_date_found!":
        date = "N/A"
        if domain in important_domains and domain not in bad_domains:        
        #if domain in "huffingtonpost":
            print "ATTENTION!!! - IMPORTANT DOMAIN"
            print domain
            print url
            print info["article_metadata"].keys()
            print
            print info["article_metadata"]
            print "____________________________________________"
            print
    #elif date != "no_date_found!":
    
    date_list.append(date)

In [292]:
google_news_df = pd.DataFrame([domain_list,date_list,text_list,url_list]).T

In [None]:
dict_of_scopes

In [293]:
google_news_df.columns = ["publisher","date","text","url"]

In [27]:
len(google_news_dictionary_article_text.keys())

163184

In [294]:
google_news_df["date_cleaned"] = str(google_news_df["date"])
n = len(google_news_df["date"])
print n

163184


In [None]:
# Parsing the dates to create the date_cleaned columns...

In [55]:
for i in range(82200,n):
    date_old = str(google_news_df["date"][i])
    if i % 200 == 0:
        print i,
    new_date = parse(date_old)
    google_news_df["date_cleaned"][i] = new_date

    #print i, date_old, new_date

82200 82400 82600 82800 83000 83200 83400 83600 83800 84000 84200 84400 84600 84800 85000 85200 85400 85600 85800 86000 86200 86400 86600 86800 87000 87200 87400 87600 87800 88000 88200 88400 88600 88800 89000 89200 89400 89600 89800 90000 90200 90400 90600 90800 91000 91200 91400 91600 91800 92000 92200 92400 92600 92800 93000 93200 93400 93600 93800 94000 94200 94400 94600 94800 95000 95200 95400 95600 95800 96000 96200 96400 96600 96800 97000 97200 97400 97600 97800 98000 98200 98400 98600 98800 99000 99200 99400 99600 99800 100000 100200 100400 100600 100800 101000 101200 101400 101600 101800 102000 102200 102400 102600 102800 103000 103200 103400 103600 103800 104000 104200 104400 104600 104800 105000 105200 105400 105600 105800 106000 106200 106400 106600 106800 107000 107200 107400 107600 107800 108000 108200 108400 108600 108800 109000 109200 109400 109600 109800 110000 110200 110400 110600 110800 111000 111200 111400 111600 111800 112000 112200 112400 112600 112800 113000 1132

In [195]:
#pickle.dump( google_news_df, open( "google_news_df.pickle", "wb" ) )

google_news_df = pickle.load( open("google_news_df.pickle", "rb" ))




In [198]:
google_news_df.head()

Unnamed: 0,publisher,date,text,url,date_cleaned
0,philly,,But the haters argue that college success does...,http://www.philly.com/philly/columnists/201504...,
1,care2,,"Coming out of the Iowa caucuses, Ted Cruz appe...",http://www.care2.com/causes/ted-cruzs-healthca...,
2,timescall,2015-03-14T09:30:07-0600,"One evening last fall, Dr. D'Anne Rudden was h...",http://www.timescall.com/lifestyles/ci_2770718...,2015-03-14 15:30:07
3,pnj,,"In his newsletters, Panhandle U.S. Rep. Jeff M...",http://www.pnj.com/story/opinion/2015/07/18/vi...,
4,thefiscaltimes,,2016 could be the election year of political s...,http://www.thefiscaltimes.com/2015/06/07/Chaff...,


In [197]:
google_news_df.columns = ["publisher","date","text","url","date_cleaned"]

In [206]:
google_news_df_top_pubs = google_news_df[google_news_df["publisher"].isin(important_domains)]

In [207]:
# See how many of the 160,000 articles I have, have actually good dates... it appears to be about half

dates_count_all = 0
none_type_example = google_news_df["date_cleaned"][0]
for x in google_news_df["date_cleaned"]:
    if x != none_type_example:
        dates_count_all += 1

print dates_count_all        
# See how many of the 35,000 articles from top publishers, which have good dates -- nearly all. Yay!

dates_count_top = 0
for x in google_news_df_top_pubs["date_cleaned"]:
    if x != none_type_example:
        dates_count_top += 1

print dates_count_top

89985
33749


In [208]:
print "All articles in sample:", google_news_df.shape
print "All articles in sample w/ good dates:", dates_count_all
print
print "All articles by top domains:", google_news_df_top_pubs.shape
print "All articles by top domains w/ good dates:", dates_count_top # This is good - we get most of the dates
print "Final dimension of df of interest:", google_news_df_top_pubs_good_dates.shape

All articles in sample: (163184, 5)
All articles in sample w/ good dates: 89985

All articles by top domains: (35166, 5)
All articles by top domains w/ good dates: 33749
Final dimension of df of interest: (33749, 5)


In [210]:
google_news_df.describe()

Unnamed: 0,publisher,date,text,url,date_cleaned
count,163184,163184.0,163184.0,163184,89985
unique,8948,67100.0,150894.0,163184,62790
top,thefiscaltimes,,,http://www.pharmacytimes.com/publications/dire...,2014-01-28 00:00:00
freq,6162,72772.0,3953.0,1,197


In [None]:
# Removing all entries which have bad dates...

date_criteria_list = []
for date in google_news_df_top_pubs["date_cleaned"]:
    status = (date != none_type_example)
    date_criteria_list.append(status)

google_news_df_top_pubs_good_dates = google_news_df_top_pubs[date_criteria_list]
google_news_df_top_pubs_good_dates["count"] = 1

In [None]:
# Creating an extra column which focuses just on year and month of a given date/time object

dates_as_categorical_list = []
for x in google_news_df_top_pubs_good_dates["date_cleaned"]:
#    print x, type(x), x.month, x.year
    dates_as_categorical_list.append((x.year,x.month))

google_news_df_top_pubs_good_dates["date_cat"] = dates_as_categorical_list

In [None]:
del google_news_df_top_pubs_good_dates["date"]
del google_news_df_top_pubs_good_dates["date_cleaned"]

In [15]:
#pickle.dump( google_news_df_top_pubs_good_dates, open( "google_news_df_top_pubs_good_dates.pickle", "wb" ) )

google_news_df_top_pubs_good_dates = pickle.load( open("google_news_df_top_pubs_good_dates.pickle", "rb" ))

# Viewing the counts as groupby objects

In [16]:
google_news_df_top_pubs_good_dates.groupby(by = "date_cat", as_index=False).sum().head(7)

Unnamed: 0,date_cat,count
0,"(2013, 8)",1
1,"(2013, 9)",1079
2,"(2013, 10)",1037
3,"(2013, 11)",1168
4,"(2013, 12)",1195
5,"(2014, 1)",1382
6,"(2014, 2)",1117


In [17]:
google_news_df_top_pubs_good_dates.groupby(by = "publisher", as_index=False).sum()

Unnamed: 0,publisher,count
0,alternet,76
1,americablog,48
2,americanthinker,477
3,antiwar,1
4,cato,78
5,cnn,653
6,cnsnews,121
7,counterpunch,65
8,dailykos,375
9,fivethirtyeight,30


# Creating the Objects...

In [19]:
# Creates two grouped by objects -- which enables you to extract the texts and dates as needed...
# e.g: publishers.get_group("wsj")
# e.g. months.get_group((2015, 5))

publishers = google_news_df_top_pubs_good_dates.groupby(by="publisher", as_index=False)
months = google_news_df_top_pubs_good_dates.groupby(by="date_cat", as_index=False)

In [22]:
google_news_df_top_pubs_good_dates.keys()

Index([u'publisher', u'text', u'url', u'count', u'date_cat'], dtype='object')

In [23]:
date_keys = google_news_df_top_pubs_good_dates["date_cat"].unique()
date_keys = date_keys[:-1]  # Removes (2013, 8) from the list...

In [24]:
date_keys

array([(2014, 7), (2013, 9), (2015, 8), (2013, 10), (2015, 1), (2015, 5),
       (2014, 12), (2014, 6), (2016, 1), (2015, 6), (2015, 12), (2016, 3),
       (2015, 9), (2015, 4), (2015, 11), (2014, 9), (2014, 1), (2013, 11),
       (2015, 3), (2014, 10), (2013, 12), (2014, 8), (2015, 10), (2016, 2),
       (2016, 4), (2014, 4), (2014, 2), (2014, 3), (2014, 11), (2015, 2),
       (2015, 7), (2014, 5), (2016, 5)], dtype=object)

In [26]:
publisher_keys = google_news_df_top_pubs_good_dates["publisher"].unique()
publisher_keys = publisher_keys[:-1]  # Removes "antiwar" domain b/c it's too limited

In [27]:
publisher_keys

array(['thinkprogress', 'politico', 'salon', 'slate', 'townhall', 'cnn',
       'reason', 'foxnews', 'wsj', 'usnews', 'hotair', 'washingtontimes',
       'nationalreview', 'npr', 'huffingtonpost', 'newsmax',
       'weeklystandard', 'rawstory', 'lifenews', 'powerlineblog',
       'motherjones', 'thedailybeast', 'theatlantic', 'nypost', 'cnsnews',
       'realclearpolitics', 'dailykos', 'heritage', 'alternet',
       'americanthinker', 'newyorker', 'newsbusters', 'frontpagemag',
       'villagevoice', 'truthdig', 'wizbangblog', 'rushlimbaugh',
       'mediamatters', 'cato', 'talkingpointsmemo', 'wsws',
       'fivethirtyeight', 'thenation', 'americablog', 'counterpunch'], dtype=object)

In [32]:
dict_of_text_strings = {}

In [33]:
dict_of_scopes = {}

In [77]:
### For clean-up -- there's no reason to have a separate "dict_of_text_strings"... seems like everything can pivot off of the 
## dict of scopes, can fix later

for publisher in publisher_keys:
    # Part 1 - Getting the text information sorted out
    list_of_article_texts = list(publishers.get_group(publisher)["text"])
    dict_of_text_strings[publisher] = list_of_article_texts
    # Part 2 - Getting the urls & titles information sorted out    
    list_of_urls = list(publishers.get_group(publisher)["url"])
    list_of_titles = []
    for url in list_of_urls:
        list_of_titles.append(google_news_dictionary_article_text[url]["article_title"])
    
    
    # Part 3 - Putting it all together
    dict_of_scopes[publisher] = {"list_of_texts": list_of_article_texts,
                                 "list_of_titles" : list_of_titles,
                                "tfidf_top300": "",
                                "tfidf_top10": "",
                                "list_of_dates": list(publishers.get_group(publisher)["date_cat"]),
                                "list_of_urls": list_of_urls}

print "publishers info added to dictionary"
print

for date in date_keys:        
    # Part 1 - Getting the text information sorted out
    list_of_article_texts = list(months.get_group(date)["text"])
    dict_of_text_strings[date] = list_of_article_texts

    # Part 2 - Getting the urls & titles information sorted out    
    list_of_urls = list(months.get_group(date)["url"])
    list_of_titles = []
    for url in list_of_urls:
        list_of_titles.append(google_news_dictionary_article_text[url]["article_title"])
        
    # Part 3 - Putting it all together
    dict_of_scopes[date] = {"list_of_texts": list_of_article_texts,
                                 "list_of_titles" : list_of_titles,
                                "tfidf_top300": "",
                                "tfidf_top10": "",
                                "list_of_dates": list(publishers.get_group(publisher)["date_cat"]),
                                "list_of_urls": list_of_urls}

print "dates info added to dictionary"

publishers info added to dictionary

dates info added to dictionary


In [81]:
# Part 1 - Getting the text information sorted out
date = (2016, 5)
list_of_article_texts = list(months.get_group(date)["text"])
dict_of_text_strings[date] = list_of_article_texts

# Part 2 - Getting the urls & titles information sorted out    
list_of_urls = list(months.get_group(date)["url"])
list_of_titles = []
for url in list_of_urls:
    list_of_titles.append(google_news_dictionary_article_text[url]["article_title"])

# Part 3 - Putting it all together
dict_of_scopes[date] = {"list_of_texts": list_of_article_texts,
                             "list_of_titles" : list_of_titles,
                            "tfidf_top300": "",
                            "tfidf_top10": "",
                            "list_of_dates": list(publishers.get_group(publisher)["date_cat"]),
                            "list_of_urls": list_of_urls}


In [82]:
dict_of_scopes["(2016, 5)"]

KeyError: '(2016, 5)'

In [74]:
date_keys

array([(2013, 9), (2013, 10), (2013, 11), (2013, 12), (2014, 1), (2014, 2),
       (2014, 3), (2014, 4), (2014, 5), (2014, 6), (2014, 7), (2014, 8),
       (2014, 9), (2014, 10), (2014, 11), (2014, 12), (2015, 1), (2015, 2),
       (2015, 3), (2015, 4), (2015, 5), (2015, 6), (2015, 7), (2015, 8),
       (2015, 9), (2015, 10), (2015, 11), (2015, 12), (2016, 1), (2016, 2),
       (2016, 3), (2016, 4), (2016, 5)], dtype=object)

# Now to model the list of strings

In [567]:
def get_tfidf_list_300(scope):
    list_of_strings = dict_of_text_strings[scope]
    print "Parts complete: 1",

    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.95, min_df=2,ngram_range=(1,3))
    #vectorizer = TfidfVectorizer(stop_words="english", max_df=0.75, min_df=2,ngram_range=(1,3), )
    print "2",

    X = vectorizer.fit_transform(list_of_strings)
    print "3",

    indices = np.argsort(vectorizer.idf_)[::-1]
    print "4",

    features = vectorizer.get_feature_names()
    print "5",

    top_n = 300
    top_features = [features[i] for i in indices[:top_n]]
    print "6"

    print "These are the top 10 features for scope:", scope, top_features[0:10]
    return top_features




In [597]:
# Iterating across all the domains and generating a list of the top 300 tfidf words for a given domain...

for scope in dict_of_scopes.keys():
    print scope
    tfidf_top300 = get_tfidf_list_300(scope)
    tfidf_top10 = tfidf_top300[0:10]
    dict_of_scopes[scope]["tfidf_top300"] = tfidf_top300
    dict_of_scopes[scope]["tfidf_top10"] = tfidf_top10
    print

(2015, 11)
Parts complete: 1 2 3 4 5 6
These are the top 10 features for scope: (2015, 11) [u'life federation', u'democratic control', u'democratic candidates debate', u'notify employees', u'notify government', u'democratic candidate governor', u'noting pill', u'noting pill bottles', u'democratic base', u'notion common']

salon
Parts complete: 1 2 3 4 5 6
These are the top 10 features for scope: salon [u'\ufb01rst', u'libertarian ish', u'liberating power', u'liberation army', u'liberation movements', u'bad days', u'liberian president', u'liberian president ellen', u'bad culture', u'libertarian agenda']

(2015, 5)
Parts complete: 1 2 3 4 5 6
These are the top 10 features for scope: (2015, 5) [u'best deal', u'kaufax applauded measure', u'kazakhstan', u'kayaks', u'kay', u'kavanaugh', u'kaufax wrote', u'kaufax said', u'kaufax applauded', u'keepers']

politico
Parts complete: 1 2 3 4 5 6
These are the top 10 features for scope: politico [u'zurawik', u'mayor cory', u'matthew shapanka', u'mat

In [4]:
import pickle
import pandas as pd

In [35]:
#pickle.dump( dict_of_scopes, open( "dict_of_scopes.pickle", "wb" ) )

dict_of_scopes = pickle.load( open("dict_of_scopes.pickle", "rb" ))

In [36]:
dict_of_scopes["salon"].keys()

['list_of_texts',
 'list_of_titles',
 'tfidf_top10',
 'list_of_urls',
 'list_of_dates',
 'tfidf_top300']

In [38]:
scope_type_list = []
scope_list = []
tfidf_top10_list = []
top1_list = []
top2_list = []
top3_list = []
top4_list = []
top5_list = []


for scope in dict_of_scopes.keys():
    #1 Assign the scope in particular -- # scope = []       # "salon, etc... or (2016, 3)
    scope_list.append(scope)
    #print scope, 

    # Get the scope type -- # scope_type = []  # "date-pair or publisher
    scope_type = "date"
    if type(scope) == str:
        scope_type = "publisher"
    #print scope_type,
    scope_type_list.append(scope_type)

    # Extract out the top 5 keywoards
    tfidf_top10 = dict_of_scopes[scope]["tfidf_top10"]
    tfidf_top10_list.append(tfidf_top10)
    
    top1_list.append(tfidf_top10[0])
    top2_list.append(tfidf_top10[1])
    top3_list.append(tfidf_top10[2])
    top4_list.append(tfidf_top10[3])
    top5_list.append(tfidf_top10[4])               

In [39]:
df_of_scopes = pd.DataFrame([scope_list,scope_type_list,tfidf_top10_list,top1_list,top2_list, top3_list, top4_list, top5_list]).T
df_of_scopes.columns = ["scope","scope_type","tfidf_top10","top1","top2", "top3", "top4", "top5"]

In [306]:
df_of_scopes_dates[["scope","tfidf_top10"]].sort_values(by="scope")

Unnamed: 0,scope,tfidf_top10
44,"(2013, 9)","[legal counsel, happened government, paints picture, changes healthcare, pair new, paired, handout, pairs, handling use, pakistan history]"
24,"(2013, 10)","[preventing pregnancy, obamacare politics, obamacare patient, obamacare pays, doesn far, obamacare percent, obamacare perfect, obamacare phone, doesn explain, doesn exist]"
10,"(2013, 11)","[zuckerberg spent 25, including community, including american, including 10, includes broad, includes abortion, included targets, included public option, included public, included joe lieberman]"
25,"(2013, 12)","[legislators country, middle class poor, mikhail, milbank, common ground, mile away, common core state, miles away, miles day, miles headquarters]"
48,"(2014, 1)","[zuma press seeded, january 2014 dustinsiggins, january 6th, january 3rd felt, january 29, january 28th, january 27, january 25 2014, january 2017, january 2014 information]"
23,"(2014, 2)","[zurich, incur, incumbent rep, incumbent reelection aspiring, incumbent reelection, incumbent gov, incubator, incriminating, increasingly unpopular, increasingly lawless]"
7,"(2014, 3)","[liberals conservatives, military presence, comes home, military spending pentagon, military ukraine, comes future, millennial americans, comers, millennials 2014, millennials 2014 slogging]"
36,"(2014, 4)","[zuma, distaste, officer mozilla, officer executive, officer company, office told, office space, office says, disseminated, office people]"
41,"(2014, 5)","[lefties, induce, indonesian, indoctrination, individuals need, individualism, individual rights, individual members, individual market, individual liberty rule]"
53,"(2014, 6)","[zuckerberg jones, dramatic reductions, draw conclusions, draw attention, onstage, draper, oops, op ed pages, dramatically elevated, open air]"


In [312]:
url = get_google_search_url(scope_date = (2014,7), search_token="zurich")

In [320]:
x = 5
print x

In [323]:
dict_of_google_tag_searches[url][0][0]

u'What does Obamacare mean for expats? | US Tax & Financial Services'

In [322]:
dict_of_google_tag_searches[url][0][0]

[u'What does Obamacare mean for expats? | US Tax & Financial Services',
 u'ObamaCare News: Daily ObamaCare Updates - ObamaCare Facts',
 u'ObamaCare 2015 - Obamacare Facts',
 u'What we learned about Obamacare July 8-14, 2014 - AEI',
 u'Effects of a Minimum-Wage Increase on Employment and Family Income',
 u'News Article Archive from July 22, 2014 - Wsj.com - Wall Street Journal',
 u'Binary options no deposit bonus september 2013 dailymotion',
 u'News Update 224 (Sept. 2014) - American Citizens Abroad',
 u'Swiss consumer prices fall most since 1959 in July - timesofindia ...',
 u'Obamacare Insurance Cost Calculator | Auto Insurance Quotes ...']

In [334]:
my_strings = google_news_df_top_pubs_good_dates[["text"]][google_news_df_top_pubs_good_dates["date_cat"]==(2014,7)]

In [335]:
len(my_strings)

1171

In [255]:

df_of_scopes_dates[df_of_scopes_dates["scope"]== (2014, 7)]

Unnamed: 0,scope,scope_type,tfidf_top10,top1,top2,top3,top4,top5,google_titles_for_top1,google_summeries_for_top1,google_titles_for_top2,google_summeries_for_top2,google_titles_for_top3,google_summeries_for_top3,google_titles_for_top4,google_summeries_for_top4,google_titles_for_top5,google_summeries_for_top5
64,"(2014, 7)",date,"[zurich, immigration think, impact birth control, impact birth, immutable, immortal, immorality, imminently, immigration status, impact economy]",zurich,immigration think,impact birth control,impact birth,immutable,"[What does Obamacare mean for expats? | US Tax & Financial Services, ObamaCare News: Daily ObamaCare Updates - ObamaCare Facts, ObamaCare 2015 - Obamacare Facts, What we learned about Obamacare July 8-14, 2014 - AEI, Effects of a Minimum-Wage Increase on Employment and Family Income, News Article Archive from July 22, 2014 - Wsj.com - Wall Street Journal, Binary options no deposit bonus septem...","[[Update: , <em>Obamacare</em>, penalty tax may apply to expats · , <em>Obamacare</em>, and You. Tags, <wbr>: Affordable Care Act, expats, Healthcare Reform, <em>obamacare</em>.</wbr>], [Update: , <em>Obamacare</em>, penalty tax may apply to expats · , <em>Obamacare</em>, and You. Tags, <wbr>: Affordable Care Act, expats, Healthcare Reform, <em>obamacare</em>.</wbr>], [Update: , <em>Obamac...","[Kaiser Health Tracking Poll: July 2014 | The Henry J. Kaiser Family ..., 10 things immigrant families need to know about the Marketplace ..., Thousands of Illegal Immigrants Enrolled in Obamacare - Newsmax.com, ObamaCare News: Daily ObamaCare Updates - ObamaCare Facts, Patient Protection and Affordable Care Act - Wikipedia, the free ..., Immigrant Families Benefit Significantly from Obamacare...","[[Kaiser Health Tracking Poll: , <em>July 2014</em>, ... to other issues like the economy and jobs, the federal budget deficit, education, and , <em>immigration</em>, . .... Nearly six in ten Americans (58 percent) , <em>believe</em>, it is “very” or “somewhat” likely that ...], [Kaiser Health Tracking Poll: , <em>July 2014</em>, ... to other issues like the economy and jobs, the federal bu...","[ObamaCare Birth Control - Obamacare Facts, Birth Control Is Free Under Obamacare, But Not Everyone Got The ..., ObamaCare making free birth control standard for women | Fox News, The Affordable Care Act and cost of contraception - Journalist's ..., In Her Words: Obamacare's Negative Impact on Women - Senate ..., Contraceptive mandate (United States) - Wikipedia, the free ..., Patient Protecti...","[[ covers one type of , <em>birth control</em>, from each of 18 FDA approved ... women on the pill were projected to be paying $0 for it in , <em>2014</em>, (up from only 15% in 2012). ... As of , <em>July</em>, 2015 all plans, except for the following, must provide at least one ... since , <em>ObamaCare's</em>, requirement to provide free , <em>contraception</em>, took , <em>effect</em>,...","[Birth Control Is Free Under Obamacare, But Not Everyone Got The ..., ObamaCare Birth Control - Obamacare Facts, ObamaCare making free birth control standard for women | Fox News, In Her Words: Obamacare's Negative Impact on Women - Senate ..., The Affordable Care Act and cost of contraception - Journalist's ..., Happy Birthday, Obamacare: 5 Years Later, Fact ... - The Daily Signal, Birth cont...","[[<em>Obamacare's</em>, biggest selling points for women is the guarantee of no-cost , <em>birth</em>, control, ... 09/26/, <em>2014</em>, 04:34 pm ET | Updated Sep 29, , <em>2014</em>, ... how it's supposed to work, the , <em>Obamacare birth</em>, control mandate has had a huge , <em>impact</em>, : Many, ... In a survey conducted this , <em>July</em>, , 60 percent of people said they sup...","[What we learned about Obamacare July 8-14, 2014 - AEI, ObamaCare News: Daily ObamaCare Updates - ObamaCare Facts, A death blow for Obamacare? - The Boston Globe, King Barack Rewrites Obamacare… Again | The American Spectator, U.S. GAO - Patient Protection and Affordable Care Act: Preliminary ..., First wave of Obamacare destroys single practice doctors - Page 5 ..., Health Care BS - OBAMACARE...","[[What we learned about , <em>Obamacare July 8</em>, -, <em>14, 2014</em>, . View related content: Health Care. Image Credit: https://twitter.com/OFA. 1.) AEI's Joseph ...], [What we learned about , <em>Obamacare July 8</em>, -, <em>14, 2014</em>, . View related content: Health Care. Image Credit: https://twitter.com/OFA. 1.) AEI's Joseph ...], [What we learned about , <em>Obamacare July 8</em..."


In [256]:
#df_of_scopes_dates[df_of_scopes_dates["scope"]== (2016, 3)]

In [257]:
google_news_df_top_pubs_good_dates[google_news_df_top_pubs_good_dates["date_cat"]==(2014, 7)].shape

(1171, 5)

In [271]:
a = google_news_df_top_pubs_good_dates[google_news_df_top_pubs_good_dates["date_cat"]==(2014, 7)]
a[a["publisher"]== "npr"]

In [360]:
a[a["publisher"]== "theatlantic"]

Unnamed: 0,publisher,text,url,count,date_cat
7978,theatlantic,"Please consider disabling it for our site, or supporting our work in one of these ways\n\nWhy providing women more options to control pregnancies is in the interest of everyone\n\nIn a hotly contested decision this week, the Supreme Court ruled that for-profit employers can opt out of providing certain types of contraception coverage on religious grounds. Ostensibly, the holding is narrow: Onl...",http://www.theatlantic.com/health/archive/2014/07/the-broader-benefits-of-contraception/373856/,1,"(2014, 7)"
10504,theatlantic,"1. We can thank former President Bill Clinton for perfect clarity in his comments about the chaos and horror of Gaza. In an interview on Indian television, Clinton—who told us in his memoir that Palestinian self-destructiveness (in the form of Yasir Arafat’s various delusions and prevarications) undid his effort to bring about a two-state solution to the Middle East conflict—blames the Muslim ...",http://www.theatlantic.com/international/archive/2014/07/understanding-what-hamas-wants/374656/,1,"(2014, 7)"
11034,theatlantic,"Despite the news last week that America's healthcare spending will not be rising at the sky-high rate that was once predicted, the fact remains that the U.S. far outspends its peer nations when it comes to healthcare costs per capita. This year the United States will spend almost 18 percent of the gross domestic product (GDP) on healthcare—six percentage points more than the Netherlands, the n...",http://www.theatlantic.com/business/archive/2014/07/why-do-other-rich-nations-spend-so-much-less-on-healthcare/374576/,1,"(2014, 7)"
27354,theatlantic,"Let us know what we missed.\n\nJustin Bieber: A Case Study in Growing Up Cosseted and Feral\n\nVanessa Grigoriadis | Vulture\n\n""He sees himself as Brando, McQueen, Dean. We may see something different—a costume of machismo; a slip of a boy buffed up and doffing his shirt like a South Bronx stoopie in August; a white person fetishizing blackness with the laserlike focus of someone for whom 'be...",http://www.theatlantic.com/entertainment/archive/2014/07/pop-writing-july-1-2014/373758/,1,"(2014, 7)"
27405,theatlantic,"Researchers from the University of California San Diego and Brown University surveyed parents of first-time patients at a pediatric obesity clinic, assessing the families’ willingness to help their children lose weight. The patients, who ranged in age from 5 to 20, had all been classified as overweight or clinically obese, and most had been referred to the clinic for treatment by their regular...",http://www.theatlantic.com/health/archive/2014/07/study-many-parents-are-in-denial-about-their-kids-obesity/375023/,1,"(2014, 7)"
37094,theatlantic,"Louis C.K.'s regular-guy shtick permeates everything about his image: the plain black T-shirts, the self-deprecating humor, his Twitter bio (""I am a comedian and a person and a guy who is sitting here""), even his mass-emailing strategy. Louie—and it feels right to call him ""Louie"" precisely because of this guy-who-is-sitting-here image he's so expertly cultivated—is, of course, not an ordinary...",http://www.theatlantic.com/technology/archive/2014/07/the-brilliance-of-louis-cks-emails-he-writes-like-a-politician/374034/,1,"(2014, 7)"
44174,theatlantic,"The future of marriage, the future of Millennials: two topics the Internet loves to freak out about. Thanks to a new report from Pew, here the twain shall meet: Researchers asked people of all ages whether society is better off if people focus on getting married and having kids.\n\nAmerican Attitudes Toward Marriage and Kids\n\nLooking at this chart is a little like taking a Rorschach inkblot ...",http://www.theatlantic.com/national/archive/2014/07/millennials-society-will-be-just-fine-without-marriage/374060/,1,"(2014, 7)"
53009,theatlantic,"It is too surreal. I still feel like it was just yesterday when I picked up the report on the achievement gap in college. When we originally started working on PMP, we were thinking of national expansion—almost too early. We quickly learned how difficult it is to do our work and are still working to set reasonable expectations of growth. This summer we made changes to our business model. Now w...",http://www.theatlantic.com/education/archive/2014/07/meet-the-22-year-old-who-is-fixing-the-summer-achievement-gap/375024/,1,"(2014, 7)"
56869,theatlantic,"Please consider disabling it for our site, or supporting our work in one of these ways\n\nBefore Twitter existed, before Facebook allowed more than college students to join its network, two brothers made new media that went viral.\n\nTen years ago today, a small Flash animation firm based in Los Angeles uploaded a video to its website. The three-minute cartoon parodied the year’s presidential ...",http://www.theatlantic.com/technology/archive/2014/07/jibjabs-seminal-flash-parody-turns-10/374161/,1,"(2014, 7)"
60087,theatlantic,"Please consider disabling it for our site, or supporting our work in one of these ways\n\nWhen Team Snapchat announced the launch of Snapchat Stories last fall, it expanded the intimacy of the service so that users could “share your day with friends—or everyone.” More recently, though, the service is fixated on the concept of everywhere, with the addition of location-based filters and other ge...",http://www.theatlantic.com/technology/archive/2014/07/why-snapchat-cares-where-you-are/374733/,1,"(2014, 7)"


In [None]:
a[a["publisher"]== "npr"]

In [41]:
df_of_scopes[["scope","tfidf_top10"]][df_of_scopes["scope_type"]=="date"].sort_values(by="scope")

Unnamed: 0,scope,tfidf_top10
44,"(2013, 9)","[legal counsel, happened government, paints pi..."
24,"(2013, 10)","[preventing pregnancy, obamacare politics, oba..."
10,"(2013, 11)","[zuckerberg spent 25, including community, inc..."
25,"(2013, 12)","[legislators country, middle class poor, mikha..."
48,"(2014, 1)","[zuma press seeded, january 2014 dustinsiggins..."
23,"(2014, 2)","[zurich, incur, incumbent rep, incumbent reele..."
7,"(2014, 3)","[liberals conservatives, military presence, co..."
36,"(2014, 4)","[zuma, distaste, officer mozilla, officer exec..."
41,"(2014, 5)","[lefties, induce, indonesian, indoctrination, ..."
53,"(2014, 6)","[zuckerberg jones, dramatic reductions, draw c..."


In [13]:
# for row in df_of_scopes[["scope","tfidf_top10"]][df_of_scopes["scope_type"]=="date"].sort_values(by="scope"):
#     print row

In [43]:
def get_google_search_url(scope_date, search_token):
    search_year = scope_date[0]
    search_month = scope_date[1]
    #print search_year, search_month

    google_prefix = "https://www.google.com/search?q="
    google_suffix = calendar.month_name[search_month] + "+" + str(search_year)+"+"+search_token.replace(" ","+") +"+" +"obamacare"
    google_url = google_prefix + google_suffix
    return google_url

In [51]:
def get_google_results(google_url):
    """Input: Enter in a google url query
       Output: a tuple (list_of_titles, list_of_links, list_of_summaries, list_of_dates)"""
    
    driver.get(google_url)
    elem = driver.find_element_by_xpath("//*");
    html_code = elem.get_attribute("innerHTML");
    soup = BeautifulSoup(html_code, "lxml");

    div_tags = soup.findAll('div',attrs={ "class" : "rc" })   # List of all results on page -- each div is a separate result

    link_title_list = []
    link_url_list = []
    result_date_list = []
    result_summary_list = []

    for i in range(len(div_tags)):
        link_url = div_tags[i].find("a")["href"]
        # print link_url
        link_title = div_tags[i].find("a").contents[0]


        result_summary = div_tags[0].findAll("span", attrs={ "class" : "st" })[0].contents[1:]

        link_title_list.append(link_title)
        link_url_list.append(link_url)
        result_summary_list.append(result_summary)

    results_tuple = (link_title_list,link_url_list,result_summary_list)
    print link_title_list
    return results_tuple

In [52]:
dict_of_google_tag_searches = {}
driver = webdriver.Firefox()

In [None]:
6 hours => 360 minutes

In [810]:
len(date_pairs) *10 # Total number of searches

330

In [None]:
date_keys.sort()

In [50]:
date_keys

array([(2013, 9), (2013, 10), (2013, 11), (2013, 12), (2014, 1), (2014, 2),
       (2014, 3), (2014, 4), (2014, 5), (2014, 6), (2014, 7), (2014, 8),
       (2014, 9), (2014, 10), (2014, 11), (2014, 12), (2015, 1), (2015, 2),
       (2015, 3), (2015, 4), (2015, 5), (2015, 6), (2015, 7), (2015, 8),
       (2015, 9), (2015, 10), (2015, 11), (2015, 12), (2016, 1), (2016, 2),
       (2016, 3), (2016, 4), (2016, 5)], dtype=object)

In [None]:
for token in dict_of_scopes["(2016,5)"]["tfidf_top10"]:
    time.sleep(15) # Sleeps for 1 minute -- 6 hours is 360 minutes, and there are 330 pairs in total
    print date, token,
    google_url = get_google_search_url(scope_date=date, search_token = token)
    results_tuple = get_google_results(google_url)
    dict_of_google_tag_searches[google_url] = results_tuple
    dict_of_google_tag_searches[(date,token)] = results_tuple
    print

In [53]:
for date in date_keys:    
    for token in dict_of_scopes[date]["tfidf_top10"]:
        time.sleep(15) # Sleeps for 1 minute -- 6 hours is 360 minutes, and there are 330 pairs in total
        print date, token,
        google_url = get_google_search_url(scope_date=date, search_token = token)
        results_tuple = get_google_results(google_url)
        dict_of_google_tag_searches[google_url] = results_tuple
        dict_of_google_tag_searches[(date,token)] = results_tuple
        print

(2013, 9) legal counsel [u'US government shutdown: House votes to delay Obamacare law | US ...', u'What Is Obamacare? A Legal Overview - Law and Daily Life - FindLaw', u"What's in a name? Lots when it comes to Obamacare/ACA - CNBC.com", u'Read the Law | HHS.gov', u'Office of the General Counsel September 3, 2013 Submitted ...', u'Patient Protection and Affordable Care Act - Wikipedia, the free ...', u'United States federal government shutdown of 2013 - Wikipedia, the ...', u'Top 16 myths about the health care law | PolitiFact', u'Remarks by the President on the Affordable Care Act | whitehouse.gov', u'Can They Crush Obamacare? by David Cole | The New York Review ...']

(2013, 9) happened government [u'United States federal government shutdown of 2013 - Wikipedia, the ...', u'Government shutdown in the United States - Wikipedia, the free ...', u'Why Did The U.S. Government Shut Down In October 2013? - Forbes', u'A Brief History of the 2013 Government Shutdown | Mediaite', u'US governmen

KeyError: (2016, 5)

In [73]:
dict_of_scopes["(2016,5)"]

KeyError: '(2016,5)'

In [114]:
df_of_scopes_dates = df_of_scopes[df_of_scopes["scope_type"] == "date"]

In [115]:
df_of_scopes_dates = df_of_scopes_dates.sort_values(by = "scope")

In [116]:
df_of_scopes_dates

Unnamed: 0,scope,scope_type,tfidf_top10,top1,top2,top3,top4,top5
44,"(2013, 9)",date,"[legal counsel, happened government, paints pi...",legal counsel,happened government,paints picture,changes healthcare,pair new
24,"(2013, 10)",date,"[preventing pregnancy, obamacare politics, oba...",preventing pregnancy,obamacare politics,obamacare patient,obamacare pays,doesn far
10,"(2013, 11)",date,"[zuckerberg spent 25, including community, inc...",zuckerberg spent 25,including community,including american,including 10,includes broad
25,"(2013, 12)",date,"[legislators country, middle class poor, mikha...",legislators country,middle class poor,mikhail,milbank,common ground
48,"(2014, 1)",date,"[zuma press seeded, january 2014 dustinsiggins...",zuma press seeded,january 2014 dustinsiggins,january 6th,january 3rd felt,january 29
23,"(2014, 2)",date,"[zurich, incur, incumbent rep, incumbent reele...",zurich,incur,incumbent rep,incumbent reelection aspiring,incumbent reelection
7,"(2014, 3)",date,"[liberals conservatives, military presence, co...",liberals conservatives,military presence,comes home,military spending pentagon,military ukraine
36,"(2014, 4)",date,"[zuma, distaste, officer mozilla, officer exec...",zuma,distaste,officer mozilla,officer executive,officer company
41,"(2014, 5)",date,"[lefties, induce, indonesian, indoctrination, ...",lefties,induce,indonesian,indoctrination,individuals need
53,"(2014, 6)",date,"[zuckerberg jones, dramatic reductions, draw c...",zuckerberg jones,dramatic reductions,draw conclusions,draw attention,onstage


In [243]:
df_of_scopes_dates.head()

Unnamed: 0,scope,scope_type,tfidf_top10,top1,top2,top3,top4,top5,google_titles_for_top1,google_summeries_for_top1,google_titles_for_top2,google_summeries_for_top2,google_titles_for_top3,google_summeries_for_top3,google_titles_for_top4,google_summeries_for_top4,google_titles_for_top5,google_summeries_for_top5
44,"(2013, 9)",date,"[legal counsel, happened government, paints picture, changes healthcare, pair new, paired, handout, pairs, handling use, pakistan history]",legal counsel,happened government,paints picture,changes healthcare,pair new,"[US government shutdown: House votes to delay Obamacare law | US ..., What Is Obamacare? A Legal Overview - Law and Daily Life - FindLaw, What's in a name? Lots when it comes to Obamacare/ACA - CNBC.com, Read the Law | HHS.gov, Office of the General Counsel September 3, 2013 Submitted ..., Patient Protection and Affordable Care Act - Wikipedia, the free ..., United States federal government sh...","[[US government shutdown: House votes to delay , <em>Obamacare law</em>, ... Sunday 29 , <em>September 2013</em>, 12.30 EDT Last modified on Wednesday 11 May .... About half of the Defense , <em>Department's</em>, civilian employees – about 800,000 ...], [US government shutdown: House votes to delay , <em>Obamacare law</em>, ... Sunday 29 , <em>September 2013</em>, 12.30 EDT Last modified...","[United States federal government shutdown of 2013 - Wikipedia, the ..., Government shutdown in the United States - Wikipedia, the free ..., Why Did The U.S. Government Shut Down In October 2013? - Forbes, A Brief History of the 2013 Government Shutdown | Mediaite, US government shutdown: House votes to delay Obamacare law | US ..., Government shutdown: What you need to know - CNNPolitics.com,...","[[<em>government</em>, entered a shutdown ..... Meese's coalition produced a ""blueprint to defunding , <em>Obamacare</em>, "", <wbr>. .... With Congress having failed to agree by late <em>September 2013</em> on the budget ...... asserted that ""none of what is <em>happening</em> in Washington diminishes one iota<wbr> ...</wbr></wbr>], [<em>government</em>, entered a shutdown ..... Meese's coal...","[ObamaCare News: Daily ObamaCare Updates - ObamaCare Facts, Obamacare ""has never been favored by a majority of Americans ..., We Mapped the Uninsured. You'll Notice a Pattern. - The New York ..., One Key Thing No One Knows About Obamacare : Planet Money : NPR, Reality Check: Obamacare Edition - gop.gov, Opinion: Worried about Obamacare? Fear not - CNN.com, Study: Obamacare gave 10 million Amer...","[[<em>September</em>, 26th, 2014 by , <em>ObamaCare</em>, Facts ... 36% to 23% from , <em>2013</em>, to the closing the Health Insurance Marketplace's first open enrollment period. ..... A recent article from the New York Post does a great job at , <em>painting</em>, a , <em>picture</em>, of ...], [<em>September</em>, 26th, 2014 by , <em>ObamaCare</em>, Facts ... 36% to 23% from , <em>2...","[How Obamacare Changed Health Insurance ... Maybe - Forbes, Key Features of the Affordable Care Act By Year | HHS.gov, Current trends in health care: A new landscape with Obamacare ..., 70 Changes to ObamaCare… — So Far | Galen Institute, Health Care Industry Spent $243 Million in 2013 Lobbying - Breitbart, 5 ways Obamacare affects hospitals, doctors and more | NJ.com, What's in a name? Lots w...","[[Nearly 22.8 million people have gained , <em>health insurance</em>, since the ... insurance grew by about 8 million people between , <em>September 2013</em>, and ...], [Nearly 22.8 million people have gained , <em>health insurance</em>, since the ... insurance grew by about 8 million people between , <em>September 2013</em>, and ...], [Nearly 22.8 million people have gained , <em>health ...","[Ted Cruz pulls all-nighter as marathon anti-Obamacare speech churns ..., ObamaCare Facts: Facts on the Affordable Care Act, How unpopular or popular is Obamacare? - The Washington Post, Is the Affordable Care Act Working? - The New York Times, Short-term health insurance, a low-cost alternative to Obamacare |, Senate defeats Cruz filibuster, passes bill that funds Obamacare ..., GOP's Anti-Ob...","[[Wednesday 25 , <em>September 2013</em>, 09.15 EDT Last modified on ... he had swapped his usual ostrich-skin ""argument boots"" for a , <em>pair</em>, of .... Republican senator Ted Cruz launches marathon anti-, <em>Obamacare</em>, speech ...... US , <em>News</em>, .], [Wednesday 25 , <em>September 2013</em>, 09.15 EDT Last modified on ... he had swapped his usual ostrich-skin ""argument bo..."
24,"(2013, 10)",date,"[preventing pregnancy, obamacare politics, obamacare patient, obamacare pays, doesn far, obamacare percent, obamacare perfect, obamacare phone, doesn explain, doesn exist]",preventing pregnancy,obamacare politics,obamacare patient,obamacare pays,doesn far,"[Why Is Maternity Care Such an Issue for Obamacare Opponents ..., ObamaCare Stories: Real Life Stories on ObamaCare, Yes, men should pay for pregnancy coverage, and here's why - latimes, How Obamacare changed maternity coverage - HealthInsurance.org, The Cheapest Shot Against Obamacare, and Why Men Should Pay for ..., The Definitive Guide to How Obamacare is Destroying American Lives ..., CBS'...","[[<em>2013</em>, ; Politics. , <em>Pregnant Obamacare</em>, supporter Karmel Allison nearly faints during President Obama's , <em>October</em>, Rose Garden speech about health care. .... The resentment , <em>against</em>, women who have babies on their own has been part and parcel of the GOP moral argument for decades; in 2012, ...], [<em>2013</em>, ; Politics. , <em>Pregnant Obamacare</em>...","[Rough Obamacare rollout: 4 reasons why - CNNPolitics.com - CNN.com, ObamaCare 2013 - ObamaCare Facts, Ben Carson: Obamacare worst thing 'since slavery' - The Washington ..., President Obama's Oct. 1 remarks on the government shutdown, 20 Obamacare Stats Republicans Don't Want You to See | Mother Jones, The Politics of ObamaCare, Social Security, and Medicare | Scholars ..., From October 2013:...","[[<em>Obamacare</em>, rollout: 4 reasons why. By Tom Cohen, CNN. Updated 9:17 AM ET, Wed , <em>October</em>, 23, , <em>2013</em>, ... Red flags missed on , <em>Obamacare</em>, site 02:31 ... Some reasons , <em>political</em>, , others technical but presidential oversell appears to be one.], [<em>Obamacare</em>, rollout: 4 reasons why. By Tom Cohen, CNN. Updated 9:17 AM ET, Wed , <em>Octob...","[ObamaCare 2013 - ObamaCare Facts, ObamaCare Facts: Facts on the Affordable Care Act, ObamaCare | Health Insurance Exchange - ObamaCare Facts, No, Obamacare Doesn't 'Start' Oct. 1, But Here's What Changes ..., Key Features of the Affordable Care Act By Year | HHS.gov, Key Features of the Affordable Care Act | HHS.gov, ObamaCare (Affordable Care Act) Is Not An Insurance Or Healthcare ..., Patie...","[[<em>Oct 1st, 2013</em>, and closes March 31st, ... , <em>ObamaCare</em>, 2013 Provisions: Doctors that take Medicaid , <em>patients</em>, got a pay ...], [<em>Oct 1st, 2013</em>, and closes March 31st, ... , <em>ObamaCare</em>, 2013 Provisions: Doctors that take Medicaid , <em>patients</em>, got a pay ...], [<em>Oct 1st, 2013</em>, and closes March 31st, ... , <em>ObamaCare</em>, 201...","[ObamaCare 2013 - ObamaCare Facts, ObamaCare | Health Insurance Exchange - ObamaCare Facts, ObamaCare Facts: Facts on the Affordable Care Act, ObamaCare Stories: Real Life Stories on ObamaCare, In Obamacare, Congress must buy insurance from marketplaces but ..., 10 things Obamacare supporters say that aren't entirely true | PolitiFact, Obamacare: Unfair to the young middle class, punished enou...","[[<em>Oct 1st, 2013</em>, and closes March 31st, ... , <em>ObamaCare</em>, 2013 Provisions: Doctors that take Medicaid patients got a , <em>pay</em>, ...], [<em>Oct 1st, 2013</em>, and closes March 31st, ... , <em>ObamaCare</em>, 2013 Provisions: Doctors that take Medicaid patients got a , <em>pay</em>, ...], [<em>Oct 1st, 2013</em>, and closes March 31st, ... , <em>ObamaCare</em>, 201...","[20 Obamacare Stats Republicans Don't Want You to See | Mother Jones, Obamacare's Website Is Crashing Because It Doesn't Want You To ..., ObamaCare 2013 - ObamaCare Facts, Why Does Obamacare Suck So Much? - Obamacare Facts, No, Obamacare Doesn't 'Start' Oct. 1, But Here's What Changes ..., 70 Changes to ObamaCare… — So Far | Galen Institute, How is Obamacare Doing So Far? | BillMoyers.com, Cal...","[[... in , <em>October 2013</em>, , a measly 106,000 people signed up for new health plans in its first ... Number of adults without health insurance in 2013: 41 million. ... Number of times Congress has voted to repeal , <em>Obamacare</em>, (so , <em>far</em>, ): 56.], [... in , <em>October 2013</em>, , a measly 106,000 people signed up for new health plans in its first ... Number of adults ..."
10,"(2013, 11)",date,"[zuckerberg spent 25, including community, including american, including 10, includes broad, includes abortion, included targets, included public option, included public, included joe lieberman]",zuckerberg spent 25,including community,including american,including 10,includes broad,"[FWD.us - Wikipedia, the free encyclopedia, Zuckerberg Defends Obamacare Website - Breitbart, 2013: Year of the Left-Wing Billionaire | Power Line, Facebook Spent $12.5 Million to Protect Zuckerberg Since 2013 ..., Archive for November, 2013 | Politicus USA | Page 3, Mark Zuckerberg's Political Group Is Trashing Obama And Obamacare ..., 23 | November | 2013 | Netter Capital Partners | Page 2, ...","[[<em>Zuckerberg</em>, . .... in the Arctic National Wildlife Refuge, along with support for repealing , <em>Obamacare</em>, . ... In , <em>November 2013</em>, , FWD.us announced that it had hired Darius Contractor as ...], [<em>Zuckerberg</em>, . .... in the Arctic National Wildlife Refuge, along with support for repealing , <em>Obamacare</em>, . ... In , <em>November 2013</em>, , FWD.us anno...","[Obamacare: Seven Major Provisions And How They Affect You - Forbes, Obamacare's Website Won't Be Working By November 30 -- But What ..., ObamaCare Facts: Facts on the Affordable Care Act, The Affordable Care Act and Infertility - RESOLVE: The National ..., November 2013 - Consumer Action, How Obamacare Will Affect Trans Folks and Families | Advocate.com, International Profiles of Health Care ...","[[<em>Obamacare</em>, , or the PPACA if you prefer, will drastically change major aspects of “, <wbr>the finest healthcare system in the world.” Without question ...</wbr>], [<em>Obamacare</em>, , or the PPACA if you prefer, will drastically change major aspects of “, <wbr>the finest healthcare system in the world.” Without question ...</wbr>], [<em>Obamacare</em>, , or the PPACA if you prefer...","[Why is Obamacare so controversial? - BBC News - BBC.com, Americans' Approval of Healthcare Law Declines - Gallup, ObamaCare Facts: Facts on the Affordable Care Act, US Citizens Abroad and OBAMACARE – UPDATED August 1 2015 ..., Statement by the President on the Affordable Care Act | whitehouse.gov, Obama apologizes for insurance cancellations due to Obamacare ..., Obamacare exposed: The guttin...","[[<em>Americans</em>, think of , <em>Obamacare</em>, today? ... The phased rollout of the law has been bedeviled by setbacks, , <em>including</em>, : ... Nov 14: Mr Obama announces insurers can keep customers on existing plans ... , <em>15 November 2013</em>, ...], [<em>Americans</em>, think of , <em>Obamacare</em>, today? ... The phased rollout of the law has been bedeviled by setbacks, ...","[Obamacare's Website Won't Be Working By November 30 -- But What ..., ObamaCare Facts: Facts on the Affordable Care Act, ObamaCare | Health Insurance Exchange - ObamaCare Facts, Obamacare Enrollment Numbers Spike In November - Business Insider, 258,000 Americans Enrolled In Obamacare Exchange Health Plans In ..., Obama apologizes for insurance cancellations due to Obamacare ..., Americans' App...","[[<em>Obamacare</em>, is working by next November. ... , <em>Nov 30, 2013 @ 10:42 AM</em>, views ... The Apothecary's authors , <em>include</em>, : Josh Archambault of the Foundation for .... Plus we'll count the , <em>10</em>, million new enrollees in the Medicaid program as ...], [<em>Obamacare</em>, is working by next November. ... , <em>Nov 30, 2013 @ 10:42 AM</em>, views ... The Apot...","[In fourth year, Obamacare exchange eyes broader role | The CT Mirror, Obamacare a single-payer ploy, says ex-GOP Senator - CNBC.com, White House orders broader Obamacare health plans in 2015 ..., Strategic Move Exempts Health Law From Broader U.S. Statute ..., ObamaCare News: Daily ObamaCare Updates - ObamaCare Facts, Health Insurers' Decisions on Insurance Exchange Participation, Obamacare |...","[[The , <em>broader</em>, mission, as Wadleigh views it, will , <em>include</em>, efforts to ensure that people are using their coverage to get preventive care, to push for ...], [The , <em>broader</em>, mission, as Wadleigh views it, will , <em>include</em>, efforts to ensure that people are using their coverage to get preventive care, to push for ...], [The , <em>broader</em>, mission, ..."
25,"(2013, 12)",date,"[legislators country, middle class poor, mikhail, milbank, common ground, mile away, common core state, miles away, miles day, miles headquarters]",legislators country,middle class poor,mikhail,milbank,common ground,"[Obamacare faces new threat at state level from corporate interest ..., Patient Protection and Affordable Care Act - Wikipedia, the free ..., United States federal government shutdown of 2013 - Wikipedia, the ..., Massachusetts health care reform - Wikipedia, the free encyclopedia, Why is Obamacare so controversial? - BBC News - BBC.com, Obamacare's Medicaid Expansion Is Helping The Uninsured ...","[[Republican , <em>legislators</em>, and rightwing lobbying group draft proposal for state ... Wednesday 20 November , <em>2013</em>, 11.07 EST Last modified on Wednesday 11 May .... in , <em>December</em>, at the council's next nationwide summit in Washington. .... Are you saying that handing over the , <em>country</em>, to corporations will make it ...], [Republican , <em>legislators</em...","[New Health Law Frustrates Many in Middle Class - NYTimes.com, High Obamacare Costs Creating New Middle Class of Uninsured, ObamaCare Stories: Real Life Stories on ObamaCare, Why Obamacare fails poor and middle class - CNN.com, The Forgotten Middle Class of Obamacare: “not poor enough for help ..., What Obamacare Did For Our Middle-Class Family - Smartter Each Day, New York Times *finally* tel...","[[Another Rule in Health Law Is Scaled Back (, <em>December 20, 2013</em>, ) ... Chapmans are caught in the uncomfortable , <em>middle</em>, : not , <em>poor</em>, enough for help, ... “Everybody was thinking that , <em>Obamacare</em>, was going to come in with more ...], [Another Rule in Health Law Is Scaled Back (, <em>December 20, 2013</em>, ) ... Chapmans are caught in the uncomfortable ...","[The Obamacare We Deserve - The New York Times, Obamacare: 365,000 have signed up for insurance on exchanges ..., Some Obamacare insured start new year in limbo - Dec. 31, 2013, As Predicted, ObamaCare Plunges Into 'Utter Chaos' - Forbes, The impossible trinity of ObamaCare | Fox News, Obamacare Horror Stories | Citizens Against Government Waste, Michael Moore: Obamacare sends over $100 billio...","[[Now that , <em>Obamacare</em>, has finally arrived, liberals can stop defending its ... The , <em>Obamacare</em>, We Deserve. By , <em>MICHAEL</em>, MOORE , <em>DEC. 31, 2013</em>, .], [Now that , <em>Obamacare</em>, has finally arrived, liberals can stop defending its ... The , <em>Obamacare</em>, We Deserve. By , <em>MICHAEL</em>, MOORE , <em>DEC. 31, 2013</em>, .], [Now that , <em>O...","[Dana Milbank: In Obamacare speech, Obama makes a desperate ..., Milbank Memorial Fund - Milbank Highlights December 2013, A Picture of Progress on Hospital Errors - Milbank Memorial Fund, The Perils of Health Care Nostalgia - NCBI - National Institutes of Health, A Picture of Progress on Hospital Errors, 2013 December - ShrinkOurGovernment.com, RealClearPolitics December 17, 2013 Archives, Mi...","[[<em>Obamacare</em>, fails, so will this president and his party. ... Got it. By Dana , <em>Milbank</em>, Opinion writer. Opinions. October 21, , <em>2013</em>, ... “No one who decides to purchase a plan has to pay their first premium until , <em>December</em>, 15th. And unlike the day after ...], [<em>Obamacare</em>, fails, so will this president and his party. ... Got it. By Dana , <em...","[Lie of the Year: 'If you like your health care plan, you can keep it ..., Obamacare Health Insurance COOPs Are Unraveling | NCPA, Al Franken Finds Common Ground With Tea Party Conservatives On ..., Reagan Would Have Found Common Ground With Tea Party - Breitbart, PUBLIC NOTICE: Wisconsin: Common Ground Co-Op *FULLY ..., Obamacare Website, Carlos Danger: Democrats' Most Embarrassing ..., Commo...","[[<em>Thursday, December 12th, 2013 at 4:44 p.m.</em>, ... but friends and foes of , <em>Obamacare</em>, have found one slice of , <em>common ground</em>, : The ...], [<em>Thursday, December 12th, 2013 at 4:44 p.m.</em>, ... but friends and foes of , <em>Obamacare</em>, have found one slice of , <em>common ground</em>, : The ...], [<em>Thursday, December 12th, 2013 at 4:44 p.m.</em>, ... ..."
48,"(2014, 1)",date,"[zuma press seeded, january 2014 dustinsiggins, january 6th, january 3rd felt, january 29, january 28th, january 27, january 25 2014, january 2017, january 2014 information]",zuma press seeded,january 2014 dustinsiggins,january 6th,january 3rd felt,january 29,"[Benefits Of ObamaCare: Advantage of ObamaCare - ObamaCare Facts, ObamaCare Dollar Limits - ObamaCare Facts, Obamacare Causes Millions to Lose Their Current Health Insurance, 6 Insurance Companies Created Under Obamacare Collapsed, 2016 Campaign Rhetoric Shakes Markets – By GERALD F. SEIB ..., nomad | Aisle C, Florida Courier - July 04, 2014 by Central Florida Communicators ..., The Top 5 Best...","[[<em>ObamaCare</em>, “grandfathered plans”. Starting , <em>January 1st of 2014</em>, , the following “Ten Essential Benefits” must be included under all ...], [<em>ObamaCare</em>, “grandfathered plans”. Starting , <em>January 1st of 2014</em>, , the following “Ten Essential Benefits” must be included under all ...], [<em>ObamaCare</em>, “grandfathered plans”. Starting , <em>January 1st of ...","[Media Change Definition of Abortion to Promote Obamacare ..., Benefits Of ObamaCare: Advantage of ObamaCare - ObamaCare Facts, ObamaCare Individual Mandate - ObamaCare Facts, Key Features of the Affordable Care Act By Year | HHS.gov, Obama admin. doubles down, demands nuns violate their conscience ..., January 2014 – Nebraska Family Alliance, January | 2014 | wyandotcountyrightolife | Page 10...","[[<em>Obamacare</em>, . National. , <em>Dustin Siggins</em>, and Drew Belsky , <em>Jan 8, 2014</em>, | 2:57PM Washington, DC. Share this story:, <wbr>.</wbr>], [<em>Obamacare</em>, . National. , <em>Dustin Siggins</em>, and Drew Belsky , <em>Jan 8, 2014</em>, | 2:57PM Washington, DC. Share this story:, <wbr>.</wbr>], [<em>Obamacare</em>, . National. , <em>Dustin Siggins</em>, and Drew Bel...","[Money talks: January 6th 2014: Dollars and crafts | The Economist, Monday, January 6th: Obamacare and Me in NYC: What the ..., The Note's Must-Reads for Monday, January 6, 2014 - ABC News, 06 – January – 2014 – CNN Political Ticker - CNN.com Blogs, Obamacare's Employer Mandate Takes Effect: 5 Things to Know - Free ..., ObamaCare in California - Counterpunch, Obama's Numbers (January 2014 Upda...","[[Our correspondents discuss the outlook for the global economy, the state of , <em>Obamacare</em>, and how Etsy, an online marketplace for crafts, has ...], [Our correspondents discuss the outlook for the global economy, the state of , <em>Obamacare</em>, and how Etsy, an online marketplace for crafts, has ...], [Our correspondents discuss the outlook for the global economy, the state of , ...","[January 3, 2014 This is Obamacare: New Year, New Coverage ..., Key Features of the Affordable Care Act | HHS.gov, Benefits Of ObamaCare: Advantage of ObamaCare - ObamaCare Facts, ObamaCare Stories: Real Life Stories on ObamaCare, Patient Protection and Affordable Care Act - Wikipedia, the free ..., Kaiser Health Tracking Poll: January 2014 | The Henry J. Kaiser ..., Is Obama Backing Away from...","[[Starting , <em>January</em>, 1, millions of Americans are covered by provisions of the ... for the “certain dignity” they , <em>felt</em>, from being able to afford insurance.], [Starting , <em>January</em>, 1, millions of Americans are covered by provisions of the ... for the “certain dignity” they , <em>felt</em>, from being able to afford insurance.], [Starting , <em>January</em>, 1,...","[10 things you need to know today: January 29, 2014 - The Week, The Chart That Could Sink Obamacare - Forbes, Ted Cruz's Pants on Fire claim that health care law is nation's 'biggest ..., Obamacare open enrollment: What you need to know - CNN.com, ObamaCare | Health Insurance Exchange - ObamaCare Facts, Sign up for ObamaCare, Obama says | TheHill, Ex-Microsoft Exec Brings Lists And Whiteboard ...","[[10 things you need to know today: , <em>January 29, 2014</em>, . Harold Maass ... House passes bill barring the use of , <em>ObamaCare</em>, subsidies for abortion], [10 things you need to know today: , <em>January 29, 2014</em>, . Harold Maass ... House passes bill barring the use of , <em>ObamaCare</em>, subsidies for abortion], [10 things you need to know today: , <em>January 29, 2014</..."


In [None]:
df_of_scopes_dates.pickle


In [126]:
list(df_of_scopes_dates["top1"])

[u'legal counsel',
 u'preventing pregnancy',
 u'zuckerberg spent 25',
 u'legislators country',
 u'zuma press seeded',
 u'zurich',
 u'liberals conservatives',
 u'zuma',
 u'lefties',
 u'zuckerberg jones',
 u'zurich',
 u'\xeatre',
 u'zuma',
 u'zucker',
 u'liars current political',
 u'zywicki',
 u'zones europe',
 u'zurich',
 u'zwick',
 u'zuckerberg',
 u'best deal',
 u'zug island homeland',
 u'zuma press',
 u'zoo officials say',
 u'leblond doro',
 u'\u017ei\u017eek',
 u'life federation',
 u'zuckerberg initiative',
 u'zuma press',
 u'zurich',
 u'zubik case',
 u'leaving country']

In [242]:
n = df_of_scopes_dates.shape[0]

scope_dates = sorted(list(df_of_scopes_dates["scope"]))
google_titles_for_top1 = []
google_titles_for_top2 = []
google_titles_for_top3 = []
google_titles_for_top4 = []
google_titles_for_top5 = []

google_summeries_for_top1 = []
google_summeries_for_top2 = []
google_summeries_for_top3 = []
google_summeries_for_top4 = []
google_summeries_for_top5 = []



for i in range(n):
    date = scope_dates[i]
    #print date
    
    # For token # 1
    top1_token = list(df_of_scopes_dates["top1"])[i]
    url_1 = get_google_search_url(scope_date = date, search_token = top1_token)
    (link_title_list,link_url_list,result_summary_list) = dict_of_google_tag_searches[url_1]
    google_titles_for_top1.append(link_title_list)
    google_summeries_for_top1.append(result_summary_list)

    # For token # 2
    top2_token = list(df_of_scopes_dates["top2"])[i]
    url_2 = get_google_search_url(scope_date = date, search_token = top2_token)
    (link_title_list,link_url_list,result_summary_list) = dict_of_google_tag_searches[url_2]
    google_titles_for_top2.append(link_title_list)
    google_summeries_for_top2.append(result_summary_list)
    

    # For token # 3
    top3_token = list(df_of_scopes_dates["top3"])[i]
    url_3 = get_google_search_url(scope_date = date, search_token = top3_token)
    (link_title_list,link_url_list,result_summary_list) = dict_of_google_tag_searches[url_3]
    google_titles_for_top3.append(link_title_list)
    google_summeries_for_top3.append(result_summary_list)

    # For token # 4
    top4_token = list(df_of_scopes_dates["top4"])[i]
    url_4 = get_google_search_url(scope_date = date, search_token = top4_token)
    (link_title_list,link_url_list,result_summary_list) = dict_of_google_tag_searches[url_4]
    google_titles_for_top4.append(link_title_list)
    google_summeries_for_top4.append(result_summary_list)
    
    # For token # 5
    top5_token = list(df_of_scopes_dates["top5"])[i]
    url_5 = get_google_search_url(scope_date = date, search_token = top5_token)
    (link_title_list,link_url_list,result_summary_list) = dict_of_google_tag_searches[url_5]
    google_titles_for_top5.append(link_title_list)
    google_summeries_for_top5.append(result_summary_list)
    
df_of_scopes_dates["google_titles_for_top1"] = google_titles_for_top1
df_of_scopes_dates["google_summeries_for_top1"] = google_summeries_for_top1

df_of_scopes_dates["google_titles_for_top2"] = google_titles_for_top2
df_of_scopes_dates["google_summeries_for_top2"] = google_summeries_for_top2

df_of_scopes_dates["google_titles_for_top3"] = google_titles_for_top3
df_of_scopes_dates["google_summeries_for_top3"] = google_summeries_for_top3

df_of_scopes_dates["google_titles_for_top4"] = google_titles_for_top4
df_of_scopes_dates["google_summeries_for_top4"] = google_summeries_for_top4

df_of_scopes_dates["google_titles_for_top5"] = google_titles_for_top5
df_of_scopes_dates["google_summeries_for_top5"] = google_summeries_for_top5

In [152]:
#display.max_colwidth = 100
#pd.describe_option()

In [176]:
import pickletools

In [177]:
from pandas.io.pickle import to_pickle

In [184]:
pickle.HIGHEST_PROTOCOL = 2

In [186]:
import sys

In [191]:
sys.getrecursionlimit()

3000

In [190]:
sys.setrecursionlimit(3000)  # Changing from 1000 to 30000

In [192]:
df_of_scopes_dates.to_pickle("df_of_scopes_dates.pickle")

In [None]:
dict_of_google_tag_searches

In [183]:
pickle.dump( df_of_scopes_dates, open( "df_of_scopes_dates.pickle", "wb" ) )

#google_news_df = pickle.load( open("google_news_df.pickle", "rb" ))

RuntimeError: maximum recursion depth exceeded

In [164]:
df_of_scopes_dates[["top4","google_titles_for_top4","google_summeries_for_top4"]]

Unnamed: 0,top4,google_titles_for_top4,google_summeries_for_top4
44,changes healthcare,[How Obamacare Changed Health Insurance ... Ma...,"[[Nearly 22.8 million people have gained , <em..."
24,obamacare pays,"[ObamaCare 2013 - ObamaCare Facts, ObamaCare |...","[[<em>Oct 1st, 2013</em>, and closes March 31..."
10,including 10,[Obamacare's Website Won't Be Working By Novem...,"[[<em>Obamacare</em>, is working by next Nove..."
25,milbank,"[Dana Milbank: In Obamacare speech, Obama make...","[[<em>Obamacare</em>, fails, so will this pre..."
48,january 3rd felt,"[January 3, 2014 This is Obamacare: New Year, ...","[[Starting , <em>January</em>, 1, millions of..."
23,incumbent reelection aspiring,"[CNN's GUT CHECK for February 10, 2014 – CNN P...","[[CNN's GUT CHECK | for , <em>February</em>, ..."
7,military spending pentagon,[Budget experts: Move Tricare beneficiaries to...,"[[<em>budget</em>, experts suggested the , <e..."
36,officer executive,[Healthcare players see more changes coming to...,"[[<em>Wed Apr 2, 2014 4:54pm EDT</em>, . Relat..."
41,indoctrination,[Cass Sunstein: Chinese student indoctrination...,"[[<em>indoctrination</em>, study , <em>may</e..."
53,draw attention,[Patient Protection and Affordable Care Act - ...,"[[<em>Obamacare</em>, , ... Significant reform..."


In [133]:
google_titles_for_top1

[[u'US government shutdown: House votes to delay Obamacare law | US ...',
  u'What Is Obamacare? A Legal Overview - Law and Daily Life - FindLaw',
  u"What's in a name? Lots when it comes to Obamacare/ACA - CNBC.com",
  u'Read the Law | HHS.gov',
  u'Office of the General Counsel September 3, 2013 Submitted ...',
  u'Patient Protection and Affordable Care Act - Wikipedia, the free ...',
  u'United States federal government shutdown of 2013 - Wikipedia, the ...',
  u'Top 16 myths about the health care law | PolitiFact',
  u'Remarks by the President on the Affordable Care Act | whitehouse.gov',
  u'Can They Crush Obamacare? by David Cole | The New York Review ...'],
 [u'Why Is Maternity Care Such an Issue for Obamacare Opponents ...',
  u'ObamaCare Stories: Real Life Stories on ObamaCare',
  u"Yes, men should pay for pregnancy coverage, and here's why - latimes",
  u'How Obamacare changed maternity coverage - HealthInsurance.org',
  u'The Cheapest Shot Against Obamacare, and Why Men Shou

In [193]:
pickle.dump( dict_of_google_tag_searches, open( "dict_of_google_tag_searches.pickle", "wb" ) )

#google_news_df = pickle.load( open("google_news_df.pickle", "rb" ))

In [2]:
dict_of_google_tag_searches.keys()

NameError: name 'dict_of_google_tag_searches' is not defined

In [804]:
import sys
sys.getrecursionlimit()
sys.setrecursionlimit(3000)  # Changing from 1000 to 30000
df_of_scopes_dates.to_pickle("df_of_scopes_dates.pickle")

pickle.dump( df_of_scopes_dates, open( "df_of_scopes_dates.pickle", "wb" ) )

#google_news_df = pickle.load( open("google_news_df.pickle", "rb" ))


In [534]:
re.sub('<[^<]+?>', '', x[0])

u'Kasich takes Ohio; Trump also projected to win '

In [527]:
from HTMLParser import HTMLParser
stripper = HTMLParser()

In [532]:
stripper.feed(x[1])

TypeError: coercing to Unicode: need string or buffer, Tag found

In [47]:
import requests
import requests
import json
from bs4 import BeautifulSoup
import pprint

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from newspaper import Article
import requests
import json
from bs4 import BeautifulSoup
import pprint

import time

import cPickle as pickle

<hr>

In [None]:
# from collections import defaultdict

# vectorizer = TfidfVectorizer(stop_words="english", max_df=0.95, min_df=2,ngram_range=(1,3))
# X = vectorizer.fit_transform(lectures)
# features_by_gram = defaultdict(list)

# for f, w in zip(vectorizer.get_feature_names(), vectorizer.idf_):
#     features_by_gram[len(f.split(' '))].append((f, w))
# top_n = 20
# for gram, features in features_by_gram.iteritems():
#     top_features = sorted(features, key=lambda x: x[1], reverse=True)[:top_n]
#     top_features = [f[0] for f in top_features]
#     print '{}-gram top:'.format(gram), top_features
#     print