In [1]:
import os
from datetime import datetime
import requests
import time
import pandas as pd
import spacy
# Load spaCy's small English pipeline once; the tokenization/lemmatization cell streams text through it via nlp.pipe.
nlp = spacy.load('en_core_web_sm')
from collections import Counter

In [2]:
# Load the pickled article frame and discard every column this notebook does not use.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
master_df = pd.read_pickle("vader_picke_file.pkl").drop(
    columns=[
        'web_url',
        'snippet',
        '_id',
        'article',
        'date',
        'VADER snippet neg',
        'VADER snippet neu',
        'VADER snippet pos',
        'VADER snippet compound',
    ]
)
master_df

Unnamed: 0,abstract,pub_date
0,"It was a year full of big environmental news, ...",2010-12-31T23:00:27+0000
1,The first federal regulations on climate-alter...,2010-12-31T01:47:00+0000
2,About $11 billion in oil and natural gas trade...,2010-12-30T20:21:30+0000
3,"Recent years have been humbling for oil bulls,...",2010-12-30T02:34:46+0000
4,Crude oil traded over $90 for a fifth consecut...,2010-12-29T14:06:12+0000
...,...,...
1515,Weakened demand for semiconductors is hurting ...,2019-02-04T11:36:40+0000
1516,"Innovation, investment and inviting geology ha...",2019-02-03T23:17:02+0000
1517,On one side of a barricade in remote British C...,2019-01-27T23:45:27+0000
1518,The United States’ recognition of Venezuela’s ...,2019-01-25T00:29:52+0000


In [3]:
# Parse the publication timestamp strings into pandas datetimes so the .dt accessor can be used on them
parsed_pub_dates = pd.to_datetime(master_df['pub_date'])
master_df['pub_date'] = parsed_pub_dates

In [4]:
# NOTE(review): displays the current wall-clock time; the value is not assigned or used by the surrounding cells — looks like a leftover scratch cell, confirm before removing.
datetime.now()

datetime.datetime(2020, 2, 9, 21, 31, 28, 382925)

In [5]:
# Reduce every timestamp to a 'YYYY-MM-DD' string so that rows can later be matched by calendar day
day_strings = master_df['pub_date'].dt.strftime('%Y-%m-%d')
master_df['pub_date'] = day_strings

In [6]:
# Order the articles chronologically, oldest first (ascending is the default), modifying the frame in place
master_df.sort_values(by='pub_date', inplace=True)
master_df

Unnamed: 0,abstract,pub_date
229,The two countries failed to renew an agreement...,2010-01-04
227,PetroBakken Energy said that it had paid $262 ...,2010-01-05
226,"The secretary, Ken Salazar, said the goal of t...",2010-01-06
225,"If adopted, the standard will impose large cos...",2010-01-07
223,Spectra Energy has been quietly investigating ...,2010-01-13
...,...,...
1446,It remains to be seen whether the reduction in...,2019-12-05
1445,The oil giant’s shares reached prices on the R...,2019-12-11
1444,Immense amounts of methane are escaping from o...,2019-12-12
1441,Twelve states and the District of Columbia rel...,2019-12-17


In [7]:
# Merge all abstracts published on the same day into one text per date,
# producing a list of (combined_text, date) tuples.
#
# A single groupby pass replaces the previous per-date boolean mask, which
# rescanned the whole frame once for every unique date.
# sort=False keeps the dates in first-appearance order (the order .unique()
# produced), and rows keep their frame order inside each date group.
#
# NOTE: rows whose pub_date is missing are skipped by groupby; the previous
# loop emitted an empty-text entry for a NaN date because NaN == NaN matches no rows.
all_news_per_day = [
    (" ".join(day_abstracts), day)
    for day, day_abstracts in master_df.groupby('pub_date', sort=False)['abstract']
]

In [8]:
# Build the per-day frame: one row per date holding that day's concatenated abstracts.
# The column name 'combined_absract' (sic) is kept as-is because the tokenization cell reads it by this name.
combined_news_day = pd.DataFrame(
    data=all_news_per_day,
    columns=['combined_absract', 'Date'],
)
combined_news_day

Unnamed: 0,combined_absract,Date
0,The two countries failed to renew an agreement...,2010-01-04
1,PetroBakken Energy said that it had paid $262 ...,2010-01-05
2,"The secretary, Ken Salazar, said the goal of t...",2010-01-06
3,"If adopted, the standard will impose large cos...",2010-01-07
4,Spectra Energy has been quietly investigating ...,2010-01-13
...,...,...
965,It remains to be seen whether the reduction in...,2019-12-05
966,The oil giant’s shares reached prices on the R...,2019-12-11
967,Immense amounts of methane are escaping from o...,2019-12-12
968,Twelve states and the District of Columbia rel...,2019-12-17


In [9]:
# Tokenization and Lemmatization
# Stream each day's combined text through the spaCy pipeline and collect, per row:
#   - tokens: the text of every token
#   - lemma:  the lemma of every token that is neither a stop word nor punctuation
tokens = []
lemma = []
# astype(str) is the portable spelling of the former astype('unicode'); both yield plain Python strings.
day_texts = combined_news_day['combined_absract'].astype(str).values
# n_threads is intentionally no longer passed: spaCy deprecated it in v2.1 (it has no
# effect there) and newer releases no longer accept it.
for doc in nlp.pipe(day_texts, batch_size=50):
    # NOTE(review): Doc.is_parsed is the spaCy 2.x API; spaCy v3 replaces it with
    # doc.has_annotation("DEP") — update this check when upgrading spaCy.
    if doc.is_parsed:
        tokens.append([tok.text for tok in doc])
        lemma.append([tok.lemma_ for tok in doc
                      if not tok.is_stop
                      and not tok.is_punct])
    else:
        # Keep both lists aligned with the DataFrame rows: add a placeholder
        # when a document was not parsed.
        tokens.append(None)
        lemma.append(None)
combined_news_day['abstract_tokens'] = tokens
combined_news_day['abstract_lemma'] = lemma

In [10]:
# Inspect the frame now that the 'abstract_tokens' and 'abstract_lemma' columns have been added
combined_news_day

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma
0,The two countries failed to renew an agreement...,2010-01-04,"[The, two, countries, failed, to, renew, an, a...","[country, fail, renew, agreement, crude, oil, ..."
1,PetroBakken Energy said that it had paid $262 ...,2010-01-05,"[PetroBakken, Energy, said, that, it, had, pai...","[PetroBakken, Energy, say, pay, $, 262, millio..."
2,"The secretary, Ken Salazar, said the goal of t...",2010-01-06,"[The, secretary, ,, Ken, Salazar, ,, said, the...","[secretary, Ken, Salazar, say, goal, new, revi..."
3,"If adopted, the standard will impose large cos...",2010-01-07,"[If, adopted, ,, the, standard, will, impose, ...","[adopt, standard, impose, large, cost, local, ..."
4,Spectra Energy has been quietly investigating ...,2010-01-13,"[Spectra, Energy, has, been, quietly, investig...","[Spectra, Energy, quietly, investigate, approa..."
...,...,...,...,...
965,It remains to be seen whether the reduction in...,2019-12-05,"[It, remains, to, be, seen, whether, the, redu...","[remain, see, reduction, output, impact, price..."
966,The oil giant’s shares reached prices on the R...,2019-12-11,"[The, oil, giant, ’s, shares, reached, prices,...","[oil, giant, share, reach, price, Riyadh, exch..."
967,Immense amounts of methane are escaping from o...,2019-12-12,"[Immense, amounts, of, methane, are, escaping,...","[immense, amount, methane, escape, oil, gas, s..."
968,Twelve states and the District of Columbia rel...,2019-12-17,"[Twelve, states, and, the, District, of, Colum...","[state, District, Columbia, release, draft, pl..."


In [11]:
# Tally lemma frequencies for each day, keeping the ten most common (lemma, count) pairs per day
counts = [
    Counter(day_lemmas).most_common(10)
    for day_lemmas in combined_news_day['abstract_lemma']
]

In [12]:
# Store each day's list of (lemma, count) pairs as a new 'counts' column
combined_news_day['counts'] = counts

In [13]:
# Inspect the frame with the newly added 'counts' column
combined_news_day

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts
0,The two countries failed to renew an agreement...,2010-01-04,"[The, two, countries, failed, to, renew, an, a...","[country, fail, renew, agreement, crude, oil, ...","[(country, 1), (fail, 1), (renew, 1), (agreeme..."
1,PetroBakken Energy said that it had paid $262 ...,2010-01-05,"[PetroBakken, Energy, said, that, it, had, pai...","[PetroBakken, Energy, say, pay, $, 262, millio...","[(Energy, 2), (PetroBakken, 1), (say, 1), (pay..."
2,"The secretary, Ken Salazar, said the goal of t...",2010-01-06,"[The, secretary, ,, Ken, Salazar, ,, said, the...","[secretary, Ken, Salazar, say, goal, new, revi...","[(secretary, 1), (Ken, 1), (Salazar, 1), (say,..."
3,"If adopted, the standard will impose large cos...",2010-01-07,"[If, adopted, ,, the, standard, will, impose, ...","[adopt, standard, impose, large, cost, local, ...","[(adopt, 1), (standard, 1), (impose, 1), (larg..."
4,Spectra Energy has been quietly investigating ...,2010-01-13,"[Spectra, Energy, has, been, quietly, investig...","[Spectra, Energy, quietly, investigate, approa...","[(Spectra, 1), (Energy, 1), (quietly, 1), (inv..."
...,...,...,...,...,...
965,It remains to be seen whether the reduction in...,2019-12-05,"[It, remains, to, be, seen, whether, the, redu...","[remain, see, reduction, output, impact, price...","[(remain, 1), (see, 1), (reduction, 1), (outpu..."
966,The oil giant’s shares reached prices on the R...,2019-12-11,"[The, oil, giant, ’s, shares, reached, prices,...","[oil, giant, share, reach, price, Riyadh, exch...","[(oil, 1), (giant, 1), (share, 1), (reach, 1),..."
967,Immense amounts of methane are escaping from o...,2019-12-12,"[Immense, amounts, of, methane, are, escaping,...","[immense, amount, methane, escape, oil, gas, s...","[(immense, 1), (amount, 1), (methane, 1), (esc..."
968,Twelve states and the District of Columbia rel...,2019-12-17,"[Twelve, states, and, the, District, of, Colum...","[state, District, Columbia, release, draft, pl...","[(state, 1), (District, 1), (Columbia, 1), (re..."


In [14]:
# Add a 'top_word' column to search on: the most frequent lemma of each day,
# i.e. the first entry of that day's 'counts' list.
# A day can have an empty counts list (its document was not parsed, or its text held
# only stop words/punctuation); such days get None instead of raising IndexError.
combined_news_day['top_word'] = combined_news_day['counts'].apply(
    lambda day_counts: day_counts[0][0] if day_counts else None
)

In [15]:
# Show the top word selected for each day
combined_news_day['top_word']

0        country
1         Energy
2      secretary
3          adopt
4        Spectra
         ...    
965       remain
966          oil
967      immense
968        state
969         size
Name: top_word, Length: 970, dtype: object

In [16]:
# Inspect the full frame including the 'top_word' column
combined_news_day

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
0,The two countries failed to renew an agreement...,2010-01-04,"[The, two, countries, failed, to, renew, an, a...","[country, fail, renew, agreement, crude, oil, ...","[(country, 1), (fail, 1), (renew, 1), (agreeme...",country
1,PetroBakken Energy said that it had paid $262 ...,2010-01-05,"[PetroBakken, Energy, said, that, it, had, pai...","[PetroBakken, Energy, say, pay, $, 262, millio...","[(Energy, 2), (PetroBakken, 1), (say, 1), (pay...",Energy
2,"The secretary, Ken Salazar, said the goal of t...",2010-01-06,"[The, secretary, ,, Ken, Salazar, ,, said, the...","[secretary, Ken, Salazar, say, goal, new, revi...","[(secretary, 1), (Ken, 1), (Salazar, 1), (say,...",secretary
3,"If adopted, the standard will impose large cos...",2010-01-07,"[If, adopted, ,, the, standard, will, impose, ...","[adopt, standard, impose, large, cost, local, ...","[(adopt, 1), (standard, 1), (impose, 1), (larg...",adopt
4,Spectra Energy has been quietly investigating ...,2010-01-13,"[Spectra, Energy, has, been, quietly, investig...","[Spectra, Energy, quietly, investigate, approa...","[(Spectra, 1), (Energy, 1), (quietly, 1), (inv...",Spectra
...,...,...,...,...,...,...
965,It remains to be seen whether the reduction in...,2019-12-05,"[It, remains, to, be, seen, whether, the, redu...","[remain, see, reduction, output, impact, price...","[(remain, 1), (see, 1), (reduction, 1), (outpu...",remain
966,The oil giant’s shares reached prices on the R...,2019-12-11,"[The, oil, giant, ’s, shares, reached, prices,...","[oil, giant, share, reach, price, Riyadh, exch...","[(oil, 1), (giant, 1), (share, 1), (reach, 1),...",oil
967,Immense amounts of methane are escaping from o...,2019-12-12,"[Immense, amounts, of, methane, are, escaping,...","[immense, amount, methane, escape, oil, gas, s...","[(immense, 1), (amount, 1), (methane, 1), (esc...",immense
968,Twelve states and the District of Columbia rel...,2019-12-17,"[Twelve, states, and, the, District, of, Colum...","[state, District, Columbia, release, draft, pl...","[(state, 1), (District, 1), (Columbia, 1), (re...",state


In [17]:
# Select the days whose most frequent lemma is exactly 'gas' (case-sensitive match)
combined_news_day[combined_news_day['top_word'].eq('gas')]

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
106,Environmental concerns and other issues could ...,2010-10-13,"[Environmental, concerns, and, other, issues, ...","[environmental, concern, issue, complicate, ex...","[(gas, 2), (environmental, 1), (concern, 1), (...",gas
178,Natural gas is cheaper than oil. It’s cleaner....,2011-04-12,"[Natural, gas, is, cheaper, than, oil, ., It, ...","[natural, gas, cheap, oil, clean, Congress, ac...","[(gas, 2), (oil, 2), (natural, 1), (cheap, 1),...",gas
250,Americans have signed millions of leases allow...,2011-12-02,"[Americans, have, signed, millions, of, leases...","[Americans, sign, million, lease, allow, oil, ...","[(gas, 2), (Americans, 1), (sign, 1), (million...",gas
280,The Thai oil and gas company PTT Exploration a...,2012-02-24,"[The, Thai, oil, and, gas, company, PTT, Explo...","[Thai, oil, gas, company, PTT, Exploration, Pr...","[(gas, 2), ($, 2), (billion, 2), (exploration,...",gas
297,"Are gas prices soaring, or collapsing? It depe...",2012-03-30,"[Are, gas, prices, soaring, ,, or, collapsing,...","[gas, price, soar, collapse, depend, gas, talk...","[(gas, 3), (price, 3), (soar, 1), (collapse, 1...",gas
336,"Gas and coal power won't disappear soon, but w...",2012-08-16,"[Gas, and, coal, power, wo, n't, disappear, so...","[gas, coal, power, will, disappear, soon, carb...","[(gas, 1), (coal, 1), (power, 1), (will, 1), (...",gas
343,The oil and natural gas boom might distract us...,2012-09-05,"[The, oil, and, natural, gas, boom, might, dis...","[oil, natural, gas, boom, distract, urgent, ch...","[(gas, 2), (oil, 1), (natural, 1), (boom, 1), ...",gas
351,Excelerate Energy is positioning itself to cap...,2012-10-23,"[Excelerate, Energy, is, positioning, itself, ...","[Excelerate, Energy, position, capitalize, ame...","[(gas, 2), (international, 2), (energy, 2), (E...",gas
354,Exxon Mobil and Royal Dutch Shell reported lac...,2012-11-01,"[Exxon, Mobil, and, Royal, Dutch, Shell, repor...","[Exxon, Mobil, Royal, Dutch, Shell, report, la...","[(gas, 2), (Island, 2), (Exxon, 1), (Mobil, 1)...",gas
379,Oil and gas made Norway one of the richest cou...,2013-01-28,"[Oil, and, gas, made, Norway, one, of, the, ri...","[oil, gas, Norway, rich, country, siege, kill,...","[(gas, 2), (Hess, 2), (plan, 2), (oil, 1), (No...",gas


In [18]:
# Select the days whose most frequent lemma is exactly 'Energy' (case-sensitive, so lowercase 'energy' is not matched)
combined_news_day[combined_news_day['top_word'].eq('Energy')]

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
1,PetroBakken Energy said that it had paid $262 ...,2010-01-05,"[PetroBakken, Energy, said, that, it, had, pai...","[PetroBakken, Energy, say, pay, $, 262, millio...","[(Energy, 2), (PetroBakken, 1), (say, 1), (pay...",Energy
190,Energy Secretary Chu names a panel to study th...,2011-05-06,"[Energy, Secretary, Chu, names, a, panel, to, ...","[Energy, Secretary, Chu, name, panel, study, c...","[(Energy, 1), (Secretary, 1), (Chu, 1), (name,...",Energy
260,The French oil giant Total is acquiring a big ...,2012-01-03,"[The, French, oil, giant, Total, is, acquiring...","[french, oil, giant, Total, acquire, big, piec...","[(Energy, 2), (shale, 2), (operation, 2), ($, ...",Energy


In [21]:
# Select the days whose most frequent lemma is exactly 'oil' (case-sensitive match)
combined_news_day[combined_news_day['top_word'].eq('oil')]

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
11,Hoping to increase Venezuela’s flagging oil pr...,2010-02-11,"[Hoping, to, increase, Venezuela, ’s, flagging...","[hope, increase, Venezuela, flag, oil, product...","[(oil, 2), (hope, 1), (increase, 1), (Venezuel...",oil
14,The oil and gas driller is looking to dispose ...,2010-02-17,"[The, oil, and, gas, driller, is, looking, to,...","[oil, gas, driller, look, dispose, $, 2, billi...","[(oil, 1), (gas, 1), (driller, 1), (look, 1), ...",oil
15,Exxon Mobil replaced 100 percent of 2009 oil p...,2010-02-18,"[Exxon, Mobil, replaced, 100, percent, of, 200...","[Exxon, Mobil, replace, 100, percent, 2009, oi...","[(oil, 2), (Exxon, 1), (Mobil, 1), (replace, 1...",oil
34,The Norwegian energy company says it is workin...,2010-04-20,"[The, Norwegian, energy, company, says, it, is...","[norwegian, energy, company, say, work, new, t...","[(oil, 2), (norwegian, 1), (energy, 1), (compa...",oil
38,As nerve-racking as the recent oil spill in th...,2010-04-29,"[As, nerve, -, racking, as, the, recent, oil, ...","[nerve, rack, recent, oil, spill, gulf, reason...","[(oil, 2), (nerve, 1), (rack, 1), (recent, 1),...",oil
...,...,...,...,...,...,...
904,"For decades, opposition to drilling has left t...",2018-12-03,"[For, decades, ,, opposition, to, drilling, ha...","[decade, opposition, drilling, leave, Arctic, ...","[(oil, 2), (decade, 1), (opposition, 1), (dril...",oil
907,The Trump administration has touted increased ...,2018-12-07,"[The, Trump, administration, has, touted, incr...","[Trump, administration, tout, increase, energy...","[(oil, 2), (Trump, 1), (administration, 1), (t...",oil
916,"Innovation, investment and inviting geology ha...",2019-02-03,"[Innovation, ,, investment, and, inviting, geo...","[Innovation, investment, invite, geology, give...","[(oil, 2), (Innovation, 1), (investment, 1), (...",oil
942,"The skinny waterway, the gateway for a third o...",2019-06-13,"[The, skinny, waterway, ,, the, gateway, for, ...","[skinny, waterway, gateway, crude, oil, tanker...","[(oil, 2), (skinny, 1), (waterway, 1), (gatewa...",oil


In [22]:
# Select the days whose most frequent lemma is exactly 'Trump' (case-sensitive match)
combined_news_day[combined_news_day['top_word'].eq('Trump')]

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
902,The Trump administration’s policy of encouragi...,2018-11-27,"[The, Trump, administration, ’s, policy, of, e...","[Trump, administration, policy, encourage, oil...","[(Trump, 1), (administration, 1), (policy, 1),...",Trump


In [23]:
# Select the days whose most frequent lemma is exactly 'Shell' (case-sensitive match)
combined_news_day[combined_news_day['top_word'].eq('Shell')]

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
214,Shell was under pressure after it emerged that...,2011-08-16,"[Shell, was, under, pressure, after, it, emerg...","[Shell, pressure, emerge, company, wait, day, ...","[(Shell, 1), (pressure, 1), (emerge, 1), (comp...",Shell
215,Shell’s plan to drill four exploratory oil wel...,2011-08-18,"[Shell, ’s, plan, to, drill, four, exploratory...","[Shell, plan, drill, exploratory, oil, well, A...","[(Shell, 1), (plan, 1), (drill, 1), (explorato...",Shell
313,The oil and natural gas exploration company Co...,2012-05-23,"[The, oil, and, natural, gas, exploration, com...","[oil, natural, gas, exploration, company, Cove...","[(Shell, 2), (oil, 1), (natural, 1), (gas, 1),...",Shell
619,"Shell is dismantling its North Sea project, a ...",2015-02-18,"[Shell, is, dismantling, its, North, Sea, proj...","[Shell, dismantle, North, Sea, project, step, ...","[(Shell, 1), (dismantle, 1), (North, 1), (Sea,...",Shell


In [25]:
# Select the days whose most frequent lemma is exactly 'Saudi' (case-sensitive match)
combined_news_day[combined_news_day['top_word'].eq('Saudi')]

Unnamed: 0,combined_absract,Date,abstract_tokens,abstract_lemma,counts,top_word
716,"Saudi Arabia, Russia, Venezuela and Qatar agre...",2016-02-16,"[Saudi, Arabia, ,, Russia, ,, Venezuela, and, ...","[Saudi, Arabia, Russia, Venezuela, Qatar, agre...","[(Saudi, 1), (Arabia, 1), (Russia, 1), (Venezu...",Saudi
866,Saudi Arabia’s national oil company has served...,2018-05-08,"[Saudi, Arabia, ’s, national, oil, company, ha...","[Saudi, Arabia, national, oil, company, serve,...","[(Saudi, 1), (Arabia, 1), (national, 1), (oil,...",Saudi
