In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from requests_futures.sessions import FuturesSession
from urllib.parse import quote
import os
import dill

%matplotlib inline

In [2]:
def get_news(ticker, from_date, to_date):
    url = ('https://newsapi.org/v2/everything?'
           'q={}&'
           'from={}&'
           'to={}&'
           'sortBy=popularity&'
           'apiKey=ea4ad2eea1c5495592584f60eee40aac'.format(ticker, from_date, to_date))
    response = requests.get(url)
    return response.json()

In [76]:
get_news('apple', '2019-04-02', '2019-04-02')

{'articles': [{'author': 'Chelsea Stone, Shep McAllister, Tercius, and Corey Foster on Kinja Deals, shared by Chelsea Stone to Lifehacker',
   'content': 'A discounted Shark ION vacuum, Amazon foam mattresses, and a Mobil 1 rebate lead off Tuesdays best deals from around the web.\r\n Bookmark Kinja Deals and follow us on Twitter to never miss a deal.\r\nIf youve been holding out for a deal on the cellular-equipped … [+34981 chars]',
   'description': 'A discounted Shark ION vacuum, Amazon foam mattresses, and a Mobil 1 rebate lead off Tuesday’s best deals from around the web. Read more...',
   'publishedAt': '2019-04-02T14:40:00Z',
   'source': {'id': None, 'name': 'Theinventory.com'},
   'title': "Tuesday's Best Deals: Bowflex Dumbbells, Refurb iPad Pros, Cuisinart Cast Iron, and More",
   'url': 'https://kinjadeals.theinventory.com/tuesdays-best-deals-bowflex-dumbbells-refurb-ipad-pro-1833741117',
   'urlToImage': 'https://i.kinja-img.com/gawker-media/image/upload/s--DhNPuu-1--/c_fil

In [85]:
def convert_date(date_str):
    date = datetime.strptime(date_str, "%Y-%m-%d")
    return date.strftime("%Y/%m/%d")

def get_page_args(i, params_str):
    return {"url":"https://www.wsj.com/search/term.html?&" + params_str,
            "params":{"page":i}}

def get_headlines(response):
    soup = BeautifulSoup(response.text, 'lxml')
    news = soup.find_all('h3', attrs={'class':'headline'})
    result = [(line.select('a')[0]['href'], line.text.strip()) for line in news]
    return result

def wsj_scrapper(keyword, start_date, end_date):
    cache_dir = 'cache/'
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    
    # read from cache file if already exist
    file_name = '_'.join(keyword.split()) + '_' + start_date + '_' + end_date + '.pkd'
    if os.path.exists(cache_dir + file_name):
        news_list = dill.load(open(cache_dir + file_name, 'rb'))
        return news_list
    
    params = {"KEYWORDS":quote(keyword), \
              'min-date':convert_date(start_date), \
              'max-date':convert_date(end_date), \
              'isAdvanced':'true', \
              'andor':'AND', \
              'sort':'date-desc', \
              'source':'wsjarticle,wsjblogs,wsjvideo,interactivemedia,sitesearch,wsjpro'}
    params_str = "&".join("%s=%s" % (k,v) for k,v in params.items())
    
    # process page and return result count
    response = requests.get(**get_page_args(1, params_str))
    # print("Making request to url... {}".format(response.url))
    soup = BeautifulSoup(response.text, 'lxml')
    page_count = int(soup.find(attrs={'class':'results-menu-wrapper bottom'})\
                     .find(attrs={'class':'results-count'}).text.split()[-1])
    news = soup.find_all('h3', attrs={'class':'headline'})
    
    # multithread all available pages
    news_list = []
    session = FuturesSession(max_workers=5)
    futures = [session.get(**get_page_args(i, params_str)) for i in range(1, page_count+1)]
    for future in futures:
        news_list.extend(get_headlines(future.result()))
    
    # cache result
    dill.dump(news_list, open(cache_dir + file_name, 'wb'))

    return news_list

In [84]:
wsj_scrapper('ANALOG DEVICES', '2018-04-02', '2018-06-10')

Making request to url... https://www.wsj.com/search/term.html?&KEYWORDS=ANALOG%20DEVICES&min-date=2018/04/02&max-date=2018/06/10&isAdvanced=true&andor=AND&sort=date-desc&source=wsjarticle,wsjblogs,wsjvideo,interactivemedia,sitesearch,wsjpro&page=1


[('https://www.wsj.com/articles/stocks-to-watch-morgan-stanley-hp-salesforce-amazon-tesla-analog-devices-michael-kors-1527686708?mod=searchresults&page=1&pos=1',
  'Stocks to Watch: Morgan Stanley, HP, Salesforce, Amazon, Tesla, Analog Devices, Michael Kors'),
 ('/articles/disabled-gamers-get-a-new-controller-from-microsoft-1526549401?mod=searchresults&page=1&pos=2',
  'Microsoft Is Giving Disabled Gamers a Better Controller'),
 ('/articles/columbus-discovers-the-amazon-1525635011?mod=searchresults&page=1&pos=3',
  'Columbus Discovers the Amazon')]

In [86]:
!pip install jupyterthemes

Collecting jupyterthemes
[?25l  Downloading https://files.pythonhosted.org/packages/8a/08/9dee6dfd7f2aad6c30282d55c8f495b4dc1e4747b4e2bdbeb80572ddf312/jupyterthemes-0.20.0-py2.py3-none-any.whl (7.0MB)
[K    100% |████████████████████████████████| 7.0MB 3.3MB/s eta 0:00:01
[?25hCollecting ipython>=5.4.1 (from jupyterthemes)
[?25l  Downloading https://files.pythonhosted.org/packages/46/b5/ca080401b8dbde51a0f4377b4e22ce02b266340a1cda389b6dea702d06d1/ipython-7.4.0-py3-none-any.whl (769kB)
[K    100% |████████████████████████████████| 778kB 7.9MB/s eta 0:00:01
[?25hCollecting notebook>=5.6.0 (from jupyterthemes)
[?25l  Downloading https://files.pythonhosted.org/packages/f6/36/89ebfffc9dd8c8dbd81c1ffb53e3d4233ee666414c143959477cb07cc5f5/notebook-5.7.8-py2.py3-none-any.whl (9.0MB)
[K    100% |████████████████████████████████| 9.0MB 2.5MB/s eta 0:00:01
[?25hCollecting lesscpy>=0.11.2 (from jupyterthemes)
[?25l  Downloading https://files.pythonhosted.org/packages/10/d0/fdd9874972e07ae

  Building wheel for prometheus-client (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/niniliu/Library/Caches/pip/wheels/4b/04/b8/3709c73e7453f311ebd46ad581b89642543213f995e2659b9e
Successfully built backcall prometheus-client
[31mnotebook 5.7.8 has requirement jupyter-core>=4.4.0, but you'll have jupyter-core 4.3.0 which is incompatible.[0m
Installing collected packages: backcall, prompt-toolkit, ipython, pyzmq, terminado, Send2Trash, jupyter-client, prometheus-client, notebook, lesscpy, jupyterthemes
  Found existing installation: prompt-toolkit 1.0.14
    Uninstalling prompt-toolkit-1.0.14:
      Successfully uninstalled prompt-toolkit-1.0.14
  Found existing installation: ipython 5.3.0
[31mCannot uninstall 'ipython'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m


In [87]:
!jt -t chesterish

/bin/sh: jt: command not found
