# In [None]:  (Jupyter notebook cell marker left over from export)
import os
import sys
import time
import datetime
import util
from bs4 import BeautifulSoup


class ReutersCrawler(object):
    """Crawl per-ticker company news pages on reuters.com into a CSV file.

    For every row of the ticker list file the crawler builds the Reuters
    company-news URL, checks whether the page shows any news at all, and then
    requests the page once per day of the requested date range, appending
    every story found to ``news_filename``.
    """

    # Reuters appends an exchange-specific suffix to the ticker in the URL,
    # e.g. https://www.reuters.com/finance/stocks/company-news/BIDU.O?date=09262017
    EXCHANGE_SUFFIX = {'AMEX': '.A', 'NASDAQ': '.O', 'NYSE': '.N'}
    # CSS classes of the <div> elements that each hold one news item
    NEWS_DIV_CLASSES = ['topStory', 'feature']

    def __init__(self, finished_tickers=None, failed_tickers=None):
        """Set up file locations and the optional skip/delay bookkeeping.

        Args:
            finished_tickers: optional iterable of tickers that ``run`` must
                skip entirely. Defaults to none.
            failed_tickers: optional mapping ``ticker -> 'LOW' | 'LOWEST'``;
                ``run`` postpones these tickers until all others are done.
                Defaults to an empty mapping.
        """
        self.ticker_list_filename = './input/apple_ticker.csv'
        self.news_filename = './input/news_reuters.csv'
        # Copy into fresh containers so the caller's objects are never shared.
        self.finished_tickers = set(finished_tickers) if finished_tickers else set()
        self.failed_tickers = dict(failed_tickers) if failed_tickers else {}

    def fetch_content(self, task, date_range):
        """Crawl all news of one ticker over ``date_range``.

        Args:
            task: sequence whose first four items are
                ``(ticker, name, exchange, market_cap)``; extra items are
                ignored.
            date_range: iterable of date strings, forwarded unchanged to
                ``fetch_within_date_range``.

        Returns:
            None. Progress is reported with ``print``. A ticker whose exchange
            is not in ``EXCHANGE_SUFFIX`` is reported and skipped.
        """
        # https://uk.reuters.com/info/disclaimer
        ticker, name, exchange, market_cap = task[:4]
        print("%s - %s - %s - %s" % (ticker, name, exchange, market_cap))

        suffix = self.EXCHANGE_SUFFIX.get(exchange)
        if suffix is None:
            # e.g. a CSV header row or an exchange Reuters has no suffix for:
            # skip this row instead of aborting the whole crawl with KeyError.
            print("%s has unsupported exchange %r, skipped" % (ticker, exchange))
            return

        url = "https://www.reuters.com/finance/stocks/company-news/" + ticker.strip() + suffix
        print(url)

        news_num = self.get_news_num_whenever(url)
        print(news_num)
        if not news_num:
            # NOTE: only logged; nothing in this class records the priority.
            print("%s has no news at all, set as LOWEST priority" % (ticker))
            return

        # this company has news, then fetch every day of the requested range
        has_content, _no_news_days = self.fetch_within_date_range(
            news_num, url, date_range, task, ticker)
        if not has_content:
            print('%s has no content within date range' % ticker)

    def get_news_num_whenever(self, url):
        """Return how many news blocks the undated company page at ``url`` shows.

        Returns 0 when ``util.get_soup_with_repeat`` yields a falsy result
        (presumably a failed download after 4 attempts; confirm in ``util``).
        """
        soup = util.get_soup_with_repeat(url, repeat_times=4)
        if soup:
            return len(soup.find_all("div", {'class': self.NEWS_DIV_CLASSES}))
        return 0

    def fetch_within_date_range(self, news_num, url, date_range, task, ticker):
        """Fetch and save the news of every day in ``date_range``.

        Every date is requested; there is no early stop after a run of empty
        days.

        Args:
            news_num: number of news blocks on the undated page. Currently
                unused; kept so existing callers keep working.
            url: company-news URL without the ``?date=`` query.
            date_range: iterable of ``YYYYMMDD`` strings (e.g. ``20151231``).
            task: ticker row, forwarded to ``parse_and_save_news``.
            ticker: ticker symbol written to the output file.

        Returns:
            ``(has_content, no_news_days)``: whether any day yielded news, and
            the list of timestamps for which nothing was saved.
        """
        has_content = False
        no_news_days = []
        for timestamp in date_range:
            print('trying '+timestamp)
            # change 20151231 to 12312015 to match reuters format
            new_time = timestamp[4:] + timestamp[:4]
            soup = util.get_soup_with_repeat(url + "?date=" + new_time)
            if soup and self.parse_and_save_news(soup, task, ticker, timestamp):
                has_content = True
            else:
                no_news_days.append(timestamp)

        return has_content, no_news_days

    def parse_and_save_news(self, soup, task, ticker, timestamp):
        """Append every news block found in ``soup`` to ``news_filename``.

        Each row is ``ticker,name,timestamp,title,body,news_type``. Commas and
        newlines inside title/body are replaced by spaces so that the naive
        comma-joined row stays on one line with a fixed column count.

        Args:
            soup: parsed page offering BeautifulSoup's ``find_all``.
            task: ticker row; ``task[1]`` is written as the company name.
            ticker: ticker symbol.
            timestamp: date string of the page being parsed.

        Returns:
            False when the page has no news blocks, True otherwise.
        """
        content = soup.find_all("div", {'class': self.NEWS_DIV_CLASSES})
        if not content:
            return False

        # Only the first block can be the top story, and only if the page has one.
        has_top_story = bool(soup.find_all("div", class_="topStory"))

        # UTF-8 matches how the ticker list is read and avoids losing rows to
        # encoding errors under a narrower platform default encoding.
        with open(self.news_filename, 'a+', newline='\n', encoding='utf-8') as fout:
            for i, item in enumerate(content):
                if item.h2 is None or item.p is None:
                    # Block without a headline or summary: nothing to save.
                    continue
                title = item.h2.get_text().replace(",", " ").replace("\n", " ")
                body = item.p.get_text().replace(",", " ").replace("\n", " ")
                news_type = 'topStory' if i == 0 and has_top_story else 'normal'
                row = ','.join([ticker, task[1], timestamp, title, body, news_type]) + '\n'

                try:
                    fout.write(row)
                except (UnicodeError, OSError) as err:
                    # Best effort: report and keep going with the next story.
                    print("could not save news of %s on %s: %s" % (ticker, timestamp, err))
        return True

    def run(self, numdays=1000, start_date='12/31/1999'):
        """Start crawler back to numdays.

        Reads ``ticker_list_filename`` line by line (comma-separated, at least
        four fields: ticker, name, exchange, market cap) and crawls each
        ticker. Tickers in ``self.finished_tickers`` are skipped; tickers in
        ``self.failed_tickers`` are postponed and crawled afterwards, 'LOW'
        before 'LOWEST'.

        Args:
            numdays: number of days, passed to ``util.generate_past_n_days``.
            start_date: date string, passed to ``util.generate_past_n_days``.
        """
        date_range = util.generate_past_n_days(numdays, start_date)  # look back on the past X days

        # store low-priority task and run later
        delayed_tasks = {'LOWEST': set(), 'LOW': set()}
        with open(self.ticker_list_filename, encoding="utf-8") as ticker_list:
            for line in ticker_list:  # iterate all possible tickers
                task = tuple(line.strip().split(','))
                if len(task) < 4:
                    continue
                ticker = task[0]
                if ticker in self.finished_tickers:
                    continue
                if ticker in self.failed_tickers:
                    priority = self.failed_tickers[ticker]
                    delayed_tasks[priority].add(task)
                    continue
                self.fetch_content(task, date_range)

        # run the postponed tasks: low priority first, lowest priority last
        for priority in ('LOW', 'LOWEST'):
            for task in delayed_tasks[priority]:
                self.fetch_content(task, date_range)


def main():
    """Build a ReutersCrawler and run it with numdays=6847 and start_date='07/12/2017'."""
    crawler = ReutersCrawler()
    crawler.run(numdays=6847, start_date='07/12/2017')

# Start the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()