In [1]:
from newsapi import NewsApiClient

with open("news-api_token.txt", "r") as f:
    api_key = f.readline().strip()

In [111]:
class NewsAPI:
    def __init__(self, token, sources = None):
        """
        token   -- klucz uwierzytelniający
        sources -- opcjonalna lista id źródeł zgodna z konwencją News API
        """
        self.api = NewsApiClient(api_key = token)
        
        if sources is None:
            self.set_sources_all()
        else:
            assert isinstance(sources, list)
            self.sources = sources
    
    def set_sources_all(self):
        """ Metoda ustawia pole `sources` na wszystkie dostępne źródła angielskojęzyczne. """
        
        self.sources = [src["id"] for src in self.api.get_sources(language="en")["sources"]]
        return self
    
    def get_all_articles(self, keyword, **kwargs):
        """ keyword -- wyszukiwana fraza """
        
        assert isinstance(self.sources, list)
        
        i = 0
        n = len(self.sources)
        articles = []
        
        while 20*i < n:
            curr_response = self.api.get_everything(
                q       = keyword,
                sources = ",".join(self.sources[i:min(20*(i+1)-1, n)]),  # Przetwarzamy 20 na raz, bo takie ograniczenie ma News API
                **kwargs
            )
            articles += curr_response.get("articles")
            i += 1
        
        self.articles = articles
        
        return self
    
    def sort_urls_by_source(self):
        """ Metoda upakowuje adresy artykułów do słownika według id źródeł. """
        
        assert hasattr(self, "articles")
        urls_by_source = {}
        
        for art in self.articles:
            curr_source_id = art.get("source").get("id")
            
            # Dodanie klucza, jeśli go nie ma
            if curr_source_id not in d.keys():
                urls_by_source[curr_source_id] = []
            
            urls_by_source[curr_source_id].append(art.get("url"))
        
        self.urls = urls_by_source
        
        return self
    
    def scrape_urls(self):
        assert hasattr(self, "urls")
        # Scraper, który ma słownik {id-stronki: <css sselector treści artykułu>} i zwraca jego sformatowaną treść
        pass
    
    def process(self, keyword, **kwargs):
        self.get_all_articles(keyword, **kwargs)
        self.sort_urls_by_source()
        #self.scrape_urls()

In [112]:
api = NewsAPI(api_key)

In [114]:
api.get_all_articles("tesla", from_param="2019-03-10")

<__main__.NewsAPI at 0x77f05ba400>

In [115]:
api.sort_urls_by_source()

<__main__.NewsAPI at 0x77f05ba400>

In [116]:
api.urls

{'business-insider': ['https://www.businessinsider.com/teslas-closing-all-its-stores-and-sell-online-might-be-brilliant-2019-3'],
 'bloomberg': ['https://www.bloomberg.com/tosv2.html?vid=&uuid=e504ece0-43ce-11e9-bd99-1f78ee3aeb83&url=L25ld3MvYXJ0aWNsZXMvMjAxOS0wMy0xMS90ZXNsYS10by1yYWlzZS1wcmljZXMtY2xvc2UtZmV3ZXItc3RvcmVzLXRoYW4tZmxhZ2dlZA=='],
 'associated-press': ['https://www.apnews.com/a880afcbad214c6e8d07300dbd36bb37'],
 'bbc-news': ['https://www.bbc.co.uk/news/business-47521940'],
 'breitbart-news': ['https://www.breitbart.com/tech/2019/03/11/teslas-former-security-ops-manager-provides-whistleblower-info-to-sec/'],
 'cnn': ['https://www.cnn.com/business/live-news/stock-market-news-today-031119/index.html'],
 'cnbc': ['https://www.cnbc.com/2019/03/11/teslas-biggest-problem-is-customer-service-new-bernstein-survey.html'],
 'google-news': ['http://feedproxy.google.com/~r/hypebeast/feed/~3/LBRtL8NQcAE/'],
 'engadget': ['https://www.engadget.com/2019/03/11/tesla-decides-to-keep-more-st