-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapeNews.py
101 lines (97 loc) · 3.7 KB
/
scrapeNews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python
'''
The script to extract current news headlines
'''
import newspaper
import re
import secrets
import requests
import os
import sys
# Switch to the parent of the current working directory at import time: the
# relative 'data/...' paths used by the functions in this file are resolved
# against the directory selected here.
# NOTE(review): this depends on where the script is launched from
# (presumably its own directory) -- confirm.
os.chdir("../")
### Scrape article URLs from a news site through the newspaper library and
### print the ones that mention bitcoin.
### TODO: needs to be redone.
def bitcoin_news(news_url):
    """Print every article URL found on ``news_url`` that mentions bitcoin.

    The site is loaded with ``newspaper.build`` (``memoize_articles=False``),
    the number of collected articles is reported, and each article URL that
    matches ``.+bitcoin.+`` is printed on its own line.

    news_url: address of the news site to scan.
    """
    # Only referenced by the disabled snippet kept right after this function.
    url_file = 'data/urls.txt'

    source = newspaper.build(news_url, memoize_articles=False)
    print('\n====> Grab %d articles from %s' % (source.size(), news_url))

    # A stricter variant that also requires a /Y/M/D/ path segment was tried:
    #   r'.+\/(\d+?)\/(\d+?)\/(\d+?)\/.+bitcoin.+'
    bitcoin_url = re.compile(r'.+bitcoin.+')
    for article in source.articles:
        hit = bitcoin_url.search(article.url)
        if hit:
            print(hit.group(0))
'''
## find all dates
search_dates = (re.search(r'\/(\d+?)\/(\d+?)\/(\d+?)\/', url) for url in article_urls)
dates = ((match.group(1), match.group(2), match.group(3)) for match in search_dates if match)
## find only news with bitcoin in headline
search_news = (re.search(r'.+bitcoin.+', url) for url in article_urls)
bitcoin_news = (match.group(0) for match in search_news if match)
for b in bitcoin_news:
print b
with open(url_file, 'a+') as f:
for a_url in article_urls:
f.write(a_url + '\n')
'''
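### Scrape news from the NYT through its standard Article Search API.
### topic: the keyword(s) to search headlines for. Several keywords may be
### given as one comma-separated string; it is split on commas so that each
### keyword becomes its own quoted term in the query.
### filename: base name of the CSV file written under data/.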
def scrapeNYT(topic, filename, begin_date='20120101', end_date='20151231'):
    """Save NYT headlines matching ``topic`` to ``data/<filename>.csv``.

    Queries the NYT Article Search API for articles whose headline matches
    the given keywords, sorted oldest first, and writes one
    ``pub_date,"headline"`` row per article. Paging stops as soon as all
    reported hits are written, a page request fails, or a page comes back
    empty, so the function always terminates.

    topic: one keyword, or several keywords separated by commas. Each
        keyword is sent as its own quoted term in a Lucene ``headline`` filter.
    filename: base name (without extension) of the CSV written under ``data/``.
    begin_date: first publication date to include, as ``YYYYMMDD``.
    end_date: last publication date to include, as ``YYYYMMDD``.

    Returns None. Problems with the HTTP requests are reported on stdout.
    """
    import io  # local import so this function does not rely on file-level edits

    # Drop the whitespace left around keywords by "a, b" style input.
    keywords = [word.strip() for word in topic.split(",") if word.strip()]
    data_file = 'data/%s.csv' % filename
    # https keeps the API key out of clear-text traffic.
    nyt_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    params = {
        'begin_date': begin_date,
        'end_date': end_date,
        # filter based on Lucene syntax
        'fq': 'headline:("%s")' % '" "'.join(keywords),
        'api-key': secrets.NYT_API_KEY,
        'sort': 'oldest',
    }

    # The first request is only used to learn how many articles match.
    resp = requests.get(nyt_url, params=params)
    if resp.status_code != 200:
        print("====> NYT request failed with HTTP status %s" % resp.status_code)
        return
    remaining = resp.json()['response']['meta']['hits']
    print("====> %s articles from %s to %s" % (remaining, begin_date, end_date))

    # Text is encoded once, at the file boundary, instead of per field.
    with io.open(data_file, 'w', encoding='utf8') as f:
        page = 0
        while remaining > 0:
            params['page'] = page
            resp_page = requests.get(nyt_url, params=params)
            if resp_page.status_code != 200:
                # Retrying the same page forever would never make progress.
                print("====> stopped at page %s: HTTP status %s"
                      % (page, resp_page.status_code))
                break
            docs = resp_page.json()['response']['docs']
            if not docs:
                # Fewer documents served than the reported hit count (e.g.
                # the API stops serving deeper pages); nothing left to fetch.
                break
            for doc in docs:
                # Double embedded quotes so the quoted CSV field stays valid.
                headline = doc['headline']['main'].replace('"', '""')
                f.write(u'%s,"%s"\n' % (doc['pub_date'], headline))
            page += 1
            remaining -= len(docs)
if __name__ == '__main__':
    # Command-line entry point: scrapeNews.py method_id input_param file_name
    # Only method 1 (NYT) is implemented, so any other invocation gets the
    # usage text.
    if len(sys.argv) != 4 or sys.argv[1] != "1":
        print("")
        print("============= script to scrape news from different sources =======")
        print("=============")
        print("usage: python scrapeNews.py method_id input_param file_name")
        print("method_id: 0 - google news (not working), 1 - NYT")
        print("input_param: NYT - search keyword")
        print("file_name: the name of the output file")
        print("=============")
        # sys.exit is used because the bare exit() helper is injected by the
        # site module and is not guaranteed to exist; status 2 signals a
        # command-line usage error to the calling shell.
        sys.exit(2)
    method_id = int(sys.argv[1])
    input_param = sys.argv[2]
    fname = sys.argv[3]
    if method_id == 1:
        scrapeNYT(input_param, fname)
'''
# scrape every popular news source
for src in newspaper.popular_urls():
bitcoin_news(src)
'''