import logging
import random
import re
import time

import requests
import urllib3
from bs4 import BeautifulSoup

from config import Config

urllib3.disable_warnings()
logging.basicConfig(level=logging.WARNING)

HTTP_ERROR_MSG = 'HTTP error {res.status_code} - {res.reason}'

config = Config()
line_bot_api = config.line_bot_api
client_id = config.client_id
client_secret = config.client_secret
album_id = config.album_id
API_Get_Image = config.API_Get_Image
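

# Base crawler: shares a single requests session (so cookies persist across
# requests) and parses the target page into a BeautifulSoup tree at
# construction time. Subclasses implement parser() to extract their content.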
class Crawler:
    rs = requests.Session()  # one session shared by all crawlers

    def __init__(self, target_url, method='get', target=None):
        print('Start Crawler....{}'.format(self.__class__.__name__))
        self.url = target_url
        self.content = self.analyze(method, target)

    def analyze(self, method, target):
        if method == 'get':
            res = self.rs.get(self.url, verify=False)
        else:
            # POST: submit PTT's over-18 confirmation; the response redirects
            # to the board index named in 'from', so res ends up as that page.
            if target == 'Beauty':
                url = '/bbs/Beauty/index.html'
            else:
                url = '/bbs/Gossiping/index.html'
            load = {
                'from': url,
                'yes': 'yes'
            }
            res = self.rs.post('https://www.ptt.cc/ask/over18', verify=False, data=load)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup
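

# Movie threads from an eyny forum index; only titles that look like
# cloud-drive shares (MEGA, Google Drive, ...) are kept.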
class EynyMovie(Crawler):
    def parser(self):
        result = ''
        for link in self.content.select('.bm_c tbody .xst'):
            href = link['href']
            title = link.text
            if '11379780-1-3' in href:
                # skip this hard-coded thread (likely a pinned post)
                continue
            if self.pattern_mega(title):
                result += '{}\nhttp://www.eyny.com/{}\n\n'.format(title, href)
        return result

    @staticmethod
    def pattern_mega(text):
        # re.IGNORECASE already covers case variants, so one spelling of each
        # cloud-drive keyword is enough.
        patterns = ['mega', 'mg', 'mu', 'me', 'gd', 'google']
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
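

# First five featured stories from tw.appledaily.com.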
class AppleNews(Crawler):
    def parser(self):
        result = ''
        base = 'https://tw.appledaily.com'
        for index, data in enumerate(self.content.select('.flex-feature a')):
            if index == 5:
                break
            title = data.find('img')['alt']
            url = data['href']
            result += '{}{} {}\n\n'.format(base, url, title)
        return result
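

# Plain record for one PTT article: title, URL, and push (upvote) count.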
class ArticleInfo:
    def __init__(self, title=None, url=None, rate=None):
        self.title = title
        self.url = url
        self.push = rate
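

# Walks the newest `parser_page` index pages of PTT Beauty and keeps
# articles whose push count reaches `push_rate`.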
class PttBeauty(Crawler):
    parser_page = 2  # number of index pages to crawl
    push_rate = 10   # minimum push (upvote) count to keep an article

    def parser(self):
        url = 'https://www.ptt.cc/bbs/Beauty/index{}.html'
        index_seqs = PttBeauty.get_all_index(self.content, url, self.parser_page)
        articles = []
        for page in index_seqs:
            try:
                res = self.rs.get(page, verify=False)
                res.raise_for_status()
            except requests.exceptions.HTTPError as exc:
                logging.warning(HTTP_ERROR_MSG.format(res=exc.response))
            except requests.exceptions.ConnectionError:
                logging.error('Connection error')
            else:
                articles += self.crawler_info(res)
                time.sleep(0.05)
        return ''.join('[{} push] {}\n{}\n\n'.format(article.push,
                                                     article.title,
                                                     article.url)
                       for article in reversed(articles))

    def crawler_info(self, res):
        soup = BeautifulSoup(res.text, 'html.parser')
        articles = []
        # Grab each article's title, URL, and push count
        for r_ent in soup.find_all(class_="r-ent"):
            try:
                # Get the article URL first; deleted articles have no link
                link = r_ent.find('a')['href']
                if not link:
                    continue
                # With a URL confirmed, grab the title and push count
                title = r_ent.find(class_="title").text.strip()
                rate = r_ent.find(class_="nrec").text
                url = 'https://www.ptt.cc' + link
                if rate:
                    if rate.startswith('爆'):
                        rate = 100
                    elif rate.startswith('X'):
                        # 'X1'-'X9' mark downvoted articles; 'XX' is the floor
                        rate = -100 if rate == 'XX' else -int(rate[1])
                    else:
                        rate = int(rate)
                else:
                    rate = 0
                # Keep only articles at or above the push threshold
                if rate >= self.push_rate:
                    articles.append(ArticleInfo(title, url, rate))
            except Exception as e:
                print('Article has been deleted', e)
        return articles

    @staticmethod
    def get_all_index(content, url, parser_page):
        # The second '.btn.wide' button links to the previous page, e.g.
        # '/bbs/Beauty/index3942.html'; the newest page is that number + 1.
        max_page = PttBeauty.get_max_page(content.select('.btn.wide')[1]['href'])
        return (
            url.format(page)
            for page in range(max_page - parser_page + 1, max_page + 1)
        )

    @staticmethod
    def get_max_page(content):
        start_index = content.find('index')
        end_index = content.find('.html')
        page_number = content[start_index + 5:end_index]
        return int(page_number) + 1
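

# Same page-walking as PttBeauty, but returns the newest 15 articles
# without filtering on push count.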
class PttGossiping(Crawler):
    parser_page = 2  # number of index pages to crawl

    def parser(self):
        url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'
        # The paging helpers are shared with PttBeauty, so reuse them here.
        index_seqs = PttBeauty.get_all_index(self.content, url, self.parser_page)
        articles = []
        for page in index_seqs:
            try:
                res = self.rs.get(page, verify=False)
                res.raise_for_status()
            except requests.exceptions.HTTPError as exc:
                logging.warning(HTTP_ERROR_MSG.format(res=exc.response))
            except requests.exceptions.ConnectionError:
                logging.error('Connection error')
            else:
                articles += self.crawler_info(res)
                time.sleep(0.05)
        result = ''
        for index, article in enumerate(reversed(articles)):
            if index == 15:
                break
            result += '{}\n{}\n\n'.format(article.title, article.url)
        return result

    @staticmethod
    def crawler_info(res):
        soup = BeautifulSoup(res.text, 'html.parser')
        articles = []
        for r_ent in soup.find_all(class_="r-ent"):
            try:
                # Get the article URL first; deleted articles have no link
                link = r_ent.find('a')['href']
                if not link:
                    continue
                # With a URL confirmed, grab the title
                title = r_ent.find(class_="title").text.strip()
                url = 'https://www.ptt.cc' + link
                articles.append(ArticleInfo(title, url))
            except Exception as e:
                print('Article has been deleted', e)
        return articles
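

# Hot PTT articles as mirrored on disp.cc.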
class PttHot(Crawler):
    def parser(self):
        result = ''
        for data in self.content.select('#list div.row2 div span.listTitle'):
            title = data.text
            href = data.find('a')['href']
            if href == "796-59l9":
                # hard-coded sentinel entry; stop once it appears
                break
            result += '{}\nhttp://disp.cc/b/{}\n\n'.format(title, href)
        return result
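

# Upcoming film list from atmovies.com.tw, capped at 20 entries.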
class Movie(Crawler):
    def parser(self):
        result = ''
        for index, data in enumerate(self.content.select('ul.filmNextListAll a')):
            if index == 20:
                break
            title = data.text.replace('\t', '').replace('\r', '')
            result += '{}\nhttp://www.atmovies.com.tw{}\n'.format(title, data['href'])
        return result
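

# First 12 article titles and links from the configured news page.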
class TechNews(Crawler):
    def parser(self):
        result = ''
        for index, data in enumerate(self.content.select('article div h1.entry-title a')):
            if index == 12:
                break
            result += '{}\n{}\n\n'.format(data.text, data['href'])
        return result
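

# Article titles and links from the configured PanX page.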
class Panx(Crawler):
    def parser(self):
        result = ''
        for data in self.content.select('div.container div.row div.desc_wrap h2 a'):
            result += '{}\n{}\n\n'.format(data.text, data['href'])
        return result
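

# Current fuel prices: the page heading plus the gas-price and CPC blocks.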
class OilPrice(Crawler):
    def parser(self):
        title = self.content.select_one('#main').text.replace('\n', '').split('(')[0]
        gas_price = self.content.select_one('#gas-price').text.replace('\n\n\n', '').replace(' ', '')
        cpc = self.content.select_one('#cpc').text.replace(' ', '')
        return '{}\n{}{}'.format(title, gas_price, cpc)
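

# Video links from a YouTube listing page; random() picks one at random.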
class YoutubeVideo(Crawler):
    def parser(self):
        videos = ['https://www.youtube.com{}'.format(data.find('a')['href'])
                  for data in self.content.select('.yt-lockup-title')]
        return videos

    @staticmethod
    def random(videos):
        return random.choice(videos)
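

# A minimal usage sketch behind the usual __main__ guard. The PTT URL and the
# method/target arguments mirror values analyze() already handles; the real
# scheduling/entry point is assumed to live elsewhere in this repo.
if __name__ == '__main__':
    beauty = PttBeauty('https://www.ptt.cc/bbs/Beauty/index.html',
                       method='post', target='Beauty')
    print(beauty.parser())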