-
Notifications
You must be signed in to change notification settings - Fork 0
/
tzgg.py
85 lines (69 loc) · 3.53 KB
/
tzgg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import re
import requests
from lxml import etree
from basespider import BaseSpider
'''
Spider of 通知公告 http://news.zjnu.edu.cn/
'''
__author__ = 'ddMax'
TABLE_NAME = 'news_tzgg'
class TZGGSpider(BaseSpider):
    """Spider for the 通知公告 (notices/announcements) list at
    http://news.zjnu.edu.cn/.

    Fetches paginated list pages, extracts per-article metadata
    (title, id, overview, date, author, hit count) and hands the
    records to the BaseSpider persistence layer.
    """

    def __init__(self):
        print('Start fetching...')

    def getsource(self, url):
        """Download *url* and return its text, decoded as GBK.

        The site serves GBK-encoded pages, so the response encoding is
        forced before reading ``.text``.
        """
        html = requests.get(url)
        html.encoding = 'gbk'
        return html.text

    def changepage(self, url, total_page):
        """Return list-page URLs from the page index embedded in *url*
        up to and including *total_page*.

        :param url: a list-page URL containing a ``pageindex=N`` query
            parameter.
        :param total_page: last page number to include (inclusive).
        :raises AttributeError: if *url* has no ``pageindex=`` parameter.
        """
        now_page = int(re.search(r'pageindex=(\d+)', url).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # NOTE: the original passed re.S as re.sub's 4th positional
            # argument, which is `count`, not `flags` — harmless here
            # only because the parameter occurs once. Dropped entirely;
            # DOTALL is irrelevant to this pattern anyway.
            link = re.sub(r'pageindex=\d+', 'pageindex=%s' % i, url)
            page_group.append(link)
        return page_group

    # The title and article ID carry no distinctive markers inside each
    # news section, so they are scraped separately from the whole page.
    def getalltitlesandids(self, source):
        """Return ``(titles, article_ids)`` scraped from a list page,
        both reversed so they run oldest-first."""
        titles = re.findall(r'<SPAN style="FONT-WEIGHT:.*?_blank>(.*?)</a>', source, re.S)
        titles = [each.strip() for each in titles]
        article_ids = re.findall(r'<SPAN style="FONT-WEIGHT:.*?article_id=(\d*)"', source, re.S)
        # Return both lists in reverse order.
        return titles[::-1], article_ids[::-1]

    # Extract each news block (everything except title and ID).
    def getallsection(self, source):
        """Return the HTML fragment for every news entry on the page,
        reversed to stay aligned with getalltitlesandids()."""
        allsection = re.findall(r'<td bgcolor=FloralWhite>(.*?)</table></td>', source, re.S)
        # Return in reverse order.
        return allsection[::-1]

    # Combine the title + per-section details into one record dict.
    def getinfo(self, eachsection, title, articleId):
        """Build the info dict for a single article.

        :param eachsection: HTML fragment produced by getallsection().
        :param title: article title (already stripped).
        :param articleId: article id as a decimal string.
        :returns: dict with keys title, articleId (int), overview,
            date, author, hits.
        """
        info = dict()
        info['title'] = title
        info['articleId'] = int(articleId)
        info['overview'] = self.patchstr(
            str(etree.HTML(eachsection).xpath(r'//tr[2]/td/div/text()')[0]).strip())
        date_author_hits = str(re.findall(r'COLOR: #006600; ">(.*?)</span>', eachsection, re.S))
        info['date'] = re.search(r'----(.*?) ', date_author_hits, re.S).group(1).strip()
        # NOTE(review): the two patterns below were mojibake-corrupted
        # in the scraped copy of this file (unbalanced parentheses);
        # '供稿：' (contributor) and '浏览次数：' (view count) are the
        # reconstructed labels — confirm against the live page markup.
        info['author'] = re.search(r'供稿：(.*?) ', date_author_hits, re.S).group(1).strip()
        info['hits'] = re.search(r'浏览次数：(\d*)', date_author_hits, re.S).group(1).strip()
        return info

    def saveinfo(self, classinfo):
        """Append every record in *classinfo* to info.txt, one
        ``key:value`` line per field, blank line between records."""
        # Context manager guarantees the file is closed even on error.
        with open('info.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                f.writelines('title:' + each['title'] + '\n')
                # articleId is stored as an int (see getinfo); the
                # original concatenated it to a str and raised TypeError.
                f.writelines('articleId:' + str(each['articleId']) + '\n')
                f.writelines('overview:' + each['overview'] + '\n')
                f.writelines('date:' + each['date'] + '\n')
                f.writelines('author:' + each['author'] + '\n')
                f.writelines('hits:' + each['hits'] + '\n')
                f.writelines('\n')
if __name__ == '__main__':
    url = 'http://www.zjnu.edu.cn/news/common/article_list.aspx?border_id=3&pageindex=1'
    spider = TZGGSpider()
    # Full crawl covers 103 pages; limited to the first 4 here.
    # all_links = spider.changepage(url, 103)
    all_links = spider.changepage(url, 4)
    for link in all_links:
        print('Parsing ' + link + ':')
        page_html = spider.getsource(link)
        titles, article_ids = spider.getalltitlesandids(page_html)
        sections = spider.getallsection(page_html)
        # The three lists are index-aligned, so walk them in lockstep.
        for section, title, article_id in zip(sections, titles, article_ids):
            record = spider.getinfo(section, title, article_id)
            print('Saving news ' + article_id + ' to database... ', end='')
            spider.savetodb(TABLE_NAME, record)
            print('Done!')