-
Notifications
You must be signed in to change notification settings - Fork 0
/
testJiexi.py
84 lines (74 loc) · 3.92 KB
/
testJiexi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
__author__ = 'vincentgong'
import json
import re
def _first_match(pattern, text, default):
    """Return the first ``re.findall`` hit of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def _parse_post(WB):
    """Build the record dict for one post fragment.

    *WB* is HTML that is still embedded in a JSON string, so quotes, slashes
    and newlines carry their backslash escapes (``\\"``, ``\\/``, ``\\n``);
    the patterns below match that escaped form.
    """
    # Body text: everything inside the WB_text element(s), with escapes undone.
    text = ''.join(re.findall(r"WB\_text[^>]*>(.*?)<\\/div", WB))
    text = text.replace('\\n', '').replace('\\"', '"').replace('\\/', '/').strip()

    # Place information is searched for in the cleaned body text.
    geo = ''.join(re.findall(r"place.{13}?(.+).title", text)).strip()
    if '_' not in geo:
        # Values without an underscore lose their first character.
        # NOTE(review): presumably strips a leftover separator - confirm.
        geo = geo[1:]
    geo_title = ''.join(re.findall(r"place.+title..?(.+)..href", text)).strip()

    source = ''.join(re.findall(r'WB\_text[^>]*>.*nofollow\\">(.*?)<', WB))

    # Counters: the first number following each label. A missing label yields
    # 0 instead of raising IndexError, the same way the collect count has
    # always been handled. Found values stay strings, as before.
    # NOTE(review): the favourites label is normally written 收藏; confirm 搜藏
    # is what the page really contains.
    collect = _first_match(r'搜藏.*?(\d+)', WB, 0)
    comment = _first_match(r'评论.*?(\d+)', WB, 0)
    forward = _first_match(r'转发.*?(\d+)', WB, 0)
    like = ''.join(re.findall(r'WB\_text[^>]*>.*praised.*?\(([0-9]*)', WB))

    # NOTE(review): this only captures digits that directly follow "mid=", so
    # a quoted value (mid=\"123\") produces an empty string.
    wid = _first_match(r'mid=.*?(\d*)', WB, '')
    name = ''.join(re.findall(r'nick-name=\\"([^"]*)\\"', WB))
    uid = ''.join(re.findall(r'fuid=([^"]*)\\"', WB))
    timestamp = _first_match(r'date=\\"([^"]*)\\"', WB, '')

    return {'text': text, 'collected_count': collect, 'mid': wid, 'name': name,
            'uid': uid, 'nick': name, 'self': 'dontknow', 'timestamp': timestamp, 'source': source,
            'location': 'null', 'country_code': '', 'province_code': 'null', 'city_code': 'null',
            'geo': geo, 'geo_title': geo_title,
            'link': uid, 'forward': forward, 'like': like, 'comment': comment}


def jiexi(content):
    """Parse a saved Weibo feed page into a list of post records.

    If *content* contains one or more ``pl.content.homeFeed.index`` script
    payloads, only the longest non-empty ``html`` value among them is parsed;
    otherwise *content* is parsed as given. The text is then cut into
    fragments at every ``WB_detail`` marker and each fragment is turned into
    a dict with the keys ``text``, ``collected_count``, ``mid``, ``name``,
    ``uid``, ``nick``, ``self``, ``timestamp``, ``source``, ``location``,
    ``country_code``, ``province_code``, ``city_code``, ``geo``,
    ``geo_title``, ``link``, ``forward``, ``like`` and ``comment``.

    Fields that cannot be found are left empty (``''``) or, for the
    collect/comment/forward counters, set to ``0``.

    :param content: page source as a single string.
    :return: list of dicts, one per fragment, in page order.
    """
    feeds = re.findall(r'pl\.content\.homeFeed\.index.*html\":\"(.*)\"}\)', content)
    if feeds:
        # max() returns the first of several equally long candidates; an empty
        # capture never replaces the input.
        longest = max(feeds, key=len)
        if longest:
            content = longest

    # Doubling each marker lets a non-overlapping findall use one copy as the
    # end of a fragment and the other as the start of the next one.
    # NOTE(review): only text *between* two markers is captured, so whatever
    # follows the last marker is dropped - confirm that is intended.
    content = content.replace('WB_detail', 'WB_detailWB_detail')
    fragments = re.findall(r"WB\_detail(.+?)WB\_detail", content)

    return [_parse_post(fragment) for fragment in fragments]
def writefile(self, filename, content):
    """Write *content* to *filename*, replacing any existing file.

    :param self: unused; kept so existing call sites keep working.
    :param filename: path of the file to create or overwrite.
    :param content: text to write.
    """
    # open() exists on both Python 2 and 3 (file() is Python 2 only), and the
    # with-block closes the handle even if write() raises.
    with open(filename, 'w') as fw:
        fw.write(content)
# Parse one saved page dump and print the extracted posts as a JSON array.
# The with-block makes sure the input file is closed after reading.
with open('/Users/vincentgong/Documents/workspaces/Pyworks/weibo-master/output/5148876354/page1-source1.txt') as source_file:
    content = source_file.read()
result = json.dumps(jiexi(content))
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(result)