-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
364 lines (300 loc) · 13.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# -*- coding: utf-8 -*-
import urllib
import logging
import re
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.api import urlfetch
from django.utils import simplejson as json
from google.appengine.ext.webapp.util import run_wsgi_app
# 'README' will tell you how to fill the below 3 values
USER_ID=None # user_id is immutable, better than screen_name
WEIBO_GSID=None
WEIBO_ST=None
# Be careful if you want to modify the request api url
TWITTER_REQAPI = 'http://api.twitter.com/1/statuses/user_timeline.json?user_id=%d&trim_user=true&include_rts=true&include_entities=true&since_id=%d'
# blocked by Weibo.com
INNOCENT_NETLOCS = ['goo.gl', 'bit.ly', 'is.gd', 'tinyurl.com']
# not show user ids in the tweet due to disturbing and security,
# replace "@username" with customized string("@nil" by default),
# instead string can be empty, that means remove all the mentions.
PLZ_HIDE_ID = True
PLZ_HIDE_ID_INSTEAD = "@x"
# Configuration END.
### DO NOT CHANGE THE FOLLOWING VALUES UNLESS YOU ARE SURE ###
WEIBO_KICK_URL='http://weibo.cn/mblog/sendmblog?st=%s&st=%s&vt=4&gsid=%s' % (WEIBO_ST, WEIBO_ST, WEIBO_GSID)
WEIBO_HOME_URL='http://weibo.cn/?vt=1&gsid=%s' % WEIBO_GSID
WEIBO_KICK_URL_DEADLINE=30
WEIBO_HOME_URL_DEADLINE=12
TRIED_TIMES_MAX=5
UA='Opera/9.80 (Android; Linux; Opera Mobi/ADR-1011151731; U; en) Presto/2.5.28 Version/10.1'
###
class DB(db.Model):
kicked = db.BooleanProperty(default=False)
tweet_id = db.IntegerProperty()
tweet = db.StringProperty(multiline=True)
user_id = db.IntegerProperty()
failed = db.BooleanProperty(default=False) # bloody damn censorship or sina's block
whyfailed = db.TextProperty()
tried_times = db.IntegerProperty(default=0)
tco_urls = db.StringListProperty()
tco_expanded_urls = db.StringListProperty()
class MainPage(webapp.RequestHandler):
def get(self):
self.response.out.write("fair enough")
class WhyfailedPage(webapp.RequestHandler):
def get_failed_tweets(self):
return DB.all().filter('failed', True).order('-tweet_id')
def get(self, tid):
if tid:
k = DB.get_by_id(int(tid))
if k is not None:
self.response.out.write("%s" % k.whyfailed)
else:
failed_tweets = self.get_failed_tweets()
self.response.out.write("""
<html>
<body>""")
for t in failed_tweets:
id_ = t.key().id()
self.response.out.write("""
<a href="/whyfailed/%s">%d</a>
""" % (id_, id_))
self.response.out.write("""
</body>
</html>
""")
class NexttweetHandler(webapp.RequestHandler):
@classmethod
def next_tweet(cls):
# next tweet to be kicked
return DB.all().filter('kicked', False).filter('failed', False).order('tweet_id').get()
def get(self):
nexttweet = self.next_tweet()
if nexttweet is None:
self.response.out.write('')
else:
self.response.out.write(nexttweet.tweet)
class LatesttweetHandler(webapp.RequestHandler):
@classmethod
def lastest_tweet(cls):
# latest tweet not kicked(maybe not to be kicked immediately)
return DB.all().filter('kicked', False).filter('failed', False).order('-tweet_id').get()
def get(self):
lastest = self.lastest_tweet()
if lastest is None:
self.response.out.write('')
else:
self.response.out.write(lastest.tweet)
class LasttweetHandler(webapp.RequestHandler):
@classmethod
def last_tweet(cls):
# last tweet which has been tried to kick(maybe kicked or failed or both)
# it's not easy to get !(failed == False and kicked == False),
# so the tweet whose id is smaller than the next tweet is what we want.
next_ = NexttweetHandler.next_tweet()
if next_ is None: # not tweets in the kick peeding
return DB.all().order('-tweet_id').get()
else:
return DB.all().filter('tweet_id <', next_.tweet_id).order('-tweet_id').get()
def get(self):
last = self.last_tweet()
if last is None:
self.response.out.write('')
else:
self.response.out.write('Failed: %s <|> ' % last.failed)
self.response.out.write('Kicked: %s <|> ' % last.kicked)
self.response.out.write(last.tweet)
class FetchtweetsHandler(webapp.RequestHandler):
@classmethod
def expand(self, url):
"""Expand short url to its original."""
try:
res = urlfetch.fetch(url=url, method="HEAD", follow_redirects=False, deadline=10)
except Exception, e:
logging.error("expand url %s failed\nerror:%s" % (url, e))
# there will still be an chance to expand the url when kicks,
# so leave a flag here.
return None
return res.headers['Location'] if res.status_code in (301, 302) else url
def expand_tco_urls(self, tcos):
"""Return pairs of t.co short urls and their expanded urls.
If the expanded url is in the malice urls list of Weibo.com, expand
it to the unshorten one.
< [{"url":"http://t.co/xxx", "expanded_url":"http://loooooongurl"},
{"url":"http://t.co/yyy", "expanded_url":"http://goo.gl/abc"}]
> (["http://t.co/xxx", "http://t.co/yyy"],
["http://loooooongurl", "http://zzzzzzzzzzzzzz"])
Twitter forces to wrap the url longer than 20 characters regardless of
whether the url was already shortened by other services like goo.gl or
bit.ly.
Support expanding short urls see INNOCENT_NETLOCS
"""
from urlparse import urlsplit
global INNOCENT_NETLOCS
tco_urls = []
tco_expanded_urls = []
for e in tcos:
if e["url"] and e["expanded_url"]: # some urls not expanded(null)
tco_urls.append(e["url"])
u = e["expanded_url"]
o = urlsplit(u)
uu = self.expand(u) if o.netloc in INNOCENT_NETLOCS else u
tco_expanded_urls.append(uu)
return (tco_urls, tco_expanded_urls)
# fetch user's tweets which are not reply to anyone
def fetch(self, sid):
global USER_ID, TWITTER_REQAPI
# Twitter limits GAE quite a lot, but other proxy api is supported,
# see README
req = TWITTER_REQAPI % (USER_ID, sid)
try:
res = urlfetch.fetch(url=req, deadline=10)
except Exception, e:
logging.error("fetch: %s" % e)
return
if res.status_code != 200:
msg = "twitter api maybe request too much.\n%s" % res.content
logging.info(msg)
self.response.out.write(msg)
return
else:
logging.info("fetchtweets: we got something.")
tweets = json.loads(res.content)
for t in tweets:
tweet = t["text"].strip()
# skip the reply tweet
if t["in_reply_to_status_id"] is not None:
continue
# you can use '@ bla bla bla...' to force not to kick
if tweet[0] == '@' and tweet[1] in ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789':
continue
# store the t.co like(goo.gl/bit.ly) urls and their expanded urls
# 'include_entities' should be turned on in the api calling
# "entities":{"urls":["url":"", "expanded_url":""]}
tco_urls, tco_expanded_urls = self.expand_tco_urls(t["entities"]["urls"])
try:
DB(tweet_id=t["id"], tweet=t["text"], user_id=USER_ID,
tco_urls=tco_urls, tco_expanded_urls=tco_expanded_urls).put()
except Exception, e:
logging.error("save entity error: %s" % e)
break
#self.response.out.write("tid: %d, t: %s, uid: %d" % (t["id"], t["text"], t["user"]["id"]))
# end for
def get_since_id(self):
if DB.all().count(1) == 0: # it's really no any tweet fetched
return 12345678 # this tweet is quite early, so...
k = DB.all().order('-tweet_id').get()
return k.tweet_id
def get(self):
since_id = self.get_since_id()
self.fetch(since_id)
# self.response.out.write(since_id)
class KickassHandler(webapp.RequestHandler):
def hide_id(self, tweet, instead_str):
"""Replace "@username" in the tweet with other string like "@nil".
Name user id will have same postfix, like:
"Hej @blabla RT @blabla: hello @yadayada" =>
"Hej @nil_0 RT @nil_0: hello @nil_1"
If instead_str is empty, every "@xxx" will be removed.
"""
ids = re.findall('@\w+', tweet)
if ids:
ids = list(set(ids))
for i in range(len(ids)):
tweet = tweet.replace(ids[i],
instead_str + "_%d" % i if instead_str else "")
return tweet
def unwrap_tco(self, t, tco_pairs):
"""Replace t.co urls in the tweet with the expanded ones.
Twitter forces to wrap all urls using t.co, but t.co urls are blocked by weibo.com.
parameter t is a datastore entity.
tco_pairs = [(u'http://t.co/AbC123', u'http://looooongurl.com/blabla')]
"""
tweet = t.tweet
for tco_url, tco_expanded_url in tco_pairs:
# yet another chance to expand the short url not succeeded last time
if tco_expanded_url is None:
u = FetchtweetsHandler.expand(tco_url)
if u is None:
tco_expanded_url = tco_url
else:
t.tco_expanded_urls[t.tco_urls.index(tco_url)] = u
t.put()
tweet = tweet.replace(tco_url, tco_expanded_url)
return tweet
def get_weibo_count(self, msg):
global WEIBO_GSID, WEIBO_HOME_URL, WEIBO_HOME_URL_DEADLINE, UA
headers = {'User-Agent': UA}
try:
res = urlfetch.fetch(url=WEIBO_HOME_URL, headers=headers,
deadline=WEIBO_HOME_URL_DEADLINE)
except Exception, e:
raise Exception(e) # just exit...
# match WEIBO_GSID">微博[362]</a>
patt = r'%s">微博\[(\d+)\]</a>' % WEIBO_GSID
v = re.search(patt, res.content)
try:
count = int(v.group(1))
except Exception, e:
logging.error("get count(%s) error: \n%s\n%s" % (msg, WEIBO_HOME_URL, res.content))
raise Exception(e)
return count
def get(self):
global TRIED_TIMES_MAX, WEIBO_KICK_URL, WEIBO_KICK_URL_DEADLINE, UA, \
PLZ_HIDE_ID, PLZ_HIDE_ID_INSTEAD
t = NexttweetHandler.next_tweet()
if t is None:
self.response.out.write("No tweet to kick.")
return
if t.tried_times > TRIED_TIMES_MAX:
t.failed = True
t.put()
logging.info("Set failed: %s" % t.tweet)
return
headers = {'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': UA}
t_expanded = self.unwrap_tco(t, zip(t.tco_urls, t.tco_expanded_urls))
t_expanded = self.hide_id(t_expanded, PLZ_HIDE_ID_INSTEAD) if PLZ_HIDE_ID else t_expanded
form_fields = {'rl': '0', 'content': t_expanded.encode('utf-8')}
form_data = urllib.urlencode(form_fields)
count_before_kick = self.get_weibo_count('before')
try:
result = urlfetch.fetch(url=WEIBO_KICK_URL, payload=form_data,
method=urlfetch.POST, headers=headers,
deadline=WEIBO_KICK_URL_DEADLINE)
except Exception, e:
logging.error("kick: %s\n%s" % (e, t_expanded))
self.response.out.write("%s" % "kick: urlfetch failed...")
return
msg = "Kicked!" # kicked, but maybe not posted
t.tried_times += 1
t.put()
if result.content.find("发布成功!") == -1: # not find it
count_after_kick = self.get_weibo_count('after')
# count incr 1 if really kicked, but of course there is still
# a probability that count is increased by user posting a tweet
# from the weibo page during our kicking procedure.
if count_before_kick < count_after_kick:
t.kicked = True
msg += "(%d:%d) Count increased, maybe OK." % (count_before_kick, count_after_kick)
else:
t.whyfailed = result.content.decode("utf-8")
msg += "(%d:%d) Mostly maybe blocked or censored! Try less." % (count_before_kick, count_after_kick)
t.tried_times += 1 # mostly blocked, no need try TRIED_TIMES_MAX times
else:
t.kicked = True
msg += " Mostly succeeded."
logging.info(msg + "\n%s" % t_expanded)
t.put()
application = webapp.WSGIApplication([('/kickass', KickassHandler),
('/nexttweet', NexttweetHandler),
('/latesttweet', LatesttweetHandler),
('/lasttweet', LasttweetHandler),
('/fetchtweets', FetchtweetsHandler),
('/whyfailed/(.*)', WhyfailedPage),
('/.*', MainPage)], debug=True)
def main():
run_wsgi_app(application)
if __name__ == '__main__':
main()