-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy paththread_to_tree_grouped.py
103 lines (75 loc) · 2.72 KB
/
thread_to_tree_grouped.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys
import json
from pprint import pprint
def _self_thread_id(tweet):
    """Return the tweet's ``legacy.self_thread.id_str``, or None when absent."""
    # `or {}` also covers a `self_thread` key that is present but null,
    # which would otherwise raise AttributeError on the chained `.get`.
    return (tweet['legacy'].get('self_thread') or {}).get('id_str')


def partition_replies_deep(tweets, curr, _thread=None, _replies=None):
    """Split everything below `curr` into a same-author thread and other replies.

    Direct replies to `curr` are the items of `tweets` whose
    ``legacy.in_reply_to_status_id_str`` equals ``curr['rest_id']``; they are
    visited in the order they appear in `tweets`.

    * Replies whose ``legacy.user_id_str`` differs from `curr`'s go to
      `_replies`. On the outermost call they are appended unchanged; on
      nested calls a shallow copy carrying ``'_quoted': curr`` (the tweet
      that was replied to) is appended instead.
    * Among same-author replies, one is picked as the thread continuation
      and appended to `_thread`: the first one sharing `curr`'s
      ``self_thread.id_str`` if `curr` has one, otherwise simply the first.
      The remaining same-author replies are appended to `_replies` unless a
      tweet with the same ``rest_id`` is already there.
    * The function then recurses into every same-author reply, sharing the
      same two accumulator lists.

    Args:
        tweets: iterable of tweet dicts (re-iterated on every recursion
            level, so it must be re-iterable, e.g. a list).
        curr: the tweet dict whose replies are partitioned.
        _thread: accumulator for the same-author thread; leave as None on
            the outermost call. Passing a list makes the call behave as a
            nested level (other-author replies get a ``'_quoted'`` key).
        _replies: accumulator for all other replies; leave as None on the
            outermost call.

    Returns:
        The tuple ``(_thread, _replies)``.
    """
    is_first_level = _thread is None
    if _thread is None:
        _thread = []
    if _replies is None:
        _replies = []

    curr_id = curr['rest_id']
    curr_author = curr['legacy']['user_id_str']

    replies_same_author = []
    for reply in tweets:
        if reply['legacy'].get('in_reply_to_status_id_str') != curr_id:
            continue
        if reply['legacy']['user_id_str'] == curr_author:
            replies_same_author.append(reply)
        elif is_first_level:
            _replies.append(reply)
        else:
            # Deeper in the thread: remember which tweet was answered.
            _replies.append({**reply, '_quoted': curr})

    if not replies_same_author:
        return _thread, _replies

    # Prefer the reply that belongs to the same self-thread as `curr`;
    # fall back to the first same-author reply.
    continuation = None
    thread_id = _self_thread_id(curr)
    if thread_id:
        continuation = next(
            (r for r in replies_same_author if _self_thread_id(r) == thread_id),
            None,
        )
    if continuation is None:
        continuation = replies_same_author[0]
    _thread.append(continuation)

    # Same-author replies that are not the continuation count as ordinary
    # replies (deduplicated by rest_id).
    for reply in replies_same_author:
        if reply['rest_id'] == continuation['rest_id']:
            continue  # already part of the thread
        if not any(t['rest_id'] == reply['rest_id'] for t in _replies):
            _replies.append(reply)

    # NOTE(review): this descends into *every* same-author reply, not only
    # the continuation, so descendants of the side replies also land in
    # these accumulators - confirm that this is intended.
    for reply in replies_same_author:
        partition_replies_deep(tweets, reply, _thread, _replies)

    return _thread, _replies
def process_replies(tweets, curr, depth=0, tree=None):
    """Flatten the conversation below `curr` into a depth-annotated list.

    An entry is appended to `tree` for `curr`: a shallow copy of the tweet
    with two extra keys, ``'_parts'`` (the tweet itself followed by the
    same-author thread returned by `partition_replies_deep`) and
    ``'_depth'`` (the `depth` argument). Every other reply returned by
    `partition_replies_deep` is then processed recursively with
    ``depth + 1``, appending to the same list.

    Args:
        tweets: collection of tweet dicts handed through to
            `partition_replies_deep`.
        curr: tweet dict to emit an entry for.
        depth: nesting level recorded on the entry.
        tree: list to append to; a new list is created when None.

    Returns:
        The (possibly newly created) `tree` list.
    """
    if tree is None:
        tree = []  # flat tree: nesting is carried by each entry's '_depth'

    group = [curr]
    node = dict(curr)
    node['_parts'] = group
    node['_depth'] = depth
    tree.append(node)

    thread_tail, other_replies = partition_replies_deep(tweets, curr)
    # `group` is the very list stored on the node, so this fills '_parts'.
    group.extend(thread_tail)

    for reply in other_replies:
        process_replies(tweets, reply, depth + 1, tree)

    return tree
def main():
    """Build the reply tree for one saved thread and write it as JSON.

    The thread id is taken from the first command-line argument. The input
    is read from ``threads/thread_<id>.json`` (expects top-level ``tweets``
    and ``users`` lists) and the result is written to ``tree_<id>.json``
    with the keys ``tweets``, ``users_by_id`` and ``tree``.

    Exits with an error message when the argument is missing or when no
    tweet in the file has the requested id.
    """
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} THREAD_ID')
    thread_id = sys.argv[1]

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(f'threads/thread_{thread_id}.json', 'r', encoding='utf-8') as f:
        thread = json.load(f)

    tweets = thread['tweets']
    users = thread['users']
    users_by_id = {u['rest_id']: u for u in users}

    main_tweet = next((t for t in tweets if t['rest_id'] == thread_id), None)
    if main_tweet is None:
        sys.exit(
            f'tweet {thread_id} not found in threads/thread_{thread_id}.json'
        )
    author_id = main_tweet['legacy']['user_id_str']

    # Order: tweets by the thread author first, then by descending
    # favorite count, then by rest_id. Reply order in the tree follows
    # this list order.
    tweets.sort(
        key=lambda t: (
            -int(t['legacy']['user_id_str'] == author_id),
            -t['legacy']['favorite_count'],
            t['rest_id'],
        )
    )

    tree = process_replies(tweets, main_tweet)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must
    # be opened as UTF-8 explicitly or the write can fail on some platforms.
    with open(f'tree_{thread_id}.json', 'w', encoding='utf-8') as f:
        json.dump(
            {
                'tweets': tweets,
                'users_by_id': users_by_id,
                'tree': tree,
            },
            f,
            ensure_ascii=False,
        )
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()