-
Notifications
You must be signed in to change notification settings - Fork 0
/
Hash_filter.py
31 lines (28 loc) · 1 KB
/
Hash_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Jul 5: try shrinking the data size by filtering out users with fewer than 2 comments
import hashlib
import os
# Input directory: every entry is read as one user's comment file, and the
# entry name is used as the username.
Commenter = "../CommentUser/"
# Output directory: receives one file of comment hashes per retained user.
output = "../BigCommenterHashed/"
# Unused threshold kept from an earlier experiment.
#output_threshold = 0.01
# username -> set of MD5 hex digests of that user's comment lines
# (only users that pass the filter in map_filter_init are stored here).
user_set_map = dict()
# Every username seen in the input directory, filtered or not.
users = set()
def map_filter_init():
    """Hash each user's comments and persist the hashes of active users.

    Every entry of the ``Commenter`` directory is opened as a file whose
    name is the username and whose lines are hashed one by one with MD5.
    Duplicate lines collapse because the digests are collected in a set.

    Side effects:
        * every username is added to the module-level ``users`` set;
        * users with more than 2 distinct line hashes are stored in
          ``user_set_map`` (username -> set of hex digests);
        * for those users a file named after the user is written to the
          ``output`` directory, one hex digest per line, in sorted order.

    Raises:
        OSError: if the input directory, an input file, or an output file
            cannot be opened (e.g. the ``output`` directory is missing).
    """
    global user_set_map, users
    for username in os.listdir(Commenter):
        users.add(username)
        user_path = os.path.join(Commenter, username)
        # Binary mode: hashlib only accepts bytes (passing text raises
        # TypeError on Python 3), and hashing the raw bytes avoids any
        # dependency on the file's text encoding.
        with open(user_path, 'rb') as userfile:
            hashed_comment = set(
                hashlib.md5(line).hexdigest() for line in userfile
            )
        # Keep only users with MORE than 2 distinct comment hashes.
        # NOTE(review): the original comment said "less than 2"; the
        # threshold is left exactly as the code had it - confirm intent.
        if len(hashed_comment) > 2:
            user_set_map[username] = hashed_comment
            output_file = os.path.join(output, username)
            # The context manager guarantees the file is flushed and closed.
            with open(output_file, 'w') as output_fp:
                # Sorted so the output is reproducible between runs; set
                # iteration order is arbitrary.
                output_fp.writelines(
                    digest + '\n' for digest in sorted(hashed_comment)
                )
# Run the hash-and-filter pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    map_filter_init()