-
Notifications
You must be signed in to change notification settings - Fork 0
/
Repeat_detection.py
66 lines (59 loc) · 2.09 KB
/
Repeat_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#modified Jun23 Found repeated comments!
#verified on local small set and is ready to upload with the whole folder after commenter extraction
#modified Jun30 count on full data set
#filter out spammers: accounts whose comments show spam features or whose repeated-post rate is more than the threshold
import os
# Repeat-rate cutoff: spammer_detection() flags a user when
# (repeated lines / distinct lines) is greater than this value.
threshold = 0.1
# Directory scanned with os.listdir(); every entry name is used as a username
# and the entry itself is opened as that user's file of comment lines.
Commenter = "../CommentUser/"
# Report file (opened in append mode) that receives "<username> <repeat count>" lines.
output = "../Repeated_Count.txt"
# Report file (opened in append mode) that receives the accounts flagged by spammer_detection().
output2 = "../Spam_account.txt"
def repeat_detection(commenter_dir=None, output_path=None):
    """Report how many repeated comment lines each user file contains.

    Every entry in *commenter_dir* is opened as a text file whose name is
    used as the username. A line counts as a repeat when an identical
    line (compared verbatim, line ending included) was already seen
    earlier in the same file. Users with at least one repeat get a
    ``"<username> <repeat count>"`` line appended to *output_path*.

    Args:
        commenter_dir: Directory holding one file per user. Defaults to
            the module-level ``Commenter`` path.
        output_path: Report file, opened in append mode. Defaults to the
            module-level ``output`` path.
    """
    # Resolve defaults at call time so the module constants stay the
    # single source of truth for argument-less callers.
    if commenter_dir is None:
        commenter_dir = Commenter
    if output_path is None:
        output_path = output

    for username in os.listdir(commenter_dir):
        path = os.path.join(commenter_dir, username)
        seen = set()
        repeat = 0
        with open(path, 'r') as userfile:
            for line in userfile:
                if line in seen:
                    repeat += 1
                else:
                    seen.add(line)
        if repeat >= 1:
            # Open per write inside a context manager: the handle is
            # always closed, and no report file is created when nothing
            # needs to be recorded.
            with open(output_path, 'a') as report:
                report.write(username + " " + str(repeat) + '\n')
def is_spam(line):
    """Return True when *line* looks like spam, False otherwise.

    A line is treated as spam when it is entirely alphanumeric
    (``line.isalnum()``) or when it contains one of the contact/link
    keywords ``QQ``, ``email``, ``http`` or ``.com`` at any position.

    NOTE(review): the isalnum() test is applied to the line exactly as
    given, so a line that still carries its trailing newline is never
    alphanumeric. Confirm whether callers are expected to strip first.
    """
    if line.isalnum():
        return True
    spam_kw = ('QQ', 'email', 'http', '.com')
    # Substring membership matches a keyword anywhere in the line,
    # including index 0 (where str.find() returns 0, not a positive value).
    return any(kw in line for kw in spam_kw)
def spammer_detection(commenter_dir=None, repeat_output=None,
                      spam_output=None, rate_threshold=None):
    """Count repeated comment lines per user and record likely spammers.

    Every entry in *commenter_dir* is opened as a text file whose name is
    used as the username. A line counts as a repeat when an identical
    line (compared verbatim) was already seen earlier in the same file.

    Two reports are appended to:

    * *repeat_output* gets ``"<username> <repeat count>"`` for every user
      with at least one repeated line.
    * *spam_output* gets the same line for every user whose repeat rate
      (repeats / distinct lines) is greater than *rate_threshold*, or who
      has at least one line for which ``is_spam`` returns a true value.

    Args:
        commenter_dir: Directory holding one file per user. Defaults to
            the module-level ``Commenter`` path.
        repeat_output: Repeat report, opened in append mode. Defaults to
            the module-level ``output`` path.
        spam_output: Spam-account report, opened in append mode. Defaults
            to the module-level ``output2`` path.
        rate_threshold: Repeat-rate cutoff. Defaults to the module-level
            ``threshold``.
    """
    # Resolve defaults at call time so the module constants stay the
    # single source of truth for argument-less callers.
    if commenter_dir is None:
        commenter_dir = Commenter
    if repeat_output is None:
        repeat_output = output
    if spam_output is None:
        spam_output = output2
    if rate_threshold is None:
        rate_threshold = threshold

    for username in os.listdir(commenter_dir):
        path = os.path.join(commenter_dir, username)
        seen = set()
        repeat = 0
        spam = 0
        with open(path, 'r') as userfile:
            for line in userfile:
                if line in seen:
                    repeat += 1
                else:
                    seen.add(line)
                    # A repeated line has already been checked on its
                    # first occurrence, and only "spam > 0" is used below.
                    if is_spam(line):
                        spam += 1

        # An empty user file has no distinct lines; treat its repeat rate
        # as zero instead of dividing by zero.
        rate = float(repeat) / len(seen) if seen else 0.0

        record = username + " " + str(repeat) + '\n'
        if repeat >= 1:
            with open(repeat_output, 'a') as report:
                report.write(record)
        if rate > rate_threshold or spam > 0:
            with open(spam_output, 'a') as report:
                report.write(record)
# Script entry point: when executed directly, run spammer_detection(),
# which writes both the repeat-count report and the spam-account report.
if __name__ == '__main__':
    spammer_detection()