-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.py
67 lines (49 loc) · 1.57 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
__author__ = 'Igor Ekishev'

# Regex patterns are raw strings so that ``\.`` reaches the regex engine
# verbatim instead of relying on Python keeping unknown escapes intact.

# E-mail address pattern (not referenced by the functions visible in this file).
REGEX_EMAIL = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
# Character set for tokens (not referenced by the functions visible in this file).
TOKENS = "abcdefghijklmnopqrstuvwxyz0123456789-$'"
# Characters that tokenize() replaces with a space before splitting.
# Includes a literal backslash followed by a backtick.
WHITESPACE = r'!()#%^&@*+}{][|/*":;.,<>?=\`~_'
# Dotted quad of 1-3 digit groups; octet ranges are not validated.
REGEX_IP = r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}'
# '#' followed by 3 or 6 hexadecimal digits.
REGEX_HEX_COLOUR = '#(?:[0-9a-fA-F]{3}){1,2}'
# Any '<...>' span that contains no nested '<'.
REGEX_HTML = '<[^<]+?>'
def tokenize(body):
    """Split *body* into a list of lowercase tokens.

    HTML tags are stripped, the anchor text returned by ``extract_urls`` is
    appended, every character in ``WHITESPACE`` is replaced by a space, and
    the result is lowercased and split on whitespace. Tokens that parse as
    integers are dropped. IP addresses and hex colours found in the raw
    *body* are then appended to the end of the list (colours lowercased).

    :param body: input text (str).
    :return: list of token strings.
    """
    # These are collected from the raw text, before tags and punctuation are
    # stripped, because stripping would break them apart.
    urls = extract_urls(body)
    list_of_colors = re.findall(REGEX_HEX_COLOUR, body)
    list_of_ips = re.findall(REGEX_IP, body)

    text = re.sub(REGEX_HTML, '', body) + urls  # remove html tags from text

    # One translation table and one pass over the text, instead of a separate
    # full-string pass for each noise character.
    noise_table = str.maketrans(dict.fromkeys(WHITESPACE, ' '))
    tokens = text.translate(noise_table).lower().split()
    tokens = remove_clear_ints(tokens)

    # Append IPs and hex colours after the ordinary tokens.
    tokens.extend(list_of_ips)
    tokens.extend(colour.lower() for colour in list_of_colors)
    return tokens
def extract_urls(text):
    """Return the attribute text and inner text of every anchor in *text*.

    For each case-insensitive ``<a ...>...</a>`` match, the text between
    ``<a`` and ``>`` and the text between the tags are concatenated. All
    matches are joined into one string with no separator.

    :param text: input text (str).
    :return: the concatenated string, or ``''`` when there are no anchors.
    """
    anchor_re = re.compile('(?i)<a([^>]+)>(.+?)</a>')
    # findall yields one (attributes, inner_text) tuple per anchor.
    return ''.join(attrs + inner for attrs, inner in anchor_re.findall(text))
def is_number(s):
    """Return True when ``int(s)`` succeeds, False when it raises ValueError."""
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True
def integers_in(list):
    """Return True if any element of *list* parses as an integer, else False.

    :param list: iterable of tokens. The name shadows the builtin but is kept
        so that existing keyword callers keep working.
    :return: bool. Always an explicit True or False, never None.
    """
    return any(is_number(item) for item in list)
def remove_clear_ints(tokens):
    """Remove every token that parses as an integer from *tokens*, in place.

    The relative order of the remaining tokens is preserved.

    :param tokens: mutable list of token strings; modified in place.
    :return: the same list object that was passed in.
    """
    # Slice assignment filters in one pass and keeps the caller's list object,
    # avoiding removal from a list while it is being iterated.
    tokens[:] = [token for token in tokens if not is_number(token)]
    return tokens