-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_reader.py
111 lines (90 loc) · 3.73 KB
/
web_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# web_reader.py: a program to read Censys data.
import json
import sys
import re
from bs4 import BeautifulSoup
# Functions to replace text in very simple ways. The second is needed for cases
# where we want to preserve a quote (otherwise we risk breaking the json
# format).
def replace_with_blank(match):
    """Substitution callback: disregard *match* and yield one space."""
    blank = " "
    return blank
def replace_with_quote(match):
    """Substitution callback: disregard *match* and yield an escaped quote.

    The result is the two-character sequence backslash + double quote, so a
    quote survives inside JSON string data without terminating the string.
    """
    backslash, quote = "\\", '"'
    return backslash + quote
# Patterns are compiled once at import time instead of on every call.
# DOTALL lets a <script>/<style> body span several lines, and IGNORECASE also
# catches upper- or mixed-case tag names such as <SCRIPT>.
_SCRIPT_BLOCK = re.compile(r'<script.*?/script>', re.DOTALL | re.IGNORECASE)
_STYLE_BLOCK = re.compile(r'<style.*?/style>', re.DOTALL | re.IGNORECASE)
_ANY_TAG = re.compile(r'<.*?>')


def stripTags(text):
    """Return *text* with script blocks, style blocks and markup tags removed.

    Script and style elements are deleted together with their contents; any
    remaining ``<...>`` tag is deleted but the text around it is kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (an empty string stays empty).
    """
    # Whole script/style elements must go first, otherwise the generic tag
    # pattern would strip only their tags and leave the code/CSS behind.
    text = _SCRIPT_BLOCK.sub('', text)
    text = _STYLE_BLOCK.sub('', text)
    return _ANY_TAG.sub('', text)
# Compiled patterns for the escape sequences that are rewritten in the text:
#   multichar        - backslash-u followed by 3 to 6 letters or digits
#   escape_and_quote - backslash-h immediately followed by a double quote
#   escape           - backslash followed by a space or the letter h
multichar = re.compile(r'\\u[0-9A-Za-z]{3,6}')
escape_and_quote = re.compile(r'\\h"')
escape = re.compile(r'\\[ h]')

# Table for str.translate that deletes every newline character.
translation_table = str.maketrans('', '', '\n')
# This function takes as input:
# text - a string
# replace_function - a function that will determine a suitable replacement
# regular_expression - the pattern that will identify what is replaced
def repair(text, replace_function, regular_expression):
    """Rewrite every match of *regular_expression* found in *text*.

    *replace_function* is handed to the pattern's ``sub`` method as the
    replacement; the substituted string is returned.
    """
    substitute = regular_expression.sub
    return substitute(replace_function, text)
# A couple of values that control whether detailed error information is
# logged, and after how many failing records a logging run stops.
LOG_ERRORS = False
MAX_ERRORS_LOGGED = 1

# Number of characters shown on each side of a JSON parse-error position.
_ERROR_CONTEXT = 50


def _clean_line(raw_line):
    """Apply the escape repairs and tag stripping to one raw input line."""
    line = repair(raw_line, replace_with_quote, escape_and_quote)
    line = repair(line, replace_with_blank, multichar)
    # Two passes: a blank inserted by the first pass can itself follow a
    # backslash and so form a new backslash-space sequence.
    line = repair(line, replace_with_blank, escape)
    line = repair(line, replace_with_blank, escape)
    return stripTags(line)


def _extract_text(html):
    """Return the text content of *html* collapsed onto a single line."""
    souped = BeautifulSoup(html, 'html.parser')
    stripped = (piece.strip() for piece in souped.get_text().splitlines())
    # Filter on each chunk; the previous code tested the generator object
    # itself, which is always truthy, so nothing was ever filtered out.
    text = '\n'.join(chunk for chunk in stripped if chunk)
    # NOTE(review): deleting the newlines keeps a record on one line but also
    # joins the last word of one line to the first word of the next - confirm
    # that this is intended.
    return text.translate(translation_table)


def _report_error(raw_line, line, error):
    """Print a failing record, the error, and context around a JSON error."""
    # Print the problematic line, before and after cleaning.
    print(raw_line)
    print(line)
    # Print the details.
    print(str(error))
    # JSON decode errors carry the 0-based offset of the failure in `pos`;
    # show up to _ERROR_CONTEXT characters either side of it.  Slicing is
    # used throughout so an offset at the very end of the line cannot raise.
    if isinstance(error, json.JSONDecodeError):
        position = error.pos
        left = max(0, position - _ERROR_CONTEXT)
        right = min(len(line), position + _ERROR_CONTEXT)
        print(line[left:position])
        print(line[position:position + 1])
        print(line[position + 1:right])


def main():
    """Convert 'sample1000.json' into 'web_data.txt' and 'web_categories.txt'.

    Each input line is cleaned, parsed as JSON, and the text of its
    ["p80"]["http"]["get"]["body"] HTML is written next to its "domain".
    Records that fail at any step are counted and skipped.  A summary of
    successes and failures is printed at the end.
    """
    error_count = 0
    worked = 0
    # `with` guarantees all three files are flushed and closed, including
    # when sys.exit() is triggered below.  Outputs use UTF-8 like the input
    # so non-ASCII page text does not depend on the platform's default.
    with open('sample1000.json', encoding='utf-8') as input_file, \
            open('web_data.txt', 'w', encoding='utf-8') as output_file, \
            open('web_categories.txt', 'w',
                 encoding='utf-8') as training_categories:
        for raw_line in input_file:
            line = _clean_line(raw_line)
            try:
                # Load JSON and specify branch
                data_dict = json.loads(line)
                html = data_dict["p80"]["http"]["get"]["body"]
                domain = data_dict["domain"]
                results = _extract_text(html)
                # Build complete records before writing anything so that a
                # failure cannot leave half a record in the output file.
                record = "Domain: " + domain + " : " + results + '\n'
                category = "Domain: " + domain + ' | ' + '\n'
                output_file.write(record)
                training_categories.write(category)
                worked += 1
            except Exception as error_desc:
                # Deliberately broad: any failure marks the record as broken
                # and processing continues with the next line.
                error_count += 1
                if LOG_ERRORS:
                    _report_error(raw_line, line, error_desc)
                    # NOTE(review): assumed to apply only while logging,
                    # since MAX_ERRORS_LOGGED is described as a logging
                    # control - confirm against the original indentation.
                    if error_count > MAX_ERRORS_LOGGED:
                        sys.exit(0)
    print("Worked:", worked, " Still broken:", error_count)


if __name__ == "__main__":
    main()