-
Notifications
You must be signed in to change notification settings - Fork 0
/
format.py
93 lines (77 loc) · 2.84 KB
/
format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
Formats the cases text for easy reading.
(Remove double spacing, repetitive titles and html links)
Save each case as a separate txt file.
"""
import extract
import nltk as nl
import re
class Format:
    """
    Cleans up the extracted text of a case and saves it as a txt file.
    """

    # First words that mark the start of a judge's speech.
    # NOTE might need more options for Lady Hale?
    JUDGE_TITLES = ("LORD", "LADY", "BARONESS")

    # Separator line inserted before every judge's speech.
    NEW_JUDGE_MARK = "\n-------------NEW JUDGE---------------"

    def pretty_case(self, case):
        """
        Cleans the formatting of the pages in a case.

        case: iterable of page strings, paragraphs separated by "\n".
        Returns a list of sentences with the headnote removed
        (see pretty_head).
        """
        clean = []
        for page in case:
            for para in page.split("\n"):  # split on paragraphs
                if not para:  # remove empty paragraphs
                    continue
                para = para.replace(" . . .", "")
                clean.extend(self.pretty_sentence(para))
        return self.pretty_head(clean)

    def pretty_head(self, sentences):
        """
        Removes the headnote at the beginning of a case.

        Everything before the first sentence whose first word is one of
        JUDGE_TITLES is dropped. A NEW_JUDGE_MARK line is inserted before
        every such sentence, and empty sentences are removed.
        Returns the resulting list of sentences.
        """
        clean = []
        started = False  # becomes True once the body proper has begun
        for sent in sentences:
            first_word = sent.split(" ")[0]
            if first_word in self.JUDGE_TITLES:
                started = True
                clean.append(self.NEW_JUDGE_MARK)  # marking of a new judge
                clean.append(sent)
            elif started and sent:  # skip the headnote and empty sentences
                clean.append(sent)
        return clean

    def pretty_sentence(self, paragraph):
        """
        Cleans the formatting of the sentences in a paragraph.

        The paragraph is split with nltk's sentence tokenizer; leading
        whitespace and the "(back to preceding text)" HTML navigation
        link are removed from every sentence.
        Returns the list of cleaned sentences.
        """
        clean = []
        for sent in nl.sent_tokenize(paragraph):
            sent = sent.lstrip()  # removes whitespace from the beginning
            sent = sent.replace("(back to preceding text)", "")
            clean.append(sent)
        return clean

    def save(self, case, name, folder, link):
        """
        Saves the case as ../corpus/<folder>/<name>.txt.

        The link is written as the first line of the file (see write).
        If the file cannot be opened or written, the error is reported
        on stdout instead of being raised.
        """
        fname = "../corpus/" + folder + "/" + name + ".txt"
        try:
            # The context manager closes (and flushes) the file even if
            # writing fails half way through.
            with open(fname, "w") as file:
                self.write(case, file, link)
        except IOError:
            print("Could not find the file: ", fname)

    def write(self, case, file, link):
        """
        Writes the link followed by the case, one line each, to file.

        file: an already opened, writable text file object.
        """
        file.write(link + "\n")
        file.writelines(line + "\n" for line in case)
if __name__ == "__main__":
    # Manual smoke test: downloads two cases and tries to save them.
    fm = Format()
    ex = extract.Extract()

    # Should write a cleaned-up case in the test folder
    link = "https://publications.parliament.uk/pa/ld199697/ldjudgmt/jd961121/smith01.htm"
    case = []
    ex.extract_case(link, case)
    clean = fm.pretty_case(case)
    # save() requires the source link as its fourth argument
    fm.save(clean, "1", "test", link)

    # Should catch error in file name and print it
    link = "https://publications.parliament.uk/pa/ld200809/ldjudgmt/jd090617/assom.htm"
    case = []
    ex.extract_case(link, case)
    clean = fm.pretty_case(case)
    fm.save(clean, "2", "test2", link)