-
Notifications
You must be signed in to change notification settings - Fork 0
/
patterns.py
133 lines (98 loc) · 4.17 KB
/
patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import re
from string import upper, lower
def titleCase(s):
    """Return ``s`` with every word capitalised.

    A word is a run of ASCII letters, optionally followed by an apostrophe
    and more ASCII letters (``don't`` counts as one word, so the letter
    after the apostrophe stays lower case). Each word gets an upper-case
    first letter and a lower-case remainder; every other character is
    copied through unchanged.
    """
    word_pattern = r"[A-Za-z]+('[A-Za-z]+)?"

    def _capitalise(mo):
        # The match always starts with an ASCII letter, so capitalize()
        # is the same as upper-casing [0] and lower-casing [1:].
        return mo.group(0).capitalize()

    return re.sub(word_pattern, _capitalise, s)
def removeSpaceBeforePunctuation(match, para):
    """ Match : Space before punctuation
        Fix   : Remove space before punctuation

    Returns group 2 of the match on its own; in the 'Space Before
    Punctuation' entry of ``patterns`` that group is the punctuation mark
    and group 1 holds the spaces. ``para`` is not used.
    """
    punctuation = match.group(2)
    return punctuation
def addSpaceAfterPunctuation(match, para):
    """ Match : Letter right after punctuation
        Fix   : Add a space after punctuation

    The match is expected to come from a two-branch pattern such as the
    'No Space After Punctuation' entry of ``patterns``: when the first
    branch matched, group 1 is set and the punctuation mark is in group 2;
    otherwise the mark is in group 3. Returns the mark followed by a single
    space. ``para`` is not used.
    """
    # A leftover debug `print match.groups()` used to sit here; it wrote to
    # stdout on every fix and is Python 2-only syntax, so it is removed.
    if match.group(1) is None:
        mark = match.group(3)
    else:
        mark = match.group(2)
    return mark + " "
def capitalizeFirst(match, para):
    """ Match : Lower-case letter where a sentence starts
        Fix   : Upper-case it

    Returns the whole matched text (group 0) in upper case. In ``patterns``
    this is registered for a single lower-case letter, so only that letter
    is affected. ``para`` is not used.
    """
    matched_text = match.group(0)
    return matched_text.upper()
def removeExtraSpaces(match, para):
    """ Match : Multiple spaces
        Fix   : Replace with single space

    Always returns one space; neither ``match`` nor ``para`` is inspected.
    """
    single_space = " "
    return single_space
def addTildeBeforeCite(match, para):
    r""" Match : \cite / \ref without a tilde before it (a space or a letter
                instead).
        Fix   : Remove any spaces and replace with tilde.

    Returns "~" followed by group 2 of the match; in the 'Tilde Mark Needed
    Before Cite / Ref' entry of ``patterns`` group 2 is the command and
    group 1 (dropped here) is the whitespace before it. ``para`` is not
    used.
    """
    command = match.group(2)
    return "~" + command
def titleCaseFirstWord(match, para):
    """ Match : Reference word (group 1) followed by more text (group 2),
                e.g. 'section' in a section reference.
        Fix   : Capitalize the first letter of the word, lower-case the
                rest of it, and keep group 2 unchanged.

    ``para`` is not used.
    """
    word, remainder = match.group(1, 2)
    head = word[0].upper()
    tail = word[1:].lower()
    return head + tail + remainder
def convertToTitleCase(match, para):
    """ Match : Non-title case chapter / section heading
        Fix   : Title cased

    Passes the whole matched text (group 0) through ``titleCase`` and
    returns the result. ``para`` is not used.
    """
    heading = match.group(0)
    return titleCase(heading)
def convertToSentenceCase(match, para):
    """ Match : Heading that is not in sentence case
        Fix   : Upper-case the first character of the matched text and
                lower-case everything after it

    Works on the whole match (group 0). ``para`` is not used.
    """
    text = match.group(0)
    first_char = text[0]
    remainder = text[1:]
    return first_char.upper() + remainder.lower()
def removeRepeatedPhrase(match, para):
    """ Match : Repeated phrase.
        Fix   : Remove repeated phrase

    Returns group 2 followed by group 3 of the match. In the 'Repeated
    Phrase' entry of ``patterns`` these are the first occurrence of the
    phrase and the separator after it, so the second occurrence is dropped
    while the separator is kept. ``para`` is not used.
    """
    phrase, separator = match.group(2, 3)
    return phrase + separator
# Table of style checks. Each entry is a dict with four keys:
#   "regex"       - pattern (raw string) that detects the problem
#   "description" - short human-readable name of the check
#   "tags"        - string of single-letter tags
#                   (NOTE(review): the meaning of the letters is not visible
#                   in this file - confirm against the caller)
#   "function"    - one of the fix functions defined above; all of them take
#                   (match, para) and return a string, presumably the
#                   replacement for the matched text - confirm against the
#                   caller
# The fix functions rely on the group numbering of their regex, so groups
# must not be added or reordered without updating the matching function.
patterns = (
    # NOTE(review): the commented-out line below looks like an older entry
    # format; convertFirstLetterToCapital is not defined in the visible part
    # of this file.
    # r'\\(sub)+section':["ONLY FIRST WORD CAPITALIZED IN SUBSECTIONS", 'c', convertFirstLetterToCapital],

    # Heading text inside \subsection{..}, \subsubsection{..}, \paragraph{..}
    # or \subparagraph{..} that either does not start with an upper-case
    # ASCII letter or starts with one and contains another one later.
    {"regex": r'((?<=(\\subsection\{))|(?<=(\\subsubsection\{))|(?<=(\\paragraph\{))|(?<=(\\subparagraph\{)))(([^A-Z](.*?))|([A-Z](.*?)[A-Z](.*?)))(?=\})',
    "description": 'Sentence Case For Subsections And Below',
    "tags": 'c',
    "function": convertToSentenceCase},

    # Heading text inside \section{..} or \chapter{..} in which a lower-case
    # letter appears at the very start or right after a space.
    {"regex": r'((?<=(\\section\{))|(?<=(\\chapter\{)))((|(.*) )[a-z].*)(?=\})',
    "description": 'Title Case For Sections And Chapters',
    "tags": 'c',
    "function": convertToTitleCase},

    # Group 1: one or more spaces; group 2: one of . , ; :
    {"regex": r'( +)([\.,;:])',
    "description": 'Space Before Punctuation',
    "tags": 'acehmrfp',
    "function": removeSpaceBeforePunctuation},

    # First branch (groups 1 and 2): a period not followed by whitespace, a
    # digit or a closing ] } ).  Second branch (group 3): one of , ; : ? ] ) }
    # followed by an ASCII letter or digit.
    {"regex": r'((\.)(?![\s\d\]\}\)]))|([,;:\?\]\)\}])(?=[a-zA-Z0-9])',
    "description": 'No Space After Punctuation',
    "tags": 'acehmrfp',
    "function": addSpaceAfterPunctuation},

    # A lower-case ASCII letter preceded by a period plus one whitespace
    # character, by two newlines, or located at the start of the text.
    {"regex": r'((?<=(\.\s))|(?<=(\n\n))|(?<=\A))[a-z]',
    "description": 'Missing Capitalization Of First Word After Full Stop',
    "tags": 'acehmpb',
    "function": capitalizeFirst},

    # Group 1: optional whitespace; group 2: \cite or \ref whose immediately
    # preceding character is not "~".
    {"regex": r'(\s*)(?<!~)((\\cite)|(\\ref))',
    "description": 'Tilde Mark Needed Before Cite / Ref',
    "tags": 'ac',
    "function": addTildeBeforeCite},

    # Group 1: the lower-case word "chapter"; group 2: the "~\ref" after it.
    {"regex": r'(chapter)(~\\ref)',
    "description": 'Capitalize C In Chapter',
    "tags": 'c',
    "function": titleCaseFirstWord},

    # Group 1: the lower-case word "section"; group 2: the "~\ref" after it.
    {"regex": r'(section)(~\\ref)',
    "description": 'Capitalize S In Section',
    "tags": 'c',
    "function": titleCaseFirstWord},

    # Case-insensitive. Group 2: a phrase of ASCII letters and spaces that
    # starts after whitespace or at "^"; group 3: one or more non-word
    # characters; then the same phrase again (\2), followed by a space,
    # newline, one of . , ; or "$".
    {"regex": r'(?i)((?<=\s)|(?<=^))([A-Za-z][A-Za-z ]*)([^\w\d]+)\2((?=([ \n\.,;]))|(?=$))',
    "description": 'Repeated Phrase',
    "tags": 'ce',
    "function": removeRepeatedPhrase},
)
def capitalizeFirstLetter(word):
    """Return ``word`` with its first character upper-cased.

    The rest of the word is left exactly as it is. An empty string is
    returned unchanged.
    """
    # Use the str method rather than the ``string.upper`` function, which
    # only exists in Python 2. Slicing with [:1] instead of indexing [0]
    # keeps the empty string from raising IndexError.
    return word[:1].upper() + word[1:]
def uncapitalizeFirstLetter(word):
    """Return ``word`` with its first character lower-cased.

    The rest of the word is left exactly as it is. An empty string is
    returned unchanged.
    """
    # Use the str method rather than the ``string.lower`` function, which
    # only exists in Python 2. Slicing with [:1] instead of indexing [0]
    # keeps the empty string from raising IndexError.
    return word[:1].lower() + word[1:]
def notFullyCapital(word):
    """ Return True unless the word is fully capital.

    A fully capital word (``word.isupper()`` is true) is likely to be some
    sort of abbreviation or acronym; for such a word the result is False.
    """
    if word.isupper():
        return False
    return True