-
Notifications
You must be signed in to change notification settings - Fork 0
/
trim_sent.py
32 lines (26 loc) · 926 Bytes
/
trim_sent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import codecs
import sys
def trim(lines, long_th, short_th):
    """Keep only the sentences whose word count lies within the thresholds.

    A sentence's length is the number of whitespace-separated tokens
    produced by ``line.split()``.

    Args:
        lines: Iterable of sentence strings.
        long_th: Maximum number of tokens a kept sentence may have
            (inclusive).
        short_th: Minimum number of tokens a kept sentence may have
            (inclusive).

    Returns:
        A new list containing, in their original order, the lines whose
        token count is between ``short_th`` and ``long_th`` inclusive.
    """
    # The chained comparison tokenizes each line once instead of twice.
    return [
        line for line in lines
        if short_th <= len(line.split()) <= long_th
    ]
if __name__ == '__main__':
    # Command line: SHORT_THRESHOLD LONG_THRESHOLD FILENAME
    # Fail with a readable message instead of a bare IndexError.
    if len(sys.argv) < 4:
        sys.exit('usage: {} SHORT_THRESHOLD LONG_THRESHOLD FILENAME'.format(
            sys.argv[0]))

    short_threshold = int(sys.argv[1])
    long_threshold = int(sys.argv[2])
    filename = sys.argv[3]

    # Read inside a context manager so the input handle is always closed.
    with codecs.open(filename, 'r', 'utf-8') as f:
        lines = f.readlines()

    # Token counts are computed once and reused for both statistics.
    lengths = [len(line.split()) for line in lines]
    # An empty input file has no sentences; report 0 rather than letting
    # max()/min() raise ValueError on an empty sequence.
    print('longest sent: {}'.format(max(lengths) if lengths else 0))
    print('shortest sent: {}'.format(min(lengths) if lengths else 0))

    lines = trim(lines, long_threshold, short_threshold)

    # Output goes next to the input as "<filename>.trim[<short>-<long>]".
    out_name = '{}.trim[{}-{}]'.format(
        filename, short_threshold, long_threshold)
    with codecs.open(out_name, 'w', 'utf-8') as f:
        f.writelines(lines)