-
Notifications
You must be signed in to change notification settings - Fork 5
/
char_segmentation.py
40 lines (31 loc) · 1.33 KB
/
char_segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
For character segmentation.
Written by Ye Kyaw Thu.
Last updated: 20 Jan 2024
Usage:
python ./char_segmentation.py --input ./MyanmarTotal.my.clean.nosymbol --output ./char_seg/MyanmarTotal.my.clean.nosymbol.char
"""
import sys
import argparse
import re # Importing the regular expressions module
def segment_characters(text):
    """Return *text* with its non-whitespace characters separated by single spaces.

    A space is inserted between every item of ``text`` (every code point when
    ``text`` is a ``str``).  Any run of whitespace in the result -- whitespace
    that was already in the input, including a trailing newline, together with
    the inserted spaces -- is then squeezed to one space, and leading/trailing
    whitespace is removed.

    Args:
        text: The string to segment (typically one line of input).

    Returns:
        The segmented string; empty when ``text`` is empty or all whitespace.
    """
    # Join first, normalise afterwards: the inserted separators merge with any
    # original whitespace, so a single regex pass leaves exactly one space
    # between neighbouring characters.
    return re.sub(r'\s+', ' ', ' '.join(text)).strip()
def main(argv=None):
    """Command-line entry point: segment every input line into characters.

    Lines are read from ``--input`` (UTF-8) or from stdin when no input path
    is given, passed through ``segment_characters``, and written to
    ``--output`` (UTF-8) or to stdout when no output path is given.

    Args:
        argv: Optional list of command-line arguments.  When ``None``,
            argparse falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Character segmentation of text.')
    parser.add_argument('-i', '--input', type=str, default=None, help='Input file path. If not specified, will read from stdin.')
    parser.add_argument('-o', '--output', type=str, default=None, help='Output file path. If not specified, will write to stdout.')
    args = parser.parse_args(argv)

    # The whole input is consumed before the output is opened, so passing the
    # same path for --input and --output does not truncate unread data.
    if args.input:
        with open(args.input, 'r', encoding='utf-8') as infile:
            segmented_lines = [segment_characters(line) for line in infile]
    else:
        segmented_lines = [segment_characters(line) for line in sys.stdin]

    # Terminate every line, including the last one.  Joining with '\n' left
    # the final line without a newline, so the output had one line fewer than
    # the input according to line-counting tools and could not be safely
    # concatenated with other files.
    output_text = ''.join(segmented + '\n' for segmented in segmented_lines)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as outfile:
            outfile.write(output_text)
    else:
        sys.stdout.write(output_text)


if __name__ == '__main__':
    main()