-
Notifications
You must be signed in to change notification settings - Fork 5
/
ngram_segmentation.py
64 lines (52 loc) · 2.17 KB
/
ngram_segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
Simulate possible word segmentations: for each input line, enumerate the ways of merging adjacent words into segments of at most N words.
Written by Ye Kyaw Thu, LU Lab., Myanmar
Last updated: 28 Dec 2023
How to use:
$ python ./ngram_segmentation.py --help
python ./ngram_segmentation.py --input ./corpus.txt --ngram 3
python ./ngram_segmentation.py --input ./corpus.txt --ngram 3 --output out
python ./ngram_segmentation.py --input ./corpus.txt --ngram 3 --output out --max 20
"""
import argparse
import itertools
import random
def segment_line(line, max_segment_length, max_sentences=None):
    """Enumerate the ways of regrouping the words of ``line`` into segments.

    The line is split on whitespace. Each candidate is built by cutting the
    word sequence into consecutive runs of 1 to ``max_segment_length`` words;
    the words of a run are concatenated with no separator, and the runs are
    joined with single spaces.

    Args:
        line: Text whose whitespace-separated words are regrouped.
        max_segment_length: Largest number of words merged into one segment.
            A value below 1 leaves no valid cut, so a non-empty line yields
            an empty list.
        max_sentences: Optional cap on the number of candidates returned.
            A falsy value (``None`` or ``0``) means "no cap".

    Returns:
        A list of candidate strings. When the cap applies, a random sample
        (without replacement) of ``max_sentences`` candidates; otherwise all
        candidates, ordered with shorter leading segments first.

    Note:
        Every candidate is built in memory before sampling, and the number
        of candidates grows quickly with the number of words.
    """
    tokens = line.split()
    total = len(tokens)

    def splits_from(pos):
        """Return every segmentation (as a list of segments) of tokens[pos:]."""
        if pos == total:
            # Exactly one way to segment nothing: the empty segmentation.
            return [[]]
        found = []
        furthest_cut = min(pos + max_segment_length, total)
        for cut in range(pos + 1, furthest_cut + 1):
            head = "".join(tokens[pos:cut])
            for tail in splits_from(cut):
                found.append([head] + tail)
        return found

    candidates = [" ".join(parts) for parts in splits_from(0)]

    # No cap requested, or already within the cap: hand back everything.
    if not max_sentences or len(candidates) <= max_sentences:
        return candidates
    return random.sample(candidates, max_sentences)
def main():
    """Command-line entry point.

    Reads the ``--input`` file as UTF-8, skips lines that are blank after
    stripping, expands every remaining line with ``segment_line`` (using
    ``--ngram`` as the maximum segment length and ``--max`` as the optional
    per-line cap), and writes one candidate per line to ``--output`` if it
    was given, or to standard output otherwise.
    """
    cli = argparse.ArgumentParser(description='N-gram Segmentation Script')
    cli.add_argument('--input', type=str, required=True, help='Input file path')
    cli.add_argument('--ngram', type=int, required=True, help='N-gram size')
    cli.add_argument('--output', type=str, help='Output file path')
    cli.add_argument('-m', '--max', type=int, help='Maximum number of sentences to generate per input line')
    options = cli.parse_args()

    with open(options.input, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    # Expand every non-blank line; all results are collected before output.
    results = []
    for raw in raw_lines:
        text = raw.strip()
        if not text:
            continue
        results.extend(segment_line(text, options.ngram, options.max))

    if not options.output:
        for entry in results:
            print(entry)
        return

    with open(options.output, 'w', encoding='utf-8') as sink:
        for entry in results:
            sink.write(entry + '\n')
# Run the command-line interface only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == '__main__':
    main()