/
suffixes_corpus_to_morfessor_segmentation.py
78 lines (63 loc) · 2.66 KB
/
suffixes_corpus_to_morfessor_segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sys
import os
import argparse
import math
import morfessor
from collections import defaultdict
from parsing import parse_segmentation
def build_arg_parser():
    """Return the command-line parser for this script.

    All three options are required: each one names a file that the script
    opens unconditionally, so omitting one should be reported as a usage
    error rather than surfacing later as ``open(None)``.
    """
    parser = argparse.ArgumentParser(description='Generates a segmenation dictionary to be used to train a Morfessor model')
    parser.add_argument('--suffixes', required=True, help='Suffixes for segmenting unknown words.')
    parser.add_argument('--stopwords', required=True, help='Words analyzed by hunspell')
    parser.add_argument('--vocabulary', required=True, help='Vocabulary of corpus to be segmented')
    return parser


def load_vocabulary(lines):
    """Parse ``<frequency> <word>`` lines into a ``word -> frequency`` dict.

    Fields are separated by ASCII spaces only, so any other character
    (including other kinds of whitespace) stays part of the word.  Leading
    and repeated spaces are tolerated and blank lines are skipped.  When a
    word appears more than once, the last frequency wins.

    Args:
        lines: iterable of text lines (for example an open file).

    Returns:
        dict mapping each word to its integer frequency, in input order.

    Raises:
        ValueError: if a non-blank line has fewer than two fields or its
            first field is not an integer.
    """
    freqs = {}
    for lineno, raw in enumerate(lines, 1):
        fields = [field for field in raw.rstrip("\n").split(" ") if field]
        if not fields:
            continue
        if len(fields) < 2:
            raise ValueError(
                "vocabulary line {}: expected '<frequency> <word>', got {!r}".format(lineno, raw))
        freqs[fields[1]] = int(fields[0])
    return freqs


def load_suffixes(lines):
    """Parse ``<suffix>TAB<space separated segments>`` lines.

    Args:
        lines: iterable of text lines (for example an open file).

    Returns:
        dict mapping each suffix to a list of segment tuples, e.g.
        ``{"ing": [("ing",), ("in", "g")]}``.  Duplicate splits are dropped
        and the remaining ones keep their order of first appearance; a list
        is used instead of a set because the iteration order of a set of
        string tuples can change from one interpreter run to the next,
        which made the generated output irreproducible.

    Raises:
        ValueError: if a non-blank line contains no tab character.
    """
    suffixes = {}
    for lineno, raw in enumerate(lines, 1):
        line = raw.rstrip("\n")
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) < 2:
            raise ValueError(
                "suffix line {}: expected '<suffix><TAB><segments>', got {!r}".format(lineno, raw))
        split = tuple(parts[1].split(" "))
        known_splits = suffixes.setdefault(parts[0], [])
        if split not in known_splits:
            known_splits.append(split)
    return suffixes


def load_stopwords(lines):
    """Return the set of lines with their trailing newline removed."""
    return {line.rstrip("\n") for line in lines}


def candidate_segmentations(word, suffixes):
    """List the candidate segmentations of ``word``.

    Args:
        word: the word to segment.
        suffixes: mapping ``suffix -> iterable of segment tuples``.

    Returns:
        ``[]`` when no suffix matches.  Otherwise a list whose first entry
        is the unsegmented ``[word]``, followed by one ``[stem, seg, ...]``
        entry per split of every matching suffix, in mapping order.
    """
    matching = [suffix for suffix in suffixes if word.endswith(suffix)]
    if not matching:
        return []
    segmentations = [[word]]
    for suffix in matching:
        # Slice with an explicit end index: ``word[:-len(suffix)]`` would
        # return "" instead of the whole word when the suffix is empty.
        stem = word[:len(word) - len(suffix)]
        for split in suffixes[suffix]:
            segmentations.append([stem] + list(split))
    return segmentations


def segment_vocabulary(freqs, suffixes, stopwords, log=None):
    """Segment every vocabulary word that is a candidate for segmentation.

    A word is a candidate when it is not in ``stopwords`` and is longer
    than two characters.

    Args:
        freqs: mapping ``word -> frequency``.
        suffixes: mapping ``suffix -> iterable of segment tuples``.
        stopwords: container of words to leave alone.
        log: optional text stream; when given, one
            ``Candidate segmentations for unk <word>:`` line is written to
            it per candidate word.

    Returns:
        ``(solution, num_unknown, freq_unknown)`` where ``solution`` maps
        each candidate word with at least one matching suffix to its
        segmentations, ``num_unknown`` counts the candidate words and
        ``freq_unknown`` is the sum of their frequencies.
    """
    solution = {}
    num_unknown = 0
    freq_unknown = 0
    for word, freq in freqs.items():
        if word in stopwords or len(word) <= 2:
            continue
        num_unknown += 1
        freq_unknown += freq
        if log is not None:
            print("Candidate segmentations for unk {}:".format(word), file=log)
        segmentations = candidate_segmentations(word, suffixes)
        if segmentations:
            solution[word] = segmentations
    return solution, num_unknown, freq_unknown


def format_solution_line(word, segmentations):
    """Render one output line: the word, a space, then the segmentations.

    Segments are joined with single spaces and alternative segmentations
    with ``", "``.
    """
    return word + " " + ", ".join(" ".join(segments) for segments in segmentations)


def format_stats(label, unamb, amb, unk):
    """Render a statistics line, or return ``None`` when the total is zero.

    Args:
        label: what is being counted (``"vocabulary"`` or ``"frequency"``).
        unamb, amb, unk: counts for the three categories.
    """
    total = unk + amb + unamb
    if total <= 0:
        return None
    return "Stats on {}: unamb: {} ({}) amb: {} ({}) unk: {} ({}) total: {}".format(
        label,
        unamb, unamb * 1.0 / total,
        amb, amb * 1.0 / total,
        unk, unk * 1.0 / total,
        total)


def main(argv=None):
    """Run the script.

    Reads the vocabulary, suffix and stop-word files named on the command
    line, writes one line per segmentable word to stdout (sorted by word)
    and writes progress and statistics to stderr.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_arg_parser().parse_args(argv)

    # Load vocabulary: we need frequencies for statistics
    with open(args.vocabulary) as freqs_f:
        freqs = load_vocabulary(freqs_f)
    # suffix -> [ suf, fix]
    with open(args.suffixes) as suf_f:
        suffixes = load_suffixes(suf_f)
    with open(args.stopwords) as stop_f:
        stopwords = load_stopwords(stop_f)

    solution, num_unknown, freq_unknown = segment_vocabulary(
        freqs, suffixes, stopwords, log=sys.stderr)

    # Print solution
    for word in sorted(solution):
        print(format_solution_line(word, solution[word]))

    # Print stats.  Nothing in this script classifies a word as "known", so
    # the unambiguous/ambiguous figures are always reported as zero.
    for label, unknown in (("vocabulary", num_unknown), ("frequency", freq_unknown)):
        stats_line = format_stats(label, 0, 0, unknown)
        if stats_line is not None:
            print(stats_line, file=sys.stderr)


if __name__ == "__main__":
    main()