-
Notifications
You must be signed in to change notification settings - Fork 5
/
syl2freq.py
50 lines (39 loc) · 1.88 KB
/
syl2freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import sys
import re
# Written by Ye Kyaw Thu, Affiliate Professor, IDRI, CADT, Cambodia
# Calculating syllable frequency
# Last updated: 25 Sept 2022
# How to run: python ./syl2freq.py ./eg-corpus.txt
#
#
# sklearn library က ကိုယ့်စက်ထဲမှာ မရှိသေးရင် အောက်ပါ command နဲ့ install လုပ်ပါ
# pip install scikit-learn
# Reference: https://investigate.ai/text-analysis/how-to-make-scikit-learn-natural-language-processing-work-with-japanese-chinese/
# https://stackoverflow.com/questions/28328372/why-isnt-the-token-pattern-parameter-in-tfidfvectorizer-working-with-scikit-lea
# https://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/
# Regex character-class bodies used to build the syllable-break pattern.
_MY_CONSONANT = "က-အ"  # Myanmar consonant range
_EN_CHAR = "a-zA-Z0-9"
# Characters that always start a new token: further Myanmar letters, Myanmar
# digits and punctuation, ASCII punctuation ranges, and whitespace.
# Raw string so that ``\s`` reaches the regex engine untouched instead of
# being an invalid string escape (a SyntaxWarning on Python 3.12+).
_OTHER_CHAR = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။!-/:-@[-`{-~\s"
# The two combining marks are written as escapes because they are nearly
# invisible when typed as bare literals.
_SS_SYMBOL = "\u1039"  # subscript (stacking) symbol
_A_THAT = "\u103a"  # a-That character

# Regular expression pattern for Myanmar syllable breaking.
# A break is inserted before:
#   * a consonant that is not preceded by a subscript symbol AND is not
#     followed by an a-That character or a subscript symbol, or
#   * any single English/digit/"other" character.
# Compiled once at import time; the tokenizer is called once per document.
_BREAK_PATTERN = re.compile(
    rf"((?<!{_SS_SYMBOL})[{_MY_CONSONANT}](?![{_A_THAT}{_SS_SYMBOL}])"
    rf"|[{_EN_CHAR}{_OTHER_CHAR}])"
)


def sylbreak_my(line):
    """Split one line of text into a list of syllable tokens.

    All ASCII spaces are removed first, then a space is inserted in front of
    every position matched by the break pattern, and the result is split on
    whitespace. Whitespace characters other than the ASCII space (for example
    tabs) act as separators and never appear in the output.

    Args:
        line: The input text (a ``str``).

    Returns:
        A list of token strings; empty if ``line`` contains no tokens.
    """
    # Existing spaces are dropped so that segmentation depends only on the
    # break pattern, not on how the input happened to be spaced.
    line = line.replace(" ", "")
    line = _BREAK_PATTERN.sub(r" \1", line)
    # split() with no arguments already ignores leading/trailing whitespace.
    return line.split()
# Read the corpus named by the first command-line argument; every line of the
# file is treated as one document.
# The encoding is given explicitly so the text is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(sys.argv[1], encoding="utf-8") as f:
    corpus = f.read().splitlines()

# Dummy stop word example, you have to replace it
my_stop_words = ['၊', '။', '၏', '၍', '၌']

# token_pattern=None: the custom tokenizer replaces the default token
# pattern, and passing None avoids scikit-learn's "token_pattern will not be
# used" warning.
vectorizer = CountVectorizer(
    tokenizer=sylbreak_my,
    stop_words=my_stop_words,
    token_pattern=None,
)
matrix = vectorizer.fit_transform(corpus)

# One row per input line, one column per distinct token, cells are counts.
syllable_df = pd.DataFrame(
    matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(syllable_df)