-
Notifications
You must be signed in to change notification settings - Fork 5
/
syl2onehot-sklearn.py
89 lines (75 loc) · 3.65 KB
/
syl2onehot-sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import sys
import pandas as pd
import re
import numpy as np
from numpy import array
import itertools
from numpy import argmax
# Written by Ye Kyaw Thu, Affiliate Professor, IDRI, CADT, Cambodia
# example code for syllable to one-hot encoding (for using as an embedding tool)
# Last updated: 28 Sept 2022
# How to run:
# $ python ./syl2onehot-sklearn.py ./eg-corpus.txt
# References
# https://github.com/gearmonkey/tfidf-python/blob/master/tfidf.py
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# https://stackoverflow.com/questions/69218610/onehotencoder-expected-2d-array-got-1d-array-instead
# https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
# https://github.com/yashu-seth/dummyPy
# syllable breaking လုပ်ပေးမယ့် function ပါ
def sylbreak_my(line):
myConsonant = "က-အ"
enChar = "a-zA-Z0-9"
otherChar = "ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။!-/:-@[-`{-~\s"
ssSymbol = '္'
aThat = '်'
#Regular expression pattern for Myanmar syllable breaking
#*** a consonant not after a subscript symbol AND a consonant is not followed by a-That character or a subscript symbol
BreakPattern = re.compile(r"((?<!" + ssSymbol + r")["+ myConsonant + r"](?![" + aThat + ssSymbol + r"])" + r"|[" + enChar + otherChar + r"])")
line = line.replace(" ",'')
line = BreakPattern.sub(" " + r"\1", line)
line = line.strip()
#print(line.split())
return line.split()
# argument 1 အဖြစ် ပေးလိုက်တဲ့ ဖိုင်ကို ဖွင့်ပြီး စာကြောင်း တစ်ကြောင်းချင်းစီကို
# corpus ဆိုတဲ့ list ထဲမှာ သိမ်း
with open(sys.argv[1]) as f:
corpus = []
for line in f:
corpus.append(line.rstrip())
#print(corpus)
#print(type(corpus))
# ဒီ function က corpus list ထဲမှာ ရှိတဲ့ list တစ်ခုချင်းစီကို looping ပတ်ပြီးတော့
# sylbreak_my function ကို ပို့ပေးပြီးတော့ syllable unit ဖြတ်ခိုင်းတဲ့ function ပါ
def list_sylbreak(pass_corpus):
corpus_syl = []
for line in pass_corpus:
corpus_syl.append(sylbreak_my(line))
return corpus_syl
corpus_syl = list_sylbreak(corpus)
#Extract values from syllable breaked corpus list
#Flatten the list by using itertools
#(FYI: faster than "list comprehension" and "iterate over the list of lists")
syllables = list(itertools.chain(*corpus_syl))
#Label encoding or encoding for all syllables
label_encoder = LabelEncoder()
syllables = array(syllables)
integer_encoded = label_encoder.fit_transform(syllables)
#One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse='False')
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
for input_sentence in corpus_syl:
# input စာကြောင်းကိုပါ တွဲပြီး ရိုက်ထုတ်ပြမှ လူအတွက်က ကြည့်ရအဆင်ပြေလို့
#print("input sentence: ", input_sentence)
nparray_string = np.empty(onehot_encoded[0, :].shape, dtype=int)
#print("nparray_string info: ", nparray_string.shape)
for w in input_sentence:
# looping numpy array
for x in onehot_encoded:
inverted = label_encoder.inverse_transform([argmax(x)])
if (inverted == w ):
nparray_string = np.vstack((nparray_string, x))
break
print(nparray_string[1:])