Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support for multilingual tokenizer with hooks for malayalam, adding range support #27

Open
wants to merge 38 commits into
base: multilingual
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
adfff78
using multilingual bert
dsplog Nov 30, 2023
5ce542e
adding support for malayalam
dsplog Nov 30, 2023
6da16b6
with multilingual tokenizer, adding subword support, adding malayalam…
dsplog Nov 30, 2023
9983db5
adding support for range i.e (10-20) cases, malayalam
dsplog Nov 30, 2023
2633144
adding support for range i.e (10-20) cases, malayalam
dsplog Nov 30, 2023
be3473d
support for malayalam pre-processing
dsplog Dec 2, 2023
8fee41a
keeping default unchanged
dsplog Dec 2, 2023
884a031
keeping default unchanged
dsplog Dec 2, 2023
6ecf98b
keeping default unchanged
dsplog Dec 2, 2023
cf62fcc
keeping default unchanged
dsplog Dec 2, 2023
235bf79
keeping default unchanged
dsplog Dec 2, 2023
f6f75c8
keeping default unchanged
dsplog Dec 2, 2023
5558f25
keeping default unchanged
dsplog Dec 2, 2023
3a94f25
adding multilingual configurations
dsplog Dec 2, 2023
b7145b1
correcting readme
dsplog Dec 2, 2023
1d79ef2
baselines
dsplog Dec 2, 2023
5bc4009
baselines
dsplog Dec 2, 2023
13b5f03
adding missed global_phonemizer
dsplog Dec 2, 2023
e0a2960
adding support for decimal numbers in malayalam
dsplog Dec 3, 2023
afa71f6
adding support for malayalam
dsplog Dec 8, 2023
3556dd5
fix for range and date
dsplog Dec 9, 2023
e257169
removing multilingual things
dsplog Dec 9, 2023
0002a50
fix
dsplog Dec 9, 2023
d3d6ccf
fix
dsplog Dec 9, 2023
7558963
fix
dsplog Dec 9, 2023
139bbbd
fix
dsplog Dec 9, 2023
4cb495f
fix
dsplog Dec 9, 2023
70f3dc4
fix
dsplog Dec 10, 2023
e74e681
fix
dsplog Dec 10, 2023
b95ed66
fix
dsplog Dec 10, 2023
fa93ea4
fix
dsplog Dec 10, 2023
5182944
fix
dsplog Dec 10, 2023
29a5ff1
fix
dsplog Dec 10, 2023
d76fb5d
fix
dsplog Dec 10, 2023
a7325b3
fix
dsplog Dec 10, 2023
592293a
Merge pull request #33 from dsplog/range_date_fix
yl4579 Dec 12, 2023
85bd638
Update train.ipynb
yl4579 Jan 8, 2024
4198635
Merge pull request #42 from yl4579/yl4579-patch-1
yl4579 Jan 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions converters/Range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

from singleton_decorator import singleton
import re
from .Cardinal import Cardinal

@singleton
class Range:
    """
    Verbalize a hyphen-separated token such as ``10-20``.

    Steps:
    - Split the token on ``-``
    - Convert every piece with the Cardinal converter
    - One piece: return its conversion as-is
    - Two pieces: join the two conversions with " to "
    - Three or more pieces: join the conversions with single spaces

    Note:
    The hyphens themselves are dropped from the output. How each piece is
    spoken (including an empty piece produced by a leading or trailing
    hyphen) is decided entirely by ``Cardinal.convert``.
    """
    def __init__(self):
        super().__init__()
        # Every piece of the range is verbalized by the Cardinal converter.
        self.cardinal = Cardinal()

    def convert(self, token: str) -> str:
        """Return the spoken form of *token*.

        Args:
            token: Text to verbalize; pieces are separated by ``-``.

        Returns:
            The Cardinal conversion of each piece, joined by " to " when
            there are exactly two pieces and by a single space otherwise.
        """
        # A literal hyphen needs no regex; str.split keeps empty pieces just
        # like re.split('-', token) does.
        spoken = [self.cardinal.convert(piece) for piece in token.split('-')]

        if len(spoken) == 1:
            return spoken[0]
        if len(spoken) == 2:
            return ' to '.join(spoken)
        # Joining avoids the trailing space that appending ' ' after every
        # piece used to leave at the end of the result.
        return ' '.join(spoken)
24 changes: 20 additions & 4 deletions text_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import unicodedata

import os, sys
import re

from converters.Plain import Plain
from converters.Punct import Punct
Expand All @@ -23,6 +24,8 @@
from converters.Telephone import Telephone
from converters.Address import Address
from converters.Roman import Roman
from converters.Range import Range


months = ['jan',
'feb',
Expand Down Expand Up @@ -64,7 +67,8 @@
"FRACTION": Fraction(),
"TELEPHONE": Telephone(),
"ADDRESS": Address(),
"ROMAN": Roman()
"ROMAN": Roman(),
"RANGE": Range()
}

def split_given_size(a, size):
Expand Down Expand Up @@ -108,6 +112,9 @@ def is_fraction(inputString):
def is_decimal(inputString):
    """Report whether *inputString* contains a ``.`` character."""
    has_dot = "." in inputString
    return has_dot

def is_range(inputString):
    """Report whether *inputString* contains a ``-`` character."""
    has_hyphen = "-" in inputString
    return has_hyphen

def is_url(inputString):
    """Report whether *inputString* contains ``//``, ``.com`` or ``.html``."""
    url_markers = ("//", ".com", ".html")
    return any(marker in inputString for marker in url_markers)

Expand All @@ -119,10 +126,10 @@ def normalize_single(text, prev_text = "", next_text = ""):
text = labels['ELECTRONIC'].convert(text).upper()
elif has_numbers(text):
if has_month(prev_text):
prev_text = prev_text.lower()
prev_text = labels['DATE'].get_month(prev_text.lower())
text = labels['DATE'].convert(prev_text + " " + text).replace(prev_text, "").strip()
elif has_month(next_text):
next_text = next_text.lower()
next_text = labels['DATE'].get_month(next_text.lower())
text = labels['DATE'].convert(text + " " + next_text).replace(next_text, "").strip()
elif is_oridinal(text):
text = labels['ORDINAL'].convert(text)
Expand All @@ -136,6 +143,8 @@ def normalize_single(text, prev_text = "", next_text = ""):
text = labels['DECIMAL'].convert(text)
elif is_cardinal(text):
text = labels['CARDINAL'].convert(text)
elif is_range(text):
text = labels['RANGE'].convert(text)
else:
text = labels['DATE'].convert(text)

Expand All @@ -149,6 +158,7 @@ def normalize_single(text, prev_text = "", next_text = ""):
def normalize_text(text):
text = remove_accents(text).replace('–', ' to ').replace('-', ' - ').replace(":p", ": p").replace(":P", ": P").replace(":d", ": d").replace(":D", ": D")
words = word_tokenize(text)

df = pd.DataFrame(words, columns=['before'])

df['after'] = df['before']
Expand All @@ -157,4 +167,10 @@ def normalize_text(text):

df['after'] = df['previous'].apply(lambda m: normalize_single(m.split('|')[1], m.split('|')[0], m.split('|')[2]))

return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s")
return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s")

if __name__ == '__main__':
    # Manual smoke test: normalize one sample sentence that mixes a date and
    # a clock time, then print the result.
    sample = 'hello (23 Jan 2020, 12:10 AM)'
    print(normalize_text(sample))

2 changes: 1 addition & 1 deletion train.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
" \n",
" bert = AlbertModel(albert_base_configuration)\n",
" bert = MultiTaskModel(bert, \n",
" num_vocab=max([m['token'] for m in token_maps.values()]), \n",
" num_vocab=1 + max([m['token'] for m in token_maps.values()]), \n",
" num_tokens=config['model_params']['vocab_size'],\n",
" hidden_size=config['model_params']['hidden_size'])\n",
" \n",
Expand Down