In [1]:
import pandas as pd
import numpy as np
from tokenizers.trainers import WordPieceTrainer, BpeTrainer, WordLevelTrainer
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import WordPiece, BPE, WordLevel
from tokenizers import normalizers
from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit, Split
from tokenizers.normalizers import NFD, Lowercase
from tokenizers.processors import BertProcessing, RobertaProcessing
from transformers import PreTrainedTokenizerFast

In [2]:
import os

# Location of the project root relative to the directory the kernel starts in.
PATH_TO_FOLDER = "../../../"

# Remember the kernel's starting directory the first time this cell runs, and
# always resolve the project root against it. A bare relative `cd` would climb
# three more levels every time the cell is re-executed.
if "_KERNEL_START_DIR" not in globals():
    _KERNEL_START_DIR = os.getcwd()

os.chdir(os.path.abspath(os.path.join(_KERNEL_START_DIR, PATH_TO_FOLDER)))
# Show the resulting working directory, as `%cd` did.
print(os.getcwd())

/home/souvic/mounted/btp/vahini/Name2Demographics


In [3]:
import sys  

# Make the project's preprocessing helpers importable. The path is relative, so
# it only resolves after the working directory has been changed by the cell above.
sys.path.insert(0, 'Models/ERData/PreProcessing/')
# Star import: every public name of er_preprocess lands in the notebook namespace.
# ERData_with_dup (used in the next cell) presumably comes from here.
# NOTE(review): consider importing the needed names explicitly.
from er_preprocess import *

In [4]:
# Load the dataset through the project helper ERData_with_dup().
# Presumably returns a pandas DataFrame with duplicate rows kept (it is indexed
# by the 'Name' column below) — TODO confirm against er_preprocess.
er = ERData_with_dup()

In [5]:
# Raw values of the 'Name' column; this array is the training corpus for both
# tokenizers trained further down.
names = er['Name'].values

In [6]:
# Corpus size: number of entries in `names`.
len(names)

22217062

In [7]:
# Train a BERT-style WordPiece tokenizer on the names corpus.
# Pipeline: NFD unicode normalisation -> split on whitespace only -> WordPiece model.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([NFD()])
tokenizer.pre_tokenizer = WhitespaceSplit()

trainer = WordPieceTrainer(
    vocab_size=10000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(names, trainer=trainer)

# Wrap every encoded sequence as "[CLS] ... [SEP]", using the ids the trained
# vocabulary assigned to those special tokens.
trained_vocab = tokenizer.get_vocab()
tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", trained_vocab["[SEP]"]),
    cls=("[CLS]", trained_vocab["[CLS]"]),
)






In [9]:
from transformers import BertTokenizerFast

# Expose the trained WordPiece tokenizer through the transformers BERT fast
# tokenizer interface so it can be saved and reloaded like a pretrained one.
# NOTE(review): BertTokenizerFast's `do_lower_case` defaults to True while the
# normalizer trained above is NFD-only — confirm the saved config reflects the
# intended casing behaviour.
new_tokenizer = BertTokenizerFast(
    tokenizer_object=tokenizer,
)

In [11]:
# Persist the WordPiece tokenizer; the call returns the paths of the files written.
wordpiece_save_dir = 'Models/ERData/Tokenizer/' + "trained_tokenizer/indiannames-tokenizer"
new_tokenizer.save_pretrained(wordpiece_save_dir)

('Models/ERData/Tokenizer/trained_tokenizer/indiannames-tokenizer/tokenizer_config.json',
 'Models/ERData/Tokenizer/trained_tokenizer/indiannames-tokenizer/special_tokens_map.json',
 'Models/ERData/Tokenizer/trained_tokenizer/indiannames-tokenizer/vocab.txt',
 'Models/ERData/Tokenizer/trained_tokenizer/indiannames-tokenizer/added_tokens.json',
 'Models/ERData/Tokenizer/trained_tokenizer/indiannames-tokenizer/tokenizer.json')

In [12]:
# Sanity check: encode a sample name with the WordPiece tokenizer and inspect
# the resulting token strings (including the [CLS]/[SEP] wrapper).
output = tokenizer.encode(
    "sk shelley lamkang sankhil"
)
output.tokens

['[CLS]', 'sk', 'shel', '##ley', 'lamkang', 'sankh', '##il', '[SEP]']

In [13]:
# Small-vocabulary BPE tokenizer. Not strictly character level: BPE still learns
# merges until the vocabulary reaches 500 entries, so frequent character
# sequences become single tokens.
# Note: this rebinds `tokenizer` and `trainer`, replacing the WordPiece objects
# created earlier in the notebook.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
tokenizer.normalizer = normalizers.Sequence([NFD()])
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=500)
tokenizer.train_from_iterator(names, trainer=trainer)

# Wrap every encoded sequence as "[CLS] ... [SEP]" using the trained ids.
tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", tokenizer.token_to_id("[SEP]")),
    cls=("[CLS]", tokenizer.token_to_id("[CLS]")),
)

# The generic fast wrapper does not infer special tokens from the backend
# tokenizer. Declare them explicitly so padding/masking work and so they are
# written to special_tokens_map.json on save.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)






In [14]:
# Preview how the BPE tokenizer splits a mixed-case sample. Printed explicitly:
# a notebook only auto-displays the last expression of a cell, so this result
# would otherwise be computed and silently dropped.
print(fast_tokenizer.convert_ids_to_tokens(fast_tokenizer("prasanna PARASURAMA")['input_ids']))

# Persist the BPE tokenizer; the call returns the paths of the files written.
fast_tokenizer.save_pretrained('Models/ERData/Tokenizer/'+"trained_tokenizer/indiannames-char-tokenizer")

('Models/ERData/Tokenizer/trained_tokenizer/indiannames-char-tokenizer/tokenizer_config.json',
 'Models/ERData/Tokenizer/trained_tokenizer/indiannames-char-tokenizer/special_tokens_map.json',
 'Models/ERData/Tokenizer/trained_tokenizer/indiannames-char-tokenizer/tokenizer.json')

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
)

# Off-the-shelf tokenizers used as baselines against the ones trained above.
# Both calls fetch the tokenizer files for the given model id.
muriltokenizer = AutoTokenizer.from_pretrained(
    "google/muril-base-cased"
)
berttokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased"
)


In [None]:
# Baseline: how the pretrained MuRIL tokenizer splits the sample name.
muriltokenizer.tokenize("sk shelley lamkang sankhil")

In [None]:
# Baseline: how the pretrained bert-base-uncased tokenizer splits the sample name.
berttokenizer.tokenize("sk shelley lamkang sankhil")

In [None]:
# Same sample through the locally trained tokenizer, for comparison with the
# two baselines above.
# NOTE(review): when the notebook runs top to bottom, `tokenizer` here is the
# BPE tokenizer from the later training cell, not the WordPiece one — confirm
# which of the two was meant to be compared.
tokenizer.encode("sk shelley lamkang sankhil").tokens