In [1]:
from transformers import AutoTokenizer
from collections import defaultdict
import copy
import pandas as pd
from math import log

class UnigramTokenizer:
    """Toy unigram-language-model tokenizer trained on a text corpus.

    On construction the corpus file is read, every word produced by the
    pre-tokenizer of the pretrained ``model_name`` tokenizer is counted, and
    a model mapping each character and each longer substring of those words
    to its negative log probability is built.

    Attributes are plain dictionaries:
      * ``word_freqs``     -- word -> number of occurrences in the corpus
      * ``char_freqs``     -- single character -> weighted occurrence count
      * ``subwords_freqs`` -- substring (length >= 2) -> weighted count
      * ``model``          -- token -> ``-log(freq / total_freq)``
    """

    def __init__(self, model_name, corpus_file_path):
        """Load the pretrained tokenizer, read the corpus and build the model.

        Args:
            model_name: Name passed to ``AutoTokenizer.from_pretrained``; only
                its pre-tokenizer is used here.
            corpus_file_path: Path of the UTF-8 text file used as corpus.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.corpus_file_path = corpus_file_path
        self.word_freqs = defaultdict(int)
        self.char_freqs = defaultdict(int)
        self.subwords_freqs = defaultdict(int)
        self.model = {}
        self._read_corpus()
        self._calculate_frequencies()
        self._build_model()

    def _read_corpus(self):
        """Read the corpus file into ``self.corpus`` as a list of lines."""
        with open(self.corpus_file_path, 'r', encoding='utf-8') as file:
            self.corpus = file.readlines()

    def _pre_tokenize(self, text):
        """Return the words the backend pre-tokenizer splits ``text`` into."""
        words_with_offsets = self.tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
        return [word for word, _offset in words_with_offsets]

    def _calculate_frequencies(self):
        """Count words in the corpus, then derive char/substring counts."""
        # Finish counting the words first: the character and substring
        # counts must be weighted by the FINAL word frequency, not by the
        # running count at the time each occurrence is seen.
        for text in self.corpus:
            for word in self._pre_tokenize(text):
                self.word_freqs[word] += 1
        self._accumulate_subword_freqs()

    def _accumulate_subword_freqs(self):
        """Add every word's frequency to its characters and substrings."""
        for word, freq in self.word_freqs.items():
            for i in range(len(word)):
                self.char_freqs[word[i]] += freq
                # Substrings of length >= 2; single characters are kept
                # separately in ``char_freqs``.
                for j in range(i + 2, len(word) + 1):
                    self.subwords_freqs[word[i:j]] += freq

    def _build_model(self):
        """Turn the frequency tables into ``token -> -log(probability)``."""
        # Characters come first, then substrings by decreasing frequency;
        # this fixes the iteration order of ``self.model``.
        sorted_subwords = sorted(self.subwords_freqs.items(), key=lambda x: x[1], reverse=True)
        token_freqs = dict(list(self.char_freqs.items()) + sorted_subwords)
        # Normalise over the whole vocabulary so the token probabilities
        # sum to one.
        total_sum = sum(token_freqs.values())
        self.model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

    def encode_word(self, word, model=None):
        """Find the lowest-cost segmentation of ``word`` (Viterbi search).

        Args:
            word: The string to segment.
            model: Optional ``token -> cost`` mapping to use instead of
                ``self.model``.

        Returns:
            ``(tokens, score)`` where ``score`` is the summed token cost plus
            the constant starting score of 1, or ``(["<unk>"], None)`` when
            no segmentation into known tokens exists.
        """
        if model is None:
            model = self.model

        # best_segmentations[k] describes the best way to cover word[:k]:
        # the start index of its last token and its accumulated score.
        # Every path starts from the same initial score, so that constant
        # does not influence which segmentation wins.
        best_segmentations = [{"start": 0, "score": 1}] + [
            {"start": None, "score": None} for _ in range(len(word))
        ]
        for start_idx in range(len(word)):
            best_score_at_start = best_segmentations[start_idx]["score"]
            if best_score_at_start is None:
                # word[:start_idx] cannot be segmented; nothing to extend.
                continue
            for end_idx in range(start_idx + 1, len(word) + 1):
                token = word[start_idx:end_idx]
                if token not in model:
                    continue
                score = model[token] + best_score_at_start
                current = best_segmentations[end_idx]["score"]
                if current is None or current > score:
                    best_segmentations[end_idx] = {"start": start_idx, "score": score}

        segmentation = best_segmentations[-1]
        if segmentation["score"] is None:
            return ["<unk>"], None

        # Walk the back-pointers from the end of the word to its start.
        score = segmentation["score"]
        start = segmentation["start"]
        end = len(word)
        tokens = []
        while True:
            tokens.append(word[start:end])
            if start == 0:
                break
            end, start = start, best_segmentations[start]["start"]
        tokens.reverse()
        return tokens, score

    def tokenize(self, text):
        """Pre-tokenize ``text`` and return the flat list of its tokens."""
        return [
            token
            for word in self._pre_tokenize(text)
            for token in self.encode_word(word)[0]
        ]

    def compute_loss(self, model=None):
        """Return the corpus loss: sum of ``freq * score`` over all words.

        Args:
            model: Optional ``token -> cost`` mapping to evaluate instead of
                ``self.model``.

        Raises:
            TypeError: If some word cannot be segmented with the model (its
                score is ``None``).
        """
        loss = 0
        for word, freq in self.word_freqs.items():
            _, word_loss = self.encode_word(word, model)
            loss += freq * word_loss
        return loss

    def compute_scores(self):
        """Return, per multi-character token, the loss increase if removed.

        Single-character tokens are skipped so every word stays encodable.
        ``self.model`` itself is left unchanged.
        """
        scores = {}
        model_loss = self.compute_loss()
        for token in self.model:
            if len(token) == 1:
                continue
            # Values are plain floats, so a shallow copy is sufficient.
            model_without_token = self.model.copy()
            del model_without_token[token]
            scores[token] = self.compute_loss(model_without_token) - model_loss
        return scores

    def tokenize_file(self, input_file_path, output_file_path):
        """Tokenize a UTF-8 text file and write per-token statistics as CSV.

        The output has one row per distinct token with the columns ``Word``,
        ``Length`` and ``Frequency`` (occurrences in the input file).
        """
        with open(input_file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        tokens = self.tokenize(text)

        df = pd.DataFrame(tokens, columns=['Word'])
        word_counts = df['Word'].value_counts().reset_index()
        word_counts.columns = ['Word', 'Frequency']
        df['Length'] = df['Word'].apply(len)
        unique_words = df.drop_duplicates(subset=['Word'])
        result = pd.merge(unique_words, word_counts, on='Word')
        result.to_csv(output_file_path, index=False)


# Usage: build the tokenizer from the corpus file, then write token
# statistics for a second text file.
model_name, corpus_file_path = "xlm-roberta-base", "odia_sentences.txt"
tokenizer = UnigramTokenizer(
    model_name=model_name,
    corpus_file_path=corpus_file_path,
)
tokenizer.tokenize_file(
    input_file_path="odia_sentences_text.txt",
    output_file_path="Uni_val.csv",
)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
from transformers import AutoTokenizer
from collections import defaultdict
import copy
import pandas as pd
from math import log

class UnigramTokenizer:
    """Toy unigram-language-model tokenizer trained on a text corpus.

    On construction the corpus file is read, every word produced by the
    pre-tokenizer of the pretrained ``model_name`` tokenizer is counted, and
    a model mapping each character and each longer substring of those words
    to its negative log probability is built.

    Attributes are plain dictionaries:
      * ``word_freqs``     -- word -> number of occurrences in the corpus
      * ``char_freqs``     -- single character -> weighted occurrence count
      * ``subwords_freqs`` -- substring (length >= 2) -> weighted count
      * ``model``          -- token -> ``-log(freq / total_freq)``
    """

    def __init__(self, model_name, corpus_file_path):
        """Load the pretrained tokenizer, read the corpus and build the model.

        Args:
            model_name: Name passed to ``AutoTokenizer.from_pretrained``; only
                its pre-tokenizer is used here.
            corpus_file_path: Path of the UTF-8 text file used as corpus.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.corpus_file_path = corpus_file_path
        self.word_freqs = defaultdict(int)
        self.char_freqs = defaultdict(int)
        self.subwords_freqs = defaultdict(int)
        self.model = {}
        self._read_corpus()
        self._calculate_frequencies()
        self._build_model()

    def _read_corpus(self):
        """Read the corpus file into ``self.corpus`` as a list of lines."""
        with open(self.corpus_file_path, 'r', encoding='utf-8') as file:
            self.corpus = file.readlines()

    def _pre_tokenize(self, text):
        """Return the words the backend pre-tokenizer splits ``text`` into."""
        words_with_offsets = self.tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
        return [word for word, _offset in words_with_offsets]

    def _calculate_frequencies(self):
        """Count words in the corpus, then derive char/substring counts."""
        # Finish counting the words first: the character and substring
        # counts must be weighted by the FINAL word frequency, not by the
        # running count at the time each occurrence is seen.
        for text in self.corpus:
            for word in self._pre_tokenize(text):
                self.word_freqs[word] += 1
        self._accumulate_subword_freqs()

    def _accumulate_subword_freqs(self):
        """Add every word's frequency to its characters and substrings."""
        for word, freq in self.word_freqs.items():
            for i in range(len(word)):
                self.char_freqs[word[i]] += freq
                # Substrings of length >= 2; single characters are kept
                # separately in ``char_freqs``.
                for j in range(i + 2, len(word) + 1):
                    self.subwords_freqs[word[i:j]] += freq

    def _build_model(self):
        """Turn the frequency tables into ``token -> -log(probability)``."""
        # Characters come first, then substrings by decreasing frequency;
        # this fixes the iteration order of ``self.model``.
        sorted_subwords = sorted(self.subwords_freqs.items(), key=lambda x: x[1], reverse=True)
        token_freqs = dict(list(self.char_freqs.items()) + sorted_subwords)
        # Normalise over the whole vocabulary so the token probabilities
        # sum to one.
        total_sum = sum(token_freqs.values())
        self.model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

    def encode_word(self, word, model=None):
        """Find the lowest-cost segmentation of ``word`` (Viterbi search).

        Args:
            word: The string to segment.
            model: Optional ``token -> cost`` mapping to use instead of
                ``self.model``.

        Returns:
            ``(tokens, score)`` where ``score`` is the summed token cost plus
            the constant starting score of 1, or ``(["<unk>"], None)`` when
            no segmentation into known tokens exists.
        """
        if model is None:
            model = self.model

        # best_segmentations[k] describes the best way to cover word[:k]:
        # the start index of its last token and its accumulated score.
        # Every path starts from the same initial score, so that constant
        # does not influence which segmentation wins.
        best_segmentations = [{"start": 0, "score": 1}] + [
            {"start": None, "score": None} for _ in range(len(word))
        ]
        for start_idx in range(len(word)):
            best_score_at_start = best_segmentations[start_idx]["score"]
            if best_score_at_start is None:
                # word[:start_idx] cannot be segmented; nothing to extend.
                continue
            for end_idx in range(start_idx + 1, len(word) + 1):
                token = word[start_idx:end_idx]
                if token not in model:
                    continue
                score = model[token] + best_score_at_start
                current = best_segmentations[end_idx]["score"]
                if current is None or current > score:
                    best_segmentations[end_idx] = {"start": start_idx, "score": score}

        segmentation = best_segmentations[-1]
        if segmentation["score"] is None:
            return ["<unk>"], None

        # Walk the back-pointers from the end of the word to its start.
        score = segmentation["score"]
        start = segmentation["start"]
        end = len(word)
        tokens = []
        while True:
            tokens.append(word[start:end])
            if start == 0:
                break
            end, start = start, best_segmentations[start]["start"]
        tokens.reverse()
        return tokens, score

    def tokenize(self, text):
        """Pre-tokenize ``text`` and return the flat list of its tokens."""
        return [
            token
            for word in self._pre_tokenize(text)
            for token in self.encode_word(word)[0]
        ]

    def compute_loss(self, model=None):
        """Return the corpus loss: sum of ``freq * score`` over all words.

        Args:
            model: Optional ``token -> cost`` mapping to evaluate instead of
                ``self.model``.

        Raises:
            TypeError: If some word cannot be segmented with the model (its
                score is ``None``).
        """
        loss = 0
        for word, freq in self.word_freqs.items():
            _, word_loss = self.encode_word(word, model)
            loss += freq * word_loss
        return loss

    def compute_scores(self):
        """Return, per multi-character token, the loss increase if removed.

        Single-character tokens are skipped so every word stays encodable.
        ``self.model`` itself is left unchanged.
        """
        scores = {}
        model_loss = self.compute_loss()
        for token in self.model:
            if len(token) == 1:
                continue
            # Values are plain floats, so a shallow copy is sufficient.
            model_without_token = self.model.copy()
            del model_without_token[token]
            scores[token] = self.compute_loss(model_without_token) - model_loss
        return scores

    def tokenize_file(self, input_file_path, output_file_path):
        """Tokenize a UTF-8 text file and write per-token statistics as CSV.

        The output has one row per distinct token with the columns ``Word``,
        ``Length`` and ``Frequency`` (occurrences in the input file).
        """
        with open(input_file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        tokens = self.tokenize(text)

        df = pd.DataFrame(tokens, columns=['Word'])
        word_counts = df['Word'].value_counts().reset_index()
        word_counts.columns = ['Word', 'Frequency']
        df['Length'] = df['Word'].apply(len)
        unique_words = df.drop_duplicates(subset=['Word'])
        result = pd.merge(unique_words, word_counts, on='Word')
        result.to_csv(output_file_path, index=False)


# Usage: build the tokenizer from the corpus file, then write token
# statistics for a second text file.
model_name, corpus_file_path = "xlm-roberta-base", "odia_sentences.txt"
tokenizer = UnigramTokenizer(
    model_name=model_name,
    corpus_file_path=corpus_file_path,
)
tokenizer.tokenize_file(
    input_file_path="odia_sentences_text.txt",
    output_file_path="Uni_val.txt",
)