In [None]:
import os
import pandas as pd
from tqdm import tqdm
from tokenizers import BertWordPieceTokenizer
from transformers import PreTrainedTokenizerFast, AutoTokenizer
import json

In [None]:

def train_tokenizer(data_list, vocab_size=32768, model_name="BertWordPieceTokenizer"):
    """
    Train a BertWordPiece tokenizer for Hindi data and save it to disk.

    The trained tokenizer is wrapped in a PreTrainedTokenizerFast, written to
    the folder `model_name` via `save_pretrained`, and its vocabulary is also
    dumped as a JSON file inside that folder.

    Args:
    - data_list: A list of sentences for training.
    - vocab_size: The vocabulary size for the tokenizer.
    - model_name: The folder name under which the trained tokenizer is saved.

    Returns:
    - The trained tokenizer wrapped as a PreTrainedTokenizerFast.
    """
    # Control tokens used by the wrapped tokenizer
    bos_tok = "<sos>"
    eos_tok = "<end_of_sen>"
    mask_tok = "<mask>"

    # Hindi numerals and Devanagari punctuation that must stay atomic tokens
    special_char = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९", "।", "॥"]

    # Accent stripping must be disabled for Devanagari: with the library
    # defaults (lowercase=True, strip_accents=None) the BERT normalizer removes
    # nonspacing combining marks, which deletes the virama and several vowel
    # signs and therefore corrupts Hindi words before training.
    tokenizer = BertWordPieceTokenizer(strip_accents=False)

    # "<mask>" is appended last so that the ids of the previously defined
    # special tokens stay unchanged while the mask token becomes part of the
    # trained vocabulary instead of being added on top of `vocab_size` later.
    special_tokens = (
        ["<pad>", "<unk>", "[UNK]", bos_tok, eos_tok, "<user>", "<assistant>"]
        + special_char
        + [mask_tok]
    )

    # Train the tokenizer on the Hindi data
    tokenizer.train_from_iterator(
        data_list,
        vocab_size=vocab_size,
        min_frequency=5,
        special_tokens=special_tokens,
        show_progress=True,
    )

    # Wrap the tokenizer with Hugging Face's PreTrainedTokenizerFast
    transformer_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token=bos_tok,
        eos_token=eos_tok,
        unk_token="[UNK]",
        pad_token="<pad>",
        mask_token=mask_tok,
        padding_side="left",
        truncation_side="right",
        additional_special_tokens=["<user>", "<assistant>"],
        clean_up_tokenization_spaces=False,
    )

    # Save the tokenizer model
    tokenizer_folder = model_name  # Folder where the tokenizer will be saved
    os.makedirs(tokenizer_folder, exist_ok=True)
    transformer_tokenizer.save_pretrained(tokenizer_folder)
    print(f"Tokenizer saved to: {tokenizer_folder}")

    # Save the vocabulary as a human-readable JSON file (non-ASCII kept as-is)
    vocab = transformer_tokenizer.get_vocab()
    vocab_file = os.path.join(tokenizer_folder, "BertWordPieceTokenizer.json")
    with open(vocab_file, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)
    print(f"Vocabulary saved to: {vocab_file}")

    return transformer_tokenizer

In [None]:
def preprocess_text(file_path):
    """
    Read a text file and return its non-empty lines as a list of sentences.

    Args:
    - file_path: Path of the file to read.

    Returns:
    - A list of strings, one per non-blank line, with surrounding whitespace
      (including the trailing newline) removed.
    """
    data = []
    # Decode explicitly instead of relying on the platform default encoding;
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM if present.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line in f:
            sentence = line.strip()
            # Blank lines carry no text and would only add empty sentences
            if sentence:
                data.append(sentence)
    return data

In [None]:

def calculate_fertility_score(tokenizer, data_list):
    """
    Calculate the overall fertility score (tokens per whitespace-separated
    word) for the entire dataset based on the tokenizer output.

    Args:
    - tokenizer: The trained tokenizer object; its `encode` method must accept
      the `add_special_tokens` keyword.
    - data_list: A list of sentences for which the fertility score is calculated.

    Returns:
    - A single fertility score for the entire dataset, or 0.0 when the dataset
      contains no words.
    """
    total_word_count = 0
    total_token_count = 0

    for sentence in data_list:
        # Special tokens (BOS/EOS/CLS/SEP...) are not sub-word pieces of any
        # word, so they are excluded to avoid inflating the token count.
        input_ids = tokenizer.encode(sentence, add_special_tokens=False)
        total_word_count += len(sentence.split())
        total_token_count += len(input_ids)

    # Guard against division by zero for empty or whitespace-only datasets
    if total_word_count == 0:
        return 0.0
    return total_token_count / total_word_count

In [None]:
# Folder containing the Hindi .csv files used as the training corpus
input_folder = '/kaggle/input/hindi-dataset-10k-files/'

# Output file for the fertility score
output_file = "fertility_score_BertWordPieceTokenizer.csv"

# Collect the lines of every CSV file in the input folder
hindi_data = []
err_cnt = 0
# Sorted so the corpus is assembled in the same order on every run
# (os.listdir returns entries in arbitrary order).
for filename in tqdm(sorted(os.listdir(input_folder)), desc="Processing files"):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(input_folder, filename)
    try:
        file_data = preprocess_text(file_path)
    except Exception as e:
        # Keep going on unreadable files, but report which one failed and why
        err_cnt += 1
        tqdm.write(f"Failed to read '{file_path}': {e}")
        continue
    if file_data:  # Only extend if the file yielded any lines
        hindi_data.extend(file_data)

print('error occured in files: ',err_cnt)

print(f"Total sentences for training: {len(hindi_data)}")

In [None]:
# Train the tokenizer on the collected corpus, reload it from disk and
# report its fertility score; bail out early when nothing was collected.
if not hindi_data:
    print("No valid data found to train the tokenizer.")
else:
    model_name = "BertWordPieceTokenizer"
    train_tokenizer(hindi_data, vocab_size=32000, model_name=model_name)

    # Reload from the saved folder so the on-disk artefact is what gets tested
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Overall tokens-per-word ratio on the training corpus
    fertility_score = calculate_fertility_score(tokenizer, hindi_data)

    # Persist the score as a one-column CSV (header row + value row)
    with open(output_file, "w", encoding='utf-8-sig') as f:
        f.writelines(["Fertility Score\n", f"{fertility_score}\n"])
    print(f"Fertility score saved to '{output_file}'")

    # Display the fertility score
    print(f"Overall Fertility Score: {fertility_score}")