In [None]:
import os
import pandas as pd
from tqdm import tqdm
import spacy
import json

In [None]:
def train_tokenizer(data_list, model_name="SpaCyTokenizer"):
    """
    Tokenize a corpus with a blank SpaCy pipeline and save its vocabulary.

    No model is fitted here: every sentence is passed through a blank
    multilingual ("xx") SpaCy pipeline, the distinct token texts are
    collected, and the result is written as a sorted JSON array to
    ``<model_name>/spacy_tokenizer_vocab.json``.

    Args:
    - data_list: A list of sentences for training.
    - model_name: Directory in which the vocabulary file is saved. It is
      created (including missing parents) when it does not exist yet.

    Returns:
    - None. The path of the written file is printed.
    """
    # Load a blank SpaCy model (for Hindi, we can use "xx" for multilingual support)
    nlp = spacy.blank("xx")

    # Process all sentences to build vocabulary
    all_tokens = set()
    for sentence in tqdm(data_list, desc="Tokenizing sentences"):
        all_tokens.update(token.text for token in nlp(sentence))

    # exist_ok=True replaces the exists()-then-makedirs() pair, which can fail
    # if the directory appears between the check and the creation.
    os.makedirs(model_name, exist_ok=True)
    vocab_file = os.path.join(model_name, "spacy_tokenizer_vocab.json")

    # Save vocabulary as JSON. The set is sorted first because set iteration
    # order is not stable between runs; sorting makes the file reproducible.
    with open(vocab_file, "w", encoding="utf-8") as f:
        json.dump(sorted(all_tokens), f, ensure_ascii=False, indent=4)
    print(f"Vocabulary saved to: {vocab_file}")

In [None]:
def preprocess_text(file_path):
    """
    Read a text file and return its non-empty lines as a list of sentences.

    Each line is treated as one sentence; no CSV parsing is performed, so any
    header row or delimiters present in the file are returned as-is.

    Args:
    - file_path: Path of the file to read.

    Returns:
    - A list of strings, one per non-blank line, with surrounding whitespace
      (including the line terminator) removed.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark,
    # so the result does not depend on the platform's default encoding and a
    # BOM never ends up glued to the first sentence.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines carry no words and would only add empty "sentences".
        return [sentence for sentence in stripped_lines if sentence]

In [None]:
def calculate_fertility_score(nlp, data_list):
    """
    Calculate the overall fertility score for the entire dataset based on the tokenizer output.

    Fertility is the total number of tokens produced by ``nlp`` divided by the
    total number of whitespace-separated words, accumulated over all sentences.

    Args:
    - nlp: The SpaCy language model object. It is called once per sentence and
      must return an iterable of tokens exposing ``is_space``.
    - data_list: A list of sentences for which fertility scores are calculated.

    Returns:
    - A single fertility score (float) for the entire dataset, or 0.0 when the
      dataset contains no words.
    """
    total_word_count = 0
    total_token_count = 0

    # Accumulate word and token counts over every sentence
    for sentence in data_list:
        doc = nlp(sentence)
        total_word_count += len(sentence.split())
        # Whitespace-only tokens (e.g. a trailing newline or repeated spaces)
        # are skipped: str.split() ignores that whitespace on the word side, so
        # counting it on the token side would inflate the ratio.
        total_token_count += sum(1 for token in doc if not token.is_space)

    # Guard against division by zero for an empty / word-less dataset
    if total_word_count == 0:
        return 0.0
    return total_token_count / total_word_count

In [None]:
# Define the input folder containing the CSV files
input_folder = '/kaggle/input/hindi-dataset-10k-files/'  # Folder containing Hindi .csv files


output_file = "fertility_score_SpaCyTokenizer.csv"  # Output file for fertility scores

# Collect data from all CSV files in the input folder
hindi_data = []
err_cnt = 0
failed_files = []  # (filename, exception) pairs, reported after the loop

# os.listdir returns entries in arbitrary order; sorting keeps the corpus order
# identical on every run. Non-CSV entries are filtered out up front so the
# progress bar reflects only files that are actually read.
csv_filenames = sorted(name for name in os.listdir(input_folder) if name.endswith(".csv"))

for filename in tqdm(csv_filenames, desc="Processing files"):
    file_path = os.path.join(input_folder, filename)
    try:
        # Process each file and append valid sentences
        file_data = preprocess_text(file_path)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run, but
        # remember what failed and why instead of only counting it.
        err_cnt += 1
        failed_files.append((filename, e))
        continue
    if file_data:  # Only extend if the file has valid sentences
        hindi_data.extend(file_data)

print('error occured in files: ',err_cnt)
for failed_name, failed_error in failed_files:
    print(f"  {failed_name}: {type(failed_error).__name__}: {failed_error}")

print(f"Total sentences for training: {len(hindi_data)}")

In [None]:
# Nothing to train on when no sentences were collected
if not hindi_data:
    print("No valid data found to train the tokenizer.")
else:
    # Build and save the vocabulary from the collected corpus
    model_name = "SpaCyTokenizer"
    train_tokenizer(hindi_data, model_name=model_name)

    # A separate blank multilingual pipeline is used to measure fertility
    nlp = spacy.blank("xx")
    fertility_score = calculate_fertility_score(nlp, hindi_data)

    # Persist the score as a one-column CSV: header row, then the value
    csv_rows = ["Fertility Score\n", f"{fertility_score}\n"]
    with open(output_file, "w", encoding='utf-8-sig') as f:
        f.writelines(csv_rows)
    print(f"Fertility score saved to '{output_file}'")

    # Show the result in the notebook output as well
    print(f"Overall Fertility Score: {fertility_score}")