In [2]:
from datasets import load_from_disk
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
multi_lingual_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")



In [4]:
dataset_hi = load_from_disk("/home/PL-BERT/data/hi")
dataset_en = load_from_disk("/home/PL-BERT/data/en_ds")

In [5]:
multi_lingual_tokenizer.decode([101, 108, 108, 190, 102],)

'[CLS] # # v [SEP]'

In [6]:
# # Function to remove 102 and 108 from each list in input_ids
# def remove_tokens(input_ids):
#     removed_spec_input_ids = []
#     for single_input_ids in input_ids:
#         removed_spec_input_ids.append([token for token in single_input_ids if token not in (102, 101)])
#     return removed_spec_input_ids

# # Apply the function to the input_ids column
# dataset_mapped = dataset_en.map(lambda x: {"input_ids": remove_tokens(x["input_ids"])})

In [7]:
# dataset_mapped.save_to_disk("/home/PL-BERT/data/en_ds")

In [8]:
dataset_en[0]['input_ids']

[[30409],
 [108, 108, 190],
 [13258],
 [108, 108, 170, 10238],
 [108, 108, 186],
 [113],
 [11175],
 [10105],
 [44380],
 [108, 108, 77586],
 [10108],
 [11238],
 [19964],
 [108, 108, 33003],
 [16222],
 [108, 108, 193],
 [15767],
 [114],
 [10124],
 [10151],
 [28446],
 [11775],
 [14054],
 [171],
 [108, 108, 193],
 [108, 108, 171, 10161],
 [108, 108, 10298],
 [117],
 [10479],
 [12469],
 [53895],
 [10142],
 [10105],
 [11121],
 [119],
 [18021],
 [10106],
 [25369],
 [35057],
 [117],
 [13258],
 [108, 108, 170, 10238],
 [108, 108, 186],
 [13457],
 [11342],
 [108, 108, 171, 10157],
 [108, 108, 171, 13020],
 [108, 108, 175],
 [10160],
 [10105],
 [12089],
 [10108],
 [44380],
 [117],
 [10111],
 [10106],
 [10551],
 [46634],
 [12403],
 [11367],
 [10105],
 [28446],
 [10655],
 [16414],
 [12648],
 [22773],
 [17264],
 [10111],
 [10106],
 [26051],
 [11769],
 [117],
 [10105],
 [28446],
 [10655],
 [12648],
 [22773],
 [17264],
 [119],
 [10167],
 [26051],
 [44380],
 [10261],
 [10134],
 [10105],
 [12628],
 [170

In [9]:
def get_text(input_ids):
    decoded = multi_lingual_tokenizer.batch_decode(input_ids)
    text = ''
    for token in decoded:
        if token.startswith('# # '):
            text += token[2:]  # Remove ## and append without space
        else:
            text += ' ' + token  # Add space before new word
    text = text.strip().replace("# ", "")  # Remove leading/trailing spaces

    return text

In [10]:
# Map the get_text function and create a new column 'text'
dataset_en = dataset_en.map(lambda x: {"text": get_text(x["input_ids"])})

In [11]:
# Map the get_text function and create a new column 'text'
dataset_hi = dataset_hi.map(lambda x: {"text": get_text(x["input_ids"])})

In [12]:
dataset_hi

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes', 'text'],
    num_rows: 34475
})

In [11]:
# Function to get unique words from the 'text' column
def get_unique_words(dataset):
    all_text = " ".join(dataset["text"])
    unique_words = set(all_text.split())
    return unique_words

In [17]:
# Get unique words from the dataset and print them
en_unique_words = get_unique_words(dataset_en)
print(len(en_unique_words))

412107


In [18]:
# Get unique words from the dataset and print them
hi_unique_words = get_unique_words(dataset_hi)
print(len(hi_unique_words))

369209


In [21]:
import json

file_path = 'hi_unique_words.json'

with open(file_path, 'w', encoding='utf-8') as f:
    json.dump({"hi_unique_words": list(hi_unique_words)}, f, ensure_ascii=False, indent=4)

print(f"Data written to {file_path}")

Data written to hi_unique_words.json


In [23]:
import json

file_path = 'en_unique_words.json'

with open(file_path, 'w', encoding='utf-8') as f:
    json.dump({"en_unique_words": list(en_unique_words)}, f, ensure_ascii=False, indent=4)

print(f"Data written to {file_path}")

Data written to en_unique_words.json


In [22]:
from collections import Counter
from tqdm import tqdm
import json

In [24]:
hi_word_counter = Counter()
en_word_counter = Counter()

In [25]:
# Iterate through the text column of the dataset with a progress bar
for text in tqdm(dataset_hi['text'], desc="Processing words", unit="sentence"):
    words = text.lower().split()  # Convert to lowercase and split by spaces
    hi_word_counter.update(words)  # Update word count in Counter

# Save the word frequencies to a JSON file
output_file = 'hi_word_frequencies.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(hi_word_counter, f, ensure_ascii=False, indent=4)

print(f"Word frequencies saved to {output_file}")

Processing words: 100%|██████████| 34475/34475 [00:01<00:00, 18125.30sentence/s]


Word frequencies saved to hi_word_frequencies.json


In [26]:
# Iterate through the text column of the dataset with a progress bar
for text in tqdm(dataset_en['text'], desc="Processing words", unit="sentence"):
    words = text.lower().split()  # Convert to lowercase and split by spaces
    en_word_counter.update(words)  # Update word count in Counter

# Save the word frequencies to a JSON file
output_file = 'en_word_frequencies.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(en_word_counter, f, ensure_ascii=False, indent=4)

print(f"Word frequencies saved to {output_file}")

Processing words: 100%|██████████| 34475/34475 [00:02<00:00, 13342.41sentence/s]


Word frequencies saved to en_word_frequencies.json
