# Collect vocabulary

In [2]:
import os
import re
from collections import Counter


In [None]:
def extract_vocab_from_dir(input_dir, output_path, top_k=None):
    r"""Count word frequencies over every ``.txt`` file in ``input_dir``.

    Each line is lower-cased and tokenised with the pattern ``\b[\w\-]+\b``
    (runs of word characters and hyphens). The resulting counts are written
    to ``output_path`` as ``word<TAB>count`` lines, most frequent first, and
    a one-line summary is printed.

    Args:
        input_dir: Directory whose ``.txt`` files are read (UTF-8). Other
            entries in the directory are ignored.
        output_path: File to write (UTF-8). Missing parent directories are
            created.
        top_k: Maximum number of words to keep, or ``None`` to keep all.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    word_counter = Counter()
    # Compile once instead of re-resolving the pattern for every line.
    token_pattern = re.compile(r'\b[\w\-]+\b')

    # os.listdir() returns entries in arbitrary order. Counter.most_common()
    # orders equal counts by first insertion, so sorting the filenames makes
    # the output file reproducible across runs and machines.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
            for line in f:
                word_counter.update(token_pattern.findall(line.lower()))

    most_common = word_counter.most_common(top_k)

    # Create the destination folder if needed; dirname is "" when
    # output_path is a bare filename, and makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{word}\t{count}\n" for word, count in most_common)

    print(f"✅ Saved {len(most_common)} words to {output_path}")

# Example usage: build the vocabulary file from the extracted sentences.
if __name__ == "__main__":
    # top_k=None keeps every word; pass e.g. 5000 or 10000 to limit the list.
    extract_vocab_from_dir(
        "outputs/full_sentences",
        "outputs/collected_vocabulary/khasi_vocab.txt",
        top_k=None,
    )


✅ Saved 485859 words to outputs/verified_vocabulary/khasi_vocab.txt


# Separate the vocabulary into sections

1. remove the counters from khasi_vocab. 
2. keep only the words
3. Manually tag each word in khasi_vocab_without_counter with a category number (1, 2, 3, 4, and so on) so that Python can automatically detect the tags and populate the corresponding files in the collected_vocabulary folder:
   1. valid_words
   2. invalid_words
   3. proper_nouns
   4. unclear_meanings
   5. word_substitution
   6. alt+0239 ï
   7. alt+164 ñ
4. ERROR: Invisible characters in the corpus (invisible to the reader but not to the computer) prevented the words from being matched correctly for replacement.
5. Now I am waiting for a clean corpus where the invisible characters are removed.

In [22]:
# Listing of the vocabulary folder; not used by the rest of this cell.
files_output_folder = os.listdir("outputs/collected_vocabulary")

# Strip the counts from khasi_vocab.txt and write one row per word, padded to
# a fixed width and followed by the tag columns that are filled in by hand.
# Both files are opened as UTF-8 explicitly: khasi_vocab.txt is written as
# UTF-8 and the tagged file is later read back as UTF-8, whereas the platform
# default encoding (e.g. cp1252 on Windows) would mis-decode letters such as
# ï and ñ and throw off the padding width.
with open("outputs/collected_vocabulary/khasi_vocab.txt", 'r', encoding='utf-8') as khasi_vocab, \
     open("outputs/collected_vocabulary/khasi_vocab_without_counter.txt", 'w', encoding='utf-8') as khasi_vocab_without_counter:
    desired_length = 22  # width of the word column, in characters
    for i, line in enumerate(khasi_vocab, start=1):
        fields = line.split()
        if not fields:
            # Blank line: nothing to tag (previously raised IndexError).
            continue
        # Keep only the word (first column) and right-pad it with spaces;
        # words already longer than desired_length are left unchanged.
        word = fields[0].ljust(desired_length)
        if i>781:
            khasi_vocab_without_counter.write(f"{word}\t:1\t:2\t:3\t:4\t:5\n")
        else:
            # Rows 1-781 are written with the `1w` marker already set.
            # NOTE(review): the second column is "2" here but ":2" in the
            # branch above — looks like a missing colon; confirm before changing.
            khasi_vocab_without_counter.write(f"{word}\t:1w\t2\t:3\t:4\t:5\n")

# Up to row 3215 of khasi_vocab_without_counter.txt
I tagged each word by putting a `w` after one of the numbers above to mark it as valid, invalid, a proper noun, unclear, or a word to be replaced.
I will write the code below to segregate the words and to change the original files according to the words.

In [10]:
# Copy all the sentences from all books into one file
# Paths are built with os.path.join so the cell also runs where "\" is not a
# path separator.
input_path = os.path.join("outputs", "full_sentences")
output_path = os.path.join("outputs", "full_text")

os.makedirs(output_path, exist_ok=True)

# Sorted so the concatenation order is reproducible; restricted to .txt to
# match the files that extract_vocab_from_dir counts from the same folder.
files = sorted(name for name in os.listdir(input_path) if name.endswith(".txt"))

# Open the destination once in 'w' mode: the previous per-file append mode
# duplicated the whole corpus every time the cell was re-run.
with open(os.path.join(output_path, "full_text.txt"), 'w', encoding='utf-8') as output_file:
    for file in files:
        last_line = ""
        with open(os.path.join(input_path, file), 'r', encoding='utf-8') as input_file:
            for line in input_file:
                output_file.write(line)
                last_line = line
        # If a book does not end with a newline, add one so its last word is
        # not fused with the first word of the next book.
        if last_line and not last_line.endswith("\n"):
            output_file.write("\n")

In [14]:
# Select all the words that need to be replaced and replace them in full_text.txt
# Paths are built with os.path.join so the cell also runs where "\" is not a
# path separator.
input_file = os.path.join("outputs", "collected_vocabulary", "khasi_vocab_without_counter.txt")
map_words_to_replace = dict()
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        tm = line.split(":")
        if len(tm) < 2:
            continue  # no tag columns on this line
        to_replace = tm[0].strip()
        # The last ":"-separated field holds the tag digit, an optional marker
        # character, and then the replacement text. Test the *stripped*
        # replacement: the raw field always contains the trailing newline, so
        # a length check on it accepted every untagged row and mapped every
        # word to "" (deleting it from the text and producing a huge regex).
        new_word = tm[-1][2:].strip()
        if to_replace and new_word:
            map_words_to_replace[to_replace] = new_word

# Compile regex for exact word replacement.
# Longest keys first: with alternation the first alternative that satisfies
# the trailing \b wins, so a key like "foo" would otherwise pre-empt "foo-bar".
# NOTE(review): the vocabulary is lower-cased when it is collected, while this
# match is case-sensitive, so capitalised occurrences are left untouched —
# confirm that this is intended.
if map_words_to_replace:
    pattern = re.compile(
        r'\b('
        + '|'.join(re.escape(k) for k in sorted(map_words_to_replace, key=len, reverse=True))
        + r')\b'
    )
else:
    # An empty alternation would match the empty string at every word
    # boundary and make replace_match raise KeyError; copy the text unchanged.
    pattern = None

def replace_match(match):
    """Return the replacement for the word matched by ``pattern``."""
    return map_words_to_replace[match.group(0)]

with open(os.path.join("outputs", "full_text", "full_text.txt"), 'r', encoding='utf-8') as full_text, \
     open(os.path.join("outputs", "full_text", "full_text2.txt"), 'w', encoding='utf-8') as full_text2:
    for line in full_text:
        replaced_line = pattern.sub(replace_match, line) if pattern is not None else line
        full_text2.write(replaced_line)

KeyboardInterrupt: 