In [1]:
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm, rcParams
import os
from collections import Counter

In [None]:
# font_path = r"C:\Users\Admin\AppData\Local\Microsoft\Windows\Fonts\Nirmala.ttf"
# prop = fm.FontProperties(fname=font_path)

### helper functions

In [2]:
def top_k(freq, k=100):
    """Return the first ``k`` (word, count) pairs, highest count first.

    Pairs that share a count are ordered by word, ascending.
    """
    def _rank(pair):
        word, count = pair
        return (-count, word)

    ranked = list(freq.items())
    ranked.sort(key=_rank)
    return ranked[:k]

In [3]:
def build_frequency(tokens, freq=None):
    """Count ``tokens`` into ``freq`` and return that mapping.

    A fresh dict is created when ``freq`` is None; otherwise the mapping
    passed in is updated in place and returned.
    """
    counts = {} if freq is None else freq
    for token in tokens:
        seen = counts.get(token, 0)
        counts[token] = seen + 1
    return counts

In [4]:
def plot_freq(items, title, filename, font_path=None):
    """Save a bar chart of word counts to ``filename``.

    Parameters
    ----------
    items : sequence of (word, count) pairs
        Bars are drawn in the given order. An empty sequence produces an
        empty chart.
    title : str
        Chart title (drawn at size 16).
    filename : str
        Output image path; the figure is written at 150 dpi.
    font_path : str, optional
        Path to a font file used for the tick labels and the title. When
        omitted, matplotlib's default font is used, which may not contain
        glyphs for every script.
    """
    words, counts = zip(*items) if items else ([], [])
    positions = list(range(len(words)))

    # Build the text overrides once. The title size is carried inside its own
    # FontProperties so it does not depend on keyword-application order.
    if font_path is None:
        tick_kwargs = {}
        title_kwargs = {"fontsize": 16}
    else:
        tick_kwargs = {"fontproperties": fm.FontProperties(fname=font_path)}
        title_kwargs = {
            "fontproperties": fm.FontProperties(fname=font_path, size=16)
        }

    fig, ax = plt.subplots(figsize=(14, 6))
    try:
        ax.bar(positions, counts)
        ax.set_xticks(positions)
        ax.set_xticklabels(words, rotation=90, **tick_kwargs)
        ax.set_title(title, **title_kwargs)

        fig.tight_layout()
        fig.savefig(filename, dpi=150)
    finally:
        # Always release the figure, even when saving fails, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

In [5]:
def remove_stopwords(freq, threshold):
    """Drop every word whose count is at or above a cutoff ``T``.

    ``threshold`` is treated as a percentile of the sorted counts when it is
    a float in (0, 1]; any other value is converted with ``int`` and used as
    the absolute cutoff.

    Returns ``(cleaned, T)``. For an empty ``freq`` a warning is printed and
    ``({}, threshold)`` is returned with the threshold passed through as-is.
    """
    if not freq:
        print("Warning: freq is empty, skipping stopword removal")
        return {}, threshold

    ordered = sorted(freq.values())
    is_percentile = isinstance(threshold, float) and 0 < threshold <= 1
    if is_percentile:
        position = int(round((len(ordered) - 1) * threshold))
        T = ordered[position]
    else:
        T = int(threshold)

    # Keep only the words strictly below the cutoff.
    cleaned = {}
    for word, count in freq.items():
        if count < T:
            cleaned[word] = count
    return cleaned, T


In [6]:
def read_tokens(file_path):
    """Read a UTF-8 text file and return its non-blank lines, stripped.

    Lines that are empty after stripping whitespace are left out; the
    remaining lines keep their file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [token for token in stripped if token]

In [None]:
def load_frequencies(tsv_file):
    """Sum word counts from a tab-separated file into a Counter.

    Each line is stripped and split on tabs. Lines with two or three fields
    add ``int(second field)`` to the total of the first field (a third field
    is ignored); lines with any other number of fields are skipped.
    """
    totals = Counter()
    with open(tsv_file, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) not in (2, 3):
                continue
            word, count = fields[0], fields[1]
            totals[word] += int(count)
    return totals

### Checkpointing

In [8]:
# Path of the file that holds the checkpoint value; the checkpoint helper
# functions in this notebook read it and overwrite it.
checkpoint_file = "checkpoint.txt"

def get_last_position():
    """Return the integer stored in the checkpoint file.

    Gives 0 when the file does not exist or contains only whitespace.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, "r") as handle:
        stored = handle.read().strip()
    return int(stored) if stored else 0

def save_position(pos):
    """Overwrite the checkpoint file with the text form of ``pos``."""
    with open(checkpoint_file, "w") as handle:
        text = str(pos)
        handle.write(text)

In [9]:
def get_checkpoint():
    """Return last processed line number from checkpoint file.

    Returns 0 when the checkpoint file is missing or is empty / whitespace
    only (for example after a write that was interrupted right after the
    file was truncated), so a resume starts from the beginning instead of
    failing on ``int("")``.

    Raises
    ------
    ValueError
        If the file holds text that is not an integer.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, "r") as f:
        content = f.read().strip()
    return int(content) if content else 0

def save_checkpoint(line_num):
    """Save last processed line number to checkpoint file.

    The value is first written to a temporary sibling file and then moved
    over the checkpoint with ``os.replace``. Opening the checkpoint itself
    in "w" mode would truncate it immediately, so an interruption during the
    write could leave an empty or partial checkpoint; with the replace step
    the previous checkpoint stays intact until the new one is complete.
    """
    tmp_file = checkpoint_file + ".tmp"
    with open(tmp_file, "w") as f:
        f.write(str(line_num))
    os.replace(tmp_file, checkpoint_file)

### batch processing

In [10]:
# Settings for the batch counting pass.
BATCH_SIZE = 100_000  # number of tokens collected before a batch is flushed
freq_file = "freq_raw.tsv"  # per-batch counts are appended to this file
# NOTE(review): machine-specific absolute path; the run fails with
# FileNotFoundError wherever this file does not exist.
data_file = r"C:\Users\dubey\Downloads\tokenized_hindi.txt"

freq = dict()

In [11]:
def process_batch(tokens, batch_num, line_num):
    """Count one batch of tokens and append the counts to ``freq_file``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens that make up this batch.
    batch_num : int
        1-based index of the batch within the current run.
    line_num : int
        Number of tokens in this batch, as counted by the caller.

    Rows are written as ``word<TAB>count``, ordered by descending count and
    then by word. The file is opened in append mode, so rows from earlier
    batches are kept; a word's total is therefore spread over several rows
    and has to be summed when the file is read back.
    """
    # Count frequencies in this batch only.
    batch_freq = build_frequency(tokens)

    # Append raw frequencies (do not overwrite previous ones).
    with open(freq_file, "a", encoding="utf-8") as f:
        for w, c in sorted(batch_freq.items(), key=lambda x: (-x[1], x[0])):
            f.write(f"{w}\t{c}\n")

    # Every earlier batch of the run held exactly BATCH_SIZE tokens, so the
    # running total is the earlier full batches plus this batch's tokens.
    # (The previous formula used batch_num instead of batch_num - 1 and so
    # reported one whole batch too many.)
    words_done = (batch_num - 1) * BATCH_SIZE + line_num
    print(f"Finished batch {batch_num} ({words_done} words done)")

In [12]:
def proccess_data():
    """Count tokens from ``data_file`` in batches, resuming from the checkpoint.

    The checkpoint is the number of raw lines of ``data_file`` (blank lines
    included) that have already been consumed. On start that many lines are
    skipped; afterwards every non-blank stripped line is collected as a
    token, and each time ``BATCH_SIZE`` tokens are gathered the batch is
    handed to ``process_batch`` and the checkpoint is updated.

    The saved value is the absolute raw line count. The earlier formula
    ``batch_num*BATCH_SIZE+line_num`` over-counted by one batch, ignored the
    starting offset after a resume, and counted only non-blank lines while
    the skip loop counts every line, so a resumed run skipped or re-read
    data.

    NOTE: a batch is written before its checkpoint is saved, so an
    interruption between the two steps makes the next run write that batch
    again.
    """
    start_line = get_checkpoint()
    print(f"Resuming from line {start_line}")

    with open(data_file, "r", encoding="utf-8") as infile:
        # Skip the raw lines that earlier runs already consumed.
        for _ in range(start_line):
            if not infile.readline():
                break  # checkpoint lies beyond the end of the file

        tokens = []
        batch_num = 0
        lines_read = start_line  # absolute number of raw lines consumed

        # Process batches
        for line in infile:
            lines_read += 1
            word = line.strip()
            if word:
                tokens.append(word)

            if len(tokens) >= BATCH_SIZE:
                batch_num += 1
                process_batch(tokens, batch_num, len(tokens))
                save_checkpoint(lines_read)
                tokens = []

        # Final leftover batch
        if tokens:
            batch_num += 1
            process_batch(tokens, batch_num, len(tokens))
            save_checkpoint(lines_read)

    print(" All batches processed!")


In [13]:
# Run the batch counting pass over data_file, starting from the stored checkpoint.
proccess_data()

Resuming from line 0


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\dubey\\Downloads\\tokenized_hindi.txt'

In [14]:
# Sum the per-batch rows of freq_file into a single Counter and display it.
# NOTE(review): the bare `freq` below echoes every entry; for a large
# vocabulary consider showing only freq.most_common(20).
freq=load_frequencies(freq_file)
freq

FileNotFoundError: [Errno 2] No such file or directory: 'freq_raw.tsv'

In [15]:
# Reload the summed counts from the batch output file.
freq = load_frequencies(freq_file)

# Chart the 100 most frequent words before any filtering.
plot_freq(
    top_k(freq, 100),
    "Top 100 words (raw)",
    "top_100.png",
)

# Repeat the chart for several cutoffs: floats in (0, 1] are percentiles of
# the counts, any other value is an absolute count.
thresholds = [0.50, 0.75, 0.90, 100]
for th in thresholds:
    cleaned, T = remove_stopwords(freq, th)
    out_name = f"cleaned_top_100_T{str(T).replace('.','_')}.png"
    plot_freq(
        top_k(cleaned, 100),
        f"Top 100 after stopword removal (T={T})",
        out_name,
    )

FileNotFoundError: [Errno 2] No such file or directory: 'freq_raw.tsv'