In [None]:
%pip install pandas
%pip install tqdm

In [2]:
import csv
import re
from collections import Counter
from typing import Dict, Optional, Pattern

import pandas as pd
from tqdm import tqdm

In [3]:
def is_malayalam_word(word: str) -> bool:
    """
    Report whether ``word`` is made up solely of Malayalam code points.

    Whitespace characters are ignored; every other character must fall in
    the Malayalam Unicode block (U+0D00 through U+0D7F inclusive). A string
    with no non-whitespace characters, including the empty string, yields
    True because there is nothing to reject.
    """
    block_first, block_last = 0x0D00, 0x0D7F
    for symbol in word:
        if symbol.isspace():
            continue
        code_point = ord(symbol)
        if code_point < block_first or code_point > block_last:
            return False
    return True


def clean_malayalam_words(input_file: str, output_file: str) -> None:
    """
    Clean a one-word-per-line word list and write the result to a new file.

    Each input line is stripped of surrounding whitespace and kept only if:
      - it has at least two characters (code points) after stripping,
      - it contains no internal whitespace,
      - ``is_malayalam_word`` accepts it,
      - it has not been seen earlier in the file (first occurrence wins).

    The surviving words are written one per line, in input order, each
    terminated by a newline. If nothing survives, the output file is created
    empty, so later stages do not read a stray blank line as an empty word.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    internal_whitespace = re.compile(r"\s")
    seen = set()
    cleaned_lines = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            word = line.strip()
            # Rejects empty lines, single-character entries and multi-token lines.
            if len(word) < 2 or internal_whitespace.search(word):
                continue
            # Cheap set lookup first; the per-character script check only
            # runs for words not already accepted.
            if word in seen or not is_malayalam_word(word):
                continue
            seen.add(word)
            cleaned_lines.append(word)

    with open(output_file, "w", encoding="utf-8") as f:
        # One terminated line per word; writes nothing when the list is empty.
        f.writelines(f"{word}\n" for word in cleaned_lines)

    print("Cleaning complete. See file:", output_file)

In [4]:
if __name__ == "__main__":
    # Stage 1: reduce the raw wiki word list to unique, Malayalam-only words.
    INPUT_FILE = "mlwiki-all-unique-words-20190101.txt"
    OUTPUT_FILE = "cleaned_words_ml.txt"
    clean_malayalam_words(input_file=INPUT_FILE, output_file=OUTPUT_FILE)

Cleaning complete. See file: cleaned_words_ml.txt


In [6]:
# Dependent vowel signs. When translit_line matches one of these, it drops a
# trailing "A" from the output before appending the sign's own mapping.
# NOTE(review): the list appears to omit the single-glyph AU sign (U+0D4C) -
# confirm whether the transliteration key covers it and whether it belongs here.
VOWEL_SIGNS = ["ാ", "ി", "ീ", "ു", "ൂ", "ൃ", "െ", "േ", "ൈ", "ൊ", "ോ", "ൗ"]
# Virama sign. translit_line drops a trailing "A" when it matches this and
# appends "U" only when the sign is at the end of the line or before whitespace.
VIRAMA = "്"


def load_mapping(csv_filename: str) -> Dict[str, str]:
    """
    Load the transliteration mapping from a CSV file.

    The file must have the header columns 'Letter' and 'MachineString'.
    Both values are stripped of surrounding whitespace; rows whose 'Letter'
    is empty after stripping are skipped. If a letter appears more than once,
    the last row wins.

    Args:
        csv_filename: Path of the CSV file (UTF-8, with or without a BOM).

    Returns:
        Dict mapping each source letter (or letter sequence) to its
        transliterated string.

    Raises:
        KeyError: If a required column is missing from the header.
    """
    mapping: Dict[str, str] = {}
    # "utf-8-sig" reads plain UTF-8 unchanged but also swallows a leading BOM
    # (common in spreadsheet exports), which would otherwise turn the first
    # header into "\ufeffLetter". newline="" lets the csv module do its own
    # line-ending handling, as its documentation requires.
    with open(csv_filename, encoding="utf-8-sig", newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            letter = row["Letter"].strip()
            if letter:
                mapping[letter] = row["MachineString"].strip()
    return mapping


def translit_line(
    line: str, mapping: Dict[str, str], pattern: Optional[Pattern] = None
) -> str:
    """
    Transliterate a line using the provided mapping.

    Every substring matched by ``pattern`` is replaced; text between matches
    is copied through unchanged. Two kinds of key get special handling:
      - Vowel signs (members of VOWEL_SIGNS): a trailing "A" on the output so
        far is removed, then the sign's mapping is appended.
      - The virama (VIRAMA): a trailing "A" on the output so far is removed.
        "U" is appended only when the virama is the last character of the
        line or is immediately followed by whitespace; its mapping value is
        never used.

    Args:
        line: Text to transliterate.
        mapping: Source key -> transliterated string.
        pattern: Optional precompiled alternation of the mapping keys. When
            omitted it is built here, longest keys first so multi-character
            keys win over their prefixes.

    Returns:
        The transliterated line. With no pattern and an empty mapping the
        line is returned unchanged.
    """
    if pattern is None:
        if not mapping:
            # An alternation of zero keys is the empty regex, which matches
            # the empty string at every position and would then fail the
            # mapping lookup. Nothing to replace, so pass the line through.
            return line
        sorted_keys = sorted(mapping.keys(), key=len, reverse=True)
        pattern = re.compile("|".join(map(re.escape, sorted_keys)))

    output = ""
    last_index = 0
    for m in pattern.finditer(line):
        start, end = m.start(), m.end()
        # Copy the unmatched text preceding this match.
        output += line[last_index:start]
        key = m.group(0)
        if key == VIRAMA:
            # The virama suppresses the preceding "A" wherever it occurs.
            if output.endswith("A"):
                output = output[:-1]
            # end never exceeds len(line), so indexing is safe once the
            # equality check has failed.
            if end == len(line) or line[end].isspace():
                output += "U"
        elif key in VOWEL_SIGNS:
            # The vowel sign replaces the preceding "A".
            if output.endswith("A"):
                output = output[:-1]
            output += mapping[key]
        else:
            output += mapping[key]
        last_index = end
    # Copy whatever follows the final match.
    output += line[last_index:]
    return output


def translit_file(input_file: str, output_file: str, translit_key: str) -> None:
    """
    Transliterate every line of a text file and save the result.

    The mapping is loaded from the ``translit_key`` CSV, compiled once into a
    single regex, and applied to each line of ``input_file`` through
    ``translit_line``. Each transliterated line is written to ``output_file``
    followed by a newline, with a progress bar shown while processing.
    """
    mapping = load_mapping(translit_key)
    # Longest keys come first in the alternation so they are tried before
    # shorter keys that are their prefixes.
    alternatives = sorted(mapping.keys(), key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in alternatives))

    with open(input_file, "r", encoding="utf-8") as source:
        lines = source.read().splitlines()

    with open(output_file, "w", encoding="utf-8") as sink:
        progress = tqdm(lines, desc="Transliterating lines", unit="line")
        sink.writelines(
            translit_line(line, mapping, pattern) + "\n" for line in progress
        )

    print(f"Transliteration done. Saved to file {output_file}")

In [7]:
if __name__ == "__main__":
    # Stage 2: transliterate the cleaned word list using the key CSV.
    INPUT_FILE = "cleaned_words_ml.txt"
    OUTPUT_FILE = "MTWC_all.txt"
    TRANSLIT_KEY = "translit_key_ml.csv"
    translit_file(
        input_file=INPUT_FILE, output_file=OUTPUT_FILE, translit_key=TRANSLIT_KEY
    )

Transliterating lines: 100%|██████████| 1265902/1265902 [00:27<00:00, 45682.53line/s]

Transliteration done. Saved to file MTWC_all.txt





In [9]:
def filter_words(df: pd.DataFrame, max_length: int = 15) -> pd.DataFrame:
    """
    Keep words of at most ``max_length`` characters and drop duplicates.

    Args:
        df: Frame with a 'MachineWord' column of strings.
        max_length: Largest word length (inclusive) to keep.

    Returns:
        A new frame with a 'Length' column added, rows longer than
        ``max_length`` removed, only the first occurrence of each
        'MachineWord' kept, and a fresh 0..n-1 index. The input frame is
        left unmodified.
    """
    # assign() returns a copy, so the caller's frame does not silently gain
    # a 'Length' column as a side effect of filtering.
    with_length = df.assign(Length=df["MachineWord"].str.len())
    return (
        with_length[with_length["Length"] <= max_length]
        .drop_duplicates("MachineWord")
        .reset_index(drop=True)
    )


def apply_rune_mapping(df: pd.DataFrame, rune_csv: str) -> pd.DataFrame:
    """
    Add a 'RuneWord' column derived from 'MachineWord' via the rune table.

    The CSV at ``rune_csv`` must provide 'MachineLetter' and 'Rune' columns.
    Every occurrence of a machine letter inside a word is replaced by its
    rune, with longer letters tried before shorter ones. The column is
    written onto ``df`` itself, and that same frame is returned.
    """
    rune_table = pd.read_csv(rune_csv, encoding="utf-8")
    mapping = {
        letter: rune
        for letter, rune in zip(rune_table["MachineLetter"], rune_table["Rune"])
    }
    longest_first = sorted(mapping, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(token) for token in longest_first))

    def to_rune(match):
        """Look up the rune for the machine letter that was matched."""
        return mapping[match.group(0)]

    df["RuneWord"] = df["MachineWord"].str.replace(pattern, to_rune, regex=True)
    return df


def clean_lexicon(input_file: str, output_csv: str, rune_csv: str) -> None:
    """
    Build the lexicon CSV from a file of transliterated words.

    Steps: read one word per line from ``input_file``, run ``filter_words``
    and ``apply_rune_mapping`` (using ``rune_csv``), order the rows by
    'Length', and write the 'MachineWord', 'RuneWord' and 'Length' columns to
    ``output_csv`` without the index.
    """
    with open(input_file, "r", encoding="utf-8") as source:
        words = source.read().splitlines()

    lexicon = (
        pd.DataFrame(words, columns=["MachineWord"])
        .pipe(filter_words)
        .pipe(apply_rune_mapping, rune_csv)
        .sort_values("Length")
        .reset_index(drop=True)
    )
    column_order = ["MachineWord", "RuneWord", "Length"]
    lexicon[column_order].to_csv(output_csv, index=False, encoding="utf-8")
    print(f"CSV file saved to {output_csv}")

In [10]:
if __name__ == "__main__":
    # Stage 3: filter the transliterated words and attach their rune forms.
    INPUT_FILE = "MTWC_all.txt"
    OUTPUT_CSV = "MTWC.csv"
    RUNE_CSV = "runes_ml.csv"
    clean_lexicon(input_file=INPUT_FILE, output_csv=OUTPUT_CSV, rune_csv=RUNE_CSV)

CSV file saved to MTWC.csv


In [12]:
def compute_frequencies(df: pd.DataFrame) -> tuple:
    """
    Count character occurrences across the 'MachineWord' column.

    Returns a 3-tuple of Counters: one over all words, one over words whose
    'Length' is 2, and one over words whose 'Length' is 3.
    """

    def count_characters(words) -> Counter:
        """Tally every character of the concatenated words."""
        return Counter("".join(words))

    words = df["MachineWord"]
    lengths = df["Length"]
    return (
        count_characters(words),
        count_characters(words[lengths == 2]),
        count_characters(words[lengths == 3]),
    )


def create_frequency_dataframe(lexicon_csv: str, rune_csv: str) -> pd.DataFrame:
    """
    Build a per-token frequency table from the lexicon CSV.

    Args:
        lexicon_csv: CSV with 'MachineWord' and 'Length' columns.
        rune_csv: CSV with 'MachineLetter' and 'Rune' columns.

    Returns:
        A DataFrame indexed by 'MachineLetter' with the columns 'Rune',
        'Frequency' (count over all words), 'Freq_2' and 'Freq_3' (counts
        over words of length 2 and 3), sorted by descending 'Frequency'.
        Tokens missing from the rune table use the token itself as the rune.
    """
    # keep_default_na=False: the lexicon holds arbitrary transliterated
    # strings, and pandas would otherwise parse entries such as "NA" or
    # "NULL" as NaN, which breaks the string join in compute_frequencies.
    df = pd.read_csv(lexicon_csv, encoding="utf-8", keep_default_na=False)
    overall, eq2, eq3 = compute_frequencies(df)
    rune_mapping = (
        pd.read_csv(rune_csv, encoding="utf-8")
        .set_index("MachineLetter")["Rune"]
        .to_dict()
    )
    data = [
        (
            token,
            rune_mapping.get(token, token),
            freq,
            eq2.get(token, 0),
            eq3.get(token, 0),
        )
        for token, freq in overall.items()
    ]
    return (
        pd.DataFrame(
            data, columns=["MachineLetter", "Rune", "Frequency", "Freq_2", "Freq_3"]
        )
        .sort_values("Frequency", ascending=False)
        .set_index("MachineLetter")
    )

In [13]:
if __name__ == "__main__":
    # Stage 4: tally token frequencies over the finished lexicon.
    LEXICON_CSV = "MTWC.csv"
    RUNE_CSV = "runes_ml.csv"
    freqs = create_frequency_dataframe(lexicon_csv=LEXICON_CSV, rune_csv=RUNE_CSV)

In [14]:
# Display the token frequency table assigned to `freqs` in the previous cell.
freqs

Unnamed: 0_level_0,Rune,Frequency,Freq_2,Freq_3
MachineLetter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,A,1541451,6,463
I,I,527907,29,288
E,E,493391,31,335
U,U,483223,27,289
R,R,446994,12,88
N,N,404701,13,174
K,K,402635,7,103
M,M,313566,11,135
T,T,269052,5,71
Ʈ,[TT],260516,7,91
