In [None]:
# Install the third-party packages this notebook imports (pandas, tqdm).
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pandas
%pip install tqdm

In [15]:
import csv
import re
from collections import Counter
from typing import Dict

import pandas as pd
from tqdm import tqdm

In [16]:
def is_malayalam_word(word: str) -> bool:
    """
    Report whether a string is written purely in Malayalam script.

    A character is accepted when it is whitespace or when its code point lies
    inside the Malayalam Unicode block (U+0D00 through U+0D7F, inclusive).
    An empty string contains no offending character and is therefore accepted.

    Args:
        word: The text to inspect.

    Returns:
        True when no character falls outside the accepted set, otherwise False.
    """
    return all(
        symbol.isspace() or 0x0D00 <= ord(symbol) <= 0x0D7F
        for symbol in word
    )


def clean_malayalam_words(input_file: str, output_file: str) -> None:
    """
    Filter a one-word-per-line Malayalam word list and save the surviving words.

    Each input line is stripped of surrounding whitespace and then rejected when it
      - is empty,
      - still contains whitespace (i.e. holds more than one word),
      - is exactly one character long,
      - contains a character that is_malayalam_word does not accept, or
      - repeats a word that was already kept.
    Surviving words are written in first-seen order, one per line.

    Args:
        input_file: Path to the UTF-8 text file holding the raw words (one per line).
        output_file: Path of the UTF-8 text file that receives the cleaned words.
    """
    with open(input_file, "r", encoding="utf-8") as source:
        raw_lines = source.read().splitlines()

    accepted = []
    already_kept = set()

    for raw in raw_lines:
        candidate = raw.strip()
        # Cheap structural rejections first: blank, multi-word, single character.
        if not candidate or re.search(r"\s", candidate) or len(candidate) == 1:
            continue
        # Script check, then de-duplication against everything kept so far.
        if not is_malayalam_word(candidate) or candidate in already_kept:
            continue
        accepted.append(candidate)
        already_kept.add(candidate)

    with open(output_file, "w", encoding="utf-8") as sink:
        sink.writelines(entry + "\n" for entry in accepted)

    print("Cleaning complete. See file:", output_file)

In [None]:
# Run the cleaning stage: raw Malayalam Wikipedia word list -> cleaned word list.
if __name__ == "__main__":
    INPUT_FILE = "mlwiki-all-unique-words-20190101.txt"
    # The transliteration stage reads "cleaned_ml_words.txt", so the cleaned list must be
    # written under exactly that name; the former "cleaned_words_ml.txt" left that stage
    # without its input on a fresh Restart & Run All.
    OUTPUT_FILE = "cleaned_ml_words.txt"
    clean_malayalam_words(INPUT_FILE, OUTPUT_FILE)

Cleaning complete. See file: cleaned_ml_words.txt


In [None]:
# Define the list of vowel signs (we don't include "ം" and "ഃ", which are handled elsewhere).
# transliterate_line checks matched keys against this list: for a vowel sign it drops a
# trailing "A" from the output built so far before appending the sign's transliteration.
VOWEL_SIGNS = ["ാ", "ി", "ീ", "ു", "ൂ", "ൃ", "െ", "േ", "ൈ", "ൊ", "ോ", "ൗ"]
# Virama character for reference (handled specially): transliterate_line drops a trailing
# "A" from the output and, when the virama ends a word, appends "U" instead of a mapped value.
VIRAMA = "്"


def load_mapping(csv_filename: str) -> Dict[str, str]:
    """
    Load the transliteration mapping from a CSV file.

    Expects a header row with the columns: Letter, Unicode, Unicode Name,
    RuneString, MachineString. Only ``Letter`` and ``MachineString`` are read.
    Surrounding whitespace is stripped from both values, rows whose ``Letter``
    is empty after stripping are skipped, and when a letter appears more than
    once the last row wins.

    The file is decoded as ``utf-8-sig`` so that a leading UTF-8 byte-order
    mark (as written by some spreadsheet tools) does not become part of the
    first header name; files without a BOM are read exactly as plain UTF-8.
    It is opened with ``newline=""`` as the ``csv`` module requires, so the
    reader itself handles line endings, including newlines inside quoted fields.

    Args:
        csv_filename: Path to the CSV file containing the mapping.

    Returns:
        A dictionary mapping each letter or ligature to its transliteration.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a data row is read and the header lacks a ``Letter`` or
            ``MachineString`` column.
    """
    mapping: Dict[str, str] = {}
    with open(csv_filename, encoding="utf-8-sig", newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            letter = row["Letter"].strip()
            translit = row["MachineString"].strip()
            if letter:
                mapping[letter] = translit
    return mapping


def _compile_key_pattern(mapping: Dict[str, str]) -> "re.Pattern[str]":
    """
    Compile one regex that matches any non-empty key of the mapping.

    Alternatives are ordered longest first so that multi-character ligatures are
    tried before the shorter keys they begin with.
    """
    ordered_keys = sorted((key for key in mapping if key), key=len, reverse=True)
    if not ordered_keys:
        # An empty alternation would match the empty string at every position and
        # send "" into the mapping lookup; "(?!)" can never match, so the text is
        # simply passed through unchanged.
        return re.compile(r"(?!)")
    return re.compile("|".join(re.escape(key) for key in ordered_keys))


def _transliterate_with_pattern(
    line: str, mapping: Dict[str, str], pattern: "re.Pattern[str]"
) -> str:
    """Transliterate one line using a key pattern built by _compile_key_pattern."""
    output = ""
    last_index = 0

    for m in pattern.finditer(line):
        start, end = m.start(), m.end()
        # Append unchanged text between matches.
        output += line[last_index:start]
        key = m.group(0)

        if key == VIRAMA:
            # The virama suppresses the inherent "A" of what precedes it; its own
            # mapped value is never emitted.
            if output.endswith("A"):
                output = output[:-1]
            # At the end of a word (end of line or before whitespace), append "U".
            if end == len(line) or line[end].isspace():
                output += "U"

        elif key in VOWEL_SIGNS:
            # A vowel sign replaces the inherent "A" rather than following it.
            if output.endswith("A"):
                output = output[:-1]
            output += mapping[key]

        else:
            output += mapping[key]

        last_index = end

    # Append any remaining text after the last match.
    output += line[last_index:]
    return output


def transliterate_line(line: str, mapping: Dict[str, str]) -> str:
    """
    Transliterate a line of text by replacing every occurrence of a mapping key.

    Keys are matched with a single regex whose alternatives are sorted in
    descending order of length, so longer ligatures are prioritized. Text that
    matches no key is copied through unchanged. An empty mapping (or one whose
    only key is the empty string) leaves the line unchanged.

    Special treatment:
      - For vowel signs (members of VOWEL_SIGNS): if the output so far ends with
        an inherent "A", remove it before appending the sign's transliteration.
      - For the virama (VIRAMA): remove a trailing "A" from the output if present;
        if the virama is at the end of a word (at the line's end or followed by
        whitespace) append "U". The virama's own mapped value is not used.

    This convenience wrapper compiles the key pattern on every call; when many
    lines share one mapping, transliterate_file compiles it once instead.

    Args:
        line: The input string to be transliterated.
        mapping: A dictionary mapping letters/ligatures to their transliteration.

    Returns:
        The transliterated line as a string.
    """
    return _transliterate_with_pattern(line, mapping, _compile_key_pattern(mapping))


def transliterate_file(input_file: str, output_file: str, translit_key: str) -> None:
    """
    Load Malayalam words from the input file, transliterate them using a mapping from the CSV,
    and write the transliterated text to the output file.

    The key regex is compiled once for the whole file rather than once per line,
    since it depends only on the mapping.

    Args:
        input_file: Path to the file containing Malayalam words (one per line).
        output_file: Path to the file where the transliterated output will be saved.
        translit_key: Path to the CSV file with the transliteration mapping.
    """
    # Load the transliteration mapping and build its (loop-invariant) key pattern.
    mapping = load_mapping(translit_key)
    pattern = _compile_key_pattern(mapping)

    # Read the Malayalam words from the input file.
    with open(input_file, "r", encoding="utf-8") as f_in:
        lines = f_in.read().splitlines()

    # Transliterate each line and write the result to the output file.
    with open(output_file, "w", encoding="utf-8") as f_out:
        for line in tqdm(lines, desc="Transliterating lines", unit="line"):
            transliterated_line = _transliterate_with_pattern(line, mapping, pattern)
            f_out.write(transliterated_line + "\n")

    print(f"Transliteration done. Saved to file {output_file}")

In [None]:
# Run the transliteration stage: cleaned Malayalam words -> machine-string words.
if __name__ == "__main__":
    INPUT_FILE, OUTPUT_FILE, TRANSLIT_KEY = (
        "cleaned_ml_words.txt",  # cleaned word list, one word per line
        "MTWC_all.txt",  # receives one transliterated word per line
        "transliteration_key_ml.csv",  # letter -> transliteration mapping
    )
    transliterate_file(
        input_file=INPUT_FILE,
        output_file=OUTPUT_FILE,
        translit_key=TRANSLIT_KEY,
    )

Transliterating lines: 100%|██████████| 1265902/1265902 [02:02<00:00, 10299.09line/s]

Transliteration done. See file:, MTWC_all.txt





In [45]:
# --- Step 1: Read input file and create DataFrame ---
input_file = "MTWC_all.txt"  # Input text file containing words line separated
output_csv = "MTWC.csv"  # Output CSV file
rune_csv = "runes_ml.csv"  # Rune mapping CSV file
MAX_WORD_LENGTH = 15  # Words longer than this many characters are dropped

# Read words from file into a DataFrame with column 'MachineWord'
with open(input_file, "r", encoding="utf-8") as f:
    words = f.read().splitlines()
df = pd.DataFrame(words, columns=["MachineWord"])

# Compute the length, then keep unique words of 1..MAX_WORD_LENGTH characters.
# Empty strings (blank input lines) are dropped: they are not words, and an empty
# CSV field would be read back as a missing value by pandas' default parser.
df["Length"] = df["MachineWord"].str.len()
df = (
    df[df["Length"].between(1, MAX_WORD_LENGTH)]
    .drop_duplicates(subset="MachineWord")
    .reset_index(drop=True)
)

# --- Step 2: Load rune mapping from the rune CSV ---
# Read every cell as a literal string: with pandas' defaults a token spelled like an
# NA sentinel (e.g. "NA") becomes NaN and a digit token becomes an int, and either
# would break the len()/re.escape calls below.
mapping_df = pd.read_csv(rune_csv, encoding="utf-8", dtype=str, keep_default_na=False)
rune_mapping = dict(zip(mapping_df["MachineLetter"], mapping_df["Rune"]))

# For multi-character tokens (e.g. "[TT]"), sort keys by length descending.
# Empty keys are skipped; if no key remains, "(?!)" (which never matches) is used so
# the regex does not match the empty string at every position.
keys = sorted((key for key in rune_mapping if key), key=len, reverse=True)
pattern = re.compile("|".join(map(re.escape, keys)) if keys else r"(?!)")

# --- Step 3: Create the new column 'RuneWord' using vectorized string replace ---
# Pandas' str.replace accepts a callable when regex=True
df["RuneWord"] = df["MachineWord"].str.replace(
    pattern, lambda m: rune_mapping[m.group(0)], regex=True
)

# --- Step 4: Sort by Length ascending and reorder columns ---
df = df.sort_values("Length", ascending=True).reset_index(drop=True)
df = df[["MachineWord", "RuneWord", "Length"]]

# --- Step 5: Save the final DataFrame to CSV ---
df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"CSV file has been saved to {output_csv}.")

CSV file has been saved to MTWC.csv.


In [46]:
# --- Step 1: Read the CSV file ---
# Keep words as literal strings: with pandas' default NA parsing a word spelled like an
# NA sentinel (e.g. "NA" or "NULL") is read as NaN, and the "".join(...) calls below then
# raise TypeError. The numeric Length column is still parsed as integers.
df = pd.read_csv(
    "MTWC.csv",
    encoding="utf-8",
    dtype={"MachineWord": str},
    keep_default_na=False,
)

# Compute overall token frequencies by concatenating all MachineWord values and counting characters.
# NOTE(review): this counts single characters. If the rune mapping contains multi-character
# tokens such as "[TT]", their characters are counted individually - confirm this is intended.
overall_counter = Counter("".join(df["MachineWord"]))

# Compute frequencies for words of length exactly 2 and 3 using vectorized filtering.
counter_eq2 = Counter("".join(df.loc[df["Length"] == 2, "MachineWord"]))
counter_eq3 = Counter("".join(df.loc[df["Length"] == 3, "MachineWord"]))

# --- Step 2: Load rune mapping from "runes_ml.csv" ---
# Read as strings with NA parsing off so every MachineLetter stays a usable dictionary key.
mapping_df = pd.read_csv(
    "runes_ml.csv", encoding="utf-8", dtype=str, keep_default_na=False
)
rune_mapping = dict(zip(mapping_df["MachineLetter"], mapping_df["Rune"]))

In [47]:
def _frequency_row(token, freq):
    """Build one table row: token, its rune (or the token itself), and its three counts."""
    return (
        token,
        rune_mapping.get(token, token),
        freq,
        counter_eq2.get(token, 0),
        counter_eq3.get(token, 0),
    )


# One row per counted token, in the counter's iteration order.
data = [_frequency_row(token, freq) for token, freq in overall_counter.items()]

freqs = pd.DataFrame(
    data, columns=["MachineLetter", "Rune", "Frequency", "Freq_2", "Freq_3"]
)
# Most frequent tokens first, indexed by the token itself.
freqs = freqs.sort_values("Frequency", ascending=False)
freqs = freqs.set_index("MachineLetter")

freqs

Unnamed: 0_level_0,Rune,Frequency,Freq_2,Freq_3
MachineLetter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,A,1554922,6,489
I,I,510521,29,283
E,E,467329,32,272
U,U,465895,27,285
R,R,432149,12,84
N,N,391200,13,172
K,K,384470,7,102
M,M,303910,11,126
T,T,258754,5,67
Y,Y,250173,4,71
