In [None]:
!pip install pandas

In [40]:
import csv
import re
from collections import Counter
from tqdm import tqdm

import pandas as pd

In [41]:
def is_malayalam_word(word):
    """
    Tell whether every non-whitespace character of ``word`` is Malayalam.

    A character counts as Malayalam when its code point lies in the
    Malayalam Unicode block, U+0D00 through U+0D7F (both ends included).
    Whitespace characters are skipped, so an empty or all-whitespace
    string yields True.
    """
    return all(
        ch.isspace() or 0x0D00 <= ord(ch) <= 0x0D7F
        for ch in word
    )

# Source word list (one Malayalam word per line) and destination for the cleaned list
input_file = "mlwiki-all-unique-words-20190101.txt"
output_file = "cleaned_ml_words.txt"

# Load every raw line up front
with open(input_file, "r", encoding="utf-8") as f_in:
    lines = f_in.read().splitlines()

# A line survives only if, after trimming surrounding whitespace, it
#   - is longer than one character (this also drops empty lines),
#   - has no whitespace left inside it (i.e. it is a single word),
#   - consists solely of Malayalam characters, and
#   - has not been kept already (the first occurrence wins).
cleaned_lines = []
seen = set()
for word in map(str.strip, lines):
    if len(word) < 2:
        continue
    if re.search(r"\s", word) is not None:
        continue
    if word in seen or not is_malayalam_word(word):
        continue
    seen.add(word)
    cleaned_lines.append(word)

# Persist the result, one word per line
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.writelines(word + "\n" for word in cleaned_lines)

print("Cleaning complete. See file:", output_file)


Cleaning complete. See file: cleaned_ml_words.txt


In [42]:
# Vowel signs: when transliterate_line() meets one of these it drops a trailing inherent "A"
# from the output before appending the sign's own transliteration.
# "ം" and "ഃ" are deliberately not listed; per the original note they are handled elsewhere.
# NOTE(review): the sign "ൌ" is not in this list either — confirm whether that is intended.
vowel_signs = ["ാ", "ി", "ീ", "ു", "ൂ", "ൃ", "െ", "േ", "ൈ", "ൊ", "ോ", "ൗ"]
# Virama: kept separate because transliterate_line() gives it its own branch
# (drop the trailing "A", and append "U" only at the end of a word).
virama = "്"

def load_mapping(csv_filename):
    """
    Load the transliteration mapping from a CSV file.

    Expects columns: Letter, Unicode, Unicode Name, RuneString, MachineString.
    Only ``Letter`` and ``MachineString`` are read; both are stripped of
    surrounding whitespace. Rows whose letter is empty after stripping are
    skipped, and if a letter occurs more than once the last row wins.

    Args:
        csv_filename: Path to the UTF-8 encoded CSV file (a leading BOM is tolerated).

    Returns:
        A dictionary mapping each letter/ligature to its transliteration.

    Raises:
        KeyError: If the ``Letter`` or ``MachineString`` column is missing.
    """
    mapping = {}
    # "utf-8-sig" strips a leading BOM (common in spreadsheet exports) that would otherwise
    # turn the first header into "\ufeffLetter"; newline="" is what the csv module requires
    # so that it can do its own newline handling.
    with open(csv_filename, encoding="utf-8-sig", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            letter = row["Letter"].strip()
            translit = row["MachineString"].strip()
            if letter:
                mapping[letter] = translit
    return mapping

# Compiled patterns keyed by the tuple of mapping keys. transliterate_line() is called once
# per input line with the same mapping, so the alternation regex is built only once per key set.
_PATTERN_CACHE = {}


def _mapping_pattern(mapping):
    """Return the compiled regex matching any key of ``mapping``, or None if it has no keys."""
    keys = tuple(mapping)
    if not keys:
        return None
    pattern = _PATTERN_CACHE.get(keys)
    if pattern is None:
        # Longest keys first so that multi-character ligatures win over the letters they start with.
        sorted_keys = sorted(keys, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(key) for key in sorted_keys))
        _PATTERN_CACHE[keys] = pattern
    return pattern


def transliterate_line(line, mapping):
    """
    Transliterate one line by replacing every occurrence of a mapping key.

    Keys are matched with a single alternation regex in which longer keys come first,
    so longer ligatures are prioritized. Text that matches no key is copied through unchanged.

    Special treatment:
      - For vowel signs (entries of ``vowel_signs``): if the output so far ends with an
        inherent "A", remove it before appending the sign's transliteration.
      - For the virama: remove a trailing "A" from the output (if present). If the virama is
        at the end of a word (end of line, or the next character is whitespace) append "U";
        otherwise append nothing. The mapping's own value for the virama is never used.

    Args:
        line: Text to transliterate.
        mapping: Dictionary from letter/ligature to its transliteration.

    Returns:
        The transliterated string. With an empty mapping, ``line`` is returned unchanged
        (an empty pattern would otherwise match at every position and fail on the lookup).
    """
    pattern = _mapping_pattern(mapping)
    if pattern is None:
        return line

    output = ""  # Accumulate output in a string
    last_index = 0

    for m in pattern.finditer(line):
        start, end = m.start(), m.end()
        # Append unchanged text between matches.
        output += line[last_index:start]
        key = m.group(0)

        # Special treatment for virama.
        if key == virama:
            if output.endswith("A"):
                output = output[:-1]
            # Word-final virama (end of line or followed by whitespace) becomes "U";
            # elsewhere it only removes the inherent vowel.
            if end == len(line) or line[end].isspace():
                output += "U"

        # Special treatment for vowel signs (other than virama).
        elif key in vowel_signs:
            if output.endswith("A"):
                output = output[:-1]
            output += mapping[key]

        else:
            output += mapping[key]

        last_index = end

    # Append any remaining text after the last match.
    output += line[last_index:]
    return output

In [43]:
# Paths used by the transliteration step.
# (For a quick trial run, point input/output at "test.txt" / "output.txt" instead.)
csv_file = "transliteration_key_ml.csv"  # transliteration key CSV, in the same directory
input_file = "cleaned_ml_words.txt"      # Malayalam words, one per line
output_file = "MTWC_all.txt"             # destination for the transliterated words

In [None]:
# Build the letter -> transliteration lookup from the CSV key
mapping = load_mapping(csv_file)

# Pull in all Malayalam words (one per line)
with open(input_file, "r", encoding="utf-8") as f_in:
    lines = f_in.read().splitlines()

# Transliterate word by word, streaming each result straight to the output file
with open(output_file, "w", encoding="utf-8") as f_out:
    progress = tqdm(lines, desc="Transliterating lines", unit="line")
    f_out.writelines(transliterate_line(word, mapping) + "\n" for word in progress)

print(f"Transliteration done. See file, {output_file}")

Transliterating lines: 100%|██████████| 1265902/1265902 [02:02<00:00, 10299.09line/s]

Transliteration done. See file:, MTWC_all.txt





In [45]:
# --- Step 1: Load the transliterated words into a DataFrame ---
input_file = "MTWC_all.txt"   # transliterated words, one per line
output_csv = "MTWC.csv"       # CSV written at the end of this cell
rune_csv = "runes_ml.csv"     # MachineLetter -> Rune lookup table

with open(input_file, "r", encoding="utf-8") as f:
    words = f.read().splitlines()

# One row per distinct word of at most 15 characters, with its character count alongside
df = (
    pd.DataFrame(words, columns=["MachineWord"])
    .assign(Length=lambda frame: frame["MachineWord"].str.len())
    .loc[lambda frame: frame["Length"] <= 15]
    .drop_duplicates(subset="MachineWord")
    .reset_index(drop=True)
)

# --- Step 2: Build the rune lookup and a regex over its keys ---
mapping_df = pd.read_csv(rune_csv, encoding="utf-8")
rune_mapping = {
    letter: rune
    for letter, rune in zip(mapping_df["MachineLetter"], mapping_df["Rune"])
}

# Longest keys first, so multi-character tokens (e.g. "[TT]") are matched before single letters
keys = sorted(rune_mapping, key=len, reverse=True)
pattern = re.compile("|".join(re.escape(key) for key in keys))


def _to_rune(match):
    """Look up the rune for the token the regex just matched."""
    return rune_mapping[match.group(0)]


# --- Steps 3 & 4: Add 'RuneWord', order rows by Length ascending, fix the column order ---
df = (
    df.assign(RuneWord=df["MachineWord"].str.replace(pattern, _to_rune, regex=True))
    .sort_values("Length", ascending=True)
    .reset_index(drop=True)
    .loc[:, ["MachineWord", "RuneWord", "Length"]]
)

# --- Step 5: Write the result ---
df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"CSV file has been saved to {output_csv}.")

CSV file has been saved to MTWC.csv.


In [46]:
# --- Step 1: Read the CSV file ---
# Keep the word columns as raw strings. With pandas' default NA handling, words such as
# "NA" or "NULL" (and empty strings) are read back as NaN, and digit-only words as numbers,
# which makes the "".join(...) calls below raise TypeError.
df = pd.read_csv(
    "MTWC.csv",
    encoding="utf-8",
    dtype={"MachineWord": str, "RuneWord": str},
    keep_default_na=False,
)

# Compute overall token frequencies by concatenating all MachineWord values and counting characters.
# NOTE(review): this counts individual characters, so a multi-character token such as "[TT]"
# (mentioned where the rune pattern is built) is tallied as "[", "T", "T", "]" — confirm intended.
overall_counter = Counter("".join(df["MachineWord"]))

# Compute frequencies for words of length exactly 2 and 3 using vectorized filtering.
counter_eq2 = Counter("".join(df.loc[df["Length"] == 2, "MachineWord"]))
counter_eq3 = Counter("".join(df.loc[df["Length"] == 3, "MachineWord"]))

# --- Step 2: Load rune mapping from "runes_ml.csv" ---
mapping_df = pd.read_csv("runes_ml.csv", encoding="utf-8")
rune_mapping = dict(zip(mapping_df["MachineLetter"], mapping_df["Rune"]))

In [47]:
def _letter_row(token, total):
    """Build one table row: the letter, its rune (or the letter itself), and its three counts."""
    return (
        token,
        rune_mapping.get(token, token),
        total,
        counter_eq2.get(token, 0),
        counter_eq3.get(token, 0),
    )


data = [_letter_row(token, total) for token, total in overall_counter.items()]

# Most frequent letters first, indexed by the machine letter
freqs = (
    pd.DataFrame(data, columns=["MachineLetter", "Rune", "Frequency", "Freq_2", "Freq_3"])
    .sort_values("Frequency", ascending=False)
    .set_index("MachineLetter")
)

freqs

Unnamed: 0_level_0,Rune,Frequency,Freq_2,Freq_3
MachineLetter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,A,1554922,6,489
I,I,510521,29,283
E,E,467329,32,272
U,U,465895,27,285
R,R,432149,12,84
N,N,391200,13,172
K,K,384470,7,102
M,M,303910,11,126
T,T,258754,5,67
Y,Y,250173,4,71
