In [None]:
import pandas as pd
import re

In [None]:
def prepare_samam_data(path="../data/raw/samam/glossary_df.csv"):
    """Load the Samam glossary and split every entry into a word and a meaning.

    Each value of the ``Malayalam`` column is parsed as follows:

    * if it contains ``-``, the text before the first ``-`` is the word and
      everything after it is the meaning;
    * otherwise it is split at the first numbered marker such as ``(1)``;
    * if neither separator is present, the whole text is the word and the
      meaning is an empty string.

    Any remaining ``(n)`` markers are then removed and surrounding whitespace
    is stripped from both parts.

    Parameters
    ----------
    path : str or path-like, optional
        Tab-separated file containing a ``Malayalam`` column. Defaults to the
        location this function has always read from.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word`` and ``Meaning``. Entries that contained ``-`` come
        first, followed by the remaining entries, each group in file order,
        with a fresh 0..n-1 index. Rows whose ``Malayalam`` cell is missing
        are skipped.
    """
    samam_df = pd.read_csv(path, sep='\t')

    # Missing cells are read as NaN (a float) and cannot be parsed by the
    # regexes below, so they are dropped instead of raising TypeError.
    entries = samam_df["Malayalam"].dropna().astype(str)

    numbered_marker = re.compile(r'\(\d+\)')
    numbered_separator = re.compile(r'\s*\(\d+\)\s*')

    def clean(text):
        # Remove every "(n)" marker, then trim surrounding whitespace.
        return numbered_marker.sub('', text).strip()

    # Two buckets keep the historical output order: '-' entries first.
    dashed, undashed = [], []
    for text in entries:
        word, dash, meaning = text.partition('-')
        if dash:
            dashed.append((clean(word), clean(meaning)))
            continue

        # No '-': fall back to splitting at the first "(n)" marker.
        parts = numbered_separator.split(text, maxsplit=1)
        if len(parts) == 2:
            word, meaning = parts
        else:
            word, meaning = text, ""  # no separator at all
        undashed.append((clean(word), clean(meaning)))

    # Building the frame from plain records always yields both columns, even
    # when one (or both) of the buckets is empty.
    return pd.DataFrame(dashed + undashed, columns=['Word', 'Meaning'])

def prepare_datuk_data(path="../data/datuk/files/datuk"):
    """Load the Datuk dictionary as a cleaned ``Word`` / ``Meaning`` table.

    Reads the ``from_content`` and ``to_content`` columns, removes a trailing
    run of digits (with at most one whitespace character before it) from each
    string, strips surrounding whitespace, and renames the columns to
    ``Word`` and ``Meaning``.

    Parameters
    ----------
    path : str or path-like, optional
        Tab-separated file containing ``from_content`` and ``to_content``
        columns. Defaults to the location this function has always read from.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word`` and ``Meaning`` with the same index as the file's
        rows. Cells that are not strings (e.g. missing values) are passed
        through unchanged.
    """
    datuk = pd.read_csv(path, sep='\t')

    trailing_number = re.compile(r'\s?\d+$')

    def clean(value):
        # Missing cells arrive as NaN (a float); the regex would raise
        # TypeError on them, so leave non-string values untouched.
        if not isinstance(value, str):
            return value
        return trailing_number.sub('', value).strip()

    # Series.map is applied per column in place of the deprecated
    # DataFrame.applymap.
    return pd.DataFrame({
        "Word": datuk["from_content"].map(clean),
        "Meaning": datuk["to_content"].map(clean),
    })

In [None]:
# Build the two cleaned Word/Meaning tables (the loads are independent).
datuk_cleaned_df = prepare_datuk_data()
samam_cleaned_df = prepare_samam_data()

# Keep only the Samam rows whose Word does not occur in the Datuk table.
test_data_from_samam = samam_cleaned_df.loc[
    ~samam_cleaned_df['Word'].isin(datuk_cleaned_df['Word'])
]

In [None]:
# Render the filtered frame as this cell's output.
test_data_from_samam