In [13]:
import csv
import pandas as pd
import re

# Regular expression to filter out marker lines (e.g. "#4/5", "104", etc.)
MARKER_PATTERN = re.compile(r'^\s*#?\d+(\/\d+)?\s*$')

def filter_lines(raw_lines):
    """
    Remove blank lines and marker lines from the raw lines.
    Returns a list of stripped, filtered lines.
    """
    filtered = []
    for line in raw_lines:
        stripped = line.strip()
        if not stripped:
            continue  # skip blank lines
        if MARKER_PATTERN.match(stripped):
            continue  # skip marker lines
        filtered.append(stripped)
    return filtered

def contains_myanmar(text):
    """
    Returns True if the text contains at least one character in the Myanmar Unicode block.
    The Myanmar block is roughly U+1000 to U+109F.
    """
    for ch in text:
        if '\u1000' <= ch <= '\u109F':
            return True
    return False

def is_myanmar(text):
    """
    Returns True if the text is considered Myanmar text.
    For our purposes, if it contains any Myanmar characters, we assume it is Myanmar.
    """
    return contains_myanmar(text)

def is_english(text):
    """
    Returns True if the text does not contain any Myanmar characters.
    """
    return not contains_myanmar(text)

def pair_lines(filtered_lines):
    """
    Iterate through the filtered lines in order and pair an English sentence
    with the next Myanmar sentence that follows.
    
    If two (or more) English lines appear consecutively, the last one is used.
    If a Myanmar line appears without a preceding English line, it is skipped.
    
    Returns a list of [English, Myanmar] pairs.
    """
    pairs = []
    pending_english = None
    for line in filtered_lines:
        if is_english(line):
            # Update the pending English sentence.
            pending_english = line
        elif is_myanmar(line):
            if pending_english:
                pairs.append([pending_english, line])
                pending_english = None  # clear after pairing
            else:
                # No pending English sentence; skip this Myanmar line.
                continue
    return pairs

def clean_english(text):
    """
    Remove all punctuation from the English text while preserving spaces.
    This uses a regex that removes any character that is not a word character or whitespace.
    """
    cleaned = re.sub(r'[^\w\s]', '', text)
    return cleaned

def clean_myanmar(text):
    """
    Remove all punctuation and spaces from the Myanmar text.
    This regex keeps only characters in the Myanmar Unicode range (U+1000 to U+109F).
    """
    # Remove any character that is not in the Myanmar block.
    cleaned = re.sub(r'[^\u1000-\u109F]', '', text)
    return cleaned

def write_sample_data():
    """
    Writes sample data to 'my_01.txt' for demonstration purposes.
    This sample intentionally includes extra blank lines and marker lines.
    
    Expected structure in the valid (filtered) lines:
      - Some lines are English.
      - Some lines are Myanmar.
    """
    sample_data = (
        "Metadata header\n"             # (to be ignored)
        "\n"                            # blank line
        "Everybody gets champagne and some people don't drink it so what's left the kids drink so we were going around drinking all this champagne.\n"  # English (valid)
        "လူတိုင်းက ရှန်ပိန်တွေရပြီး တချို့လူတွေက မသောက်ကြဘူးဆိုတော့ ကျန်တာတွေကို ကလေးတွေက သောက်ကြတာဆိုတော့ ဒီရှန်ပိန်တွေကို ငါတို့ အကုန်သောက်ခဲ့တယ်။\n"  # Myanmar (valid)
        "Extra info\n"                  # ignored
        "More info\n"                   # ignored
        "I mean that was the whole point.\n"  # English (valid)
        "ငါဆိုလိုတာက ဒါက အဓိက ဘဲလေ။\n"  # Myanmar (valid)
        "#4/5\n"                        # marker (to be removed)
        "သူက ဘယ်အချိန်လဲဆိုတာ မပြောခဲ့ပြန်ဘူး၊ ဒီတော့ ငါ့လဲ အဲဒီမှာ စိတ်ပူနေရပြီး ဒါက ဘယ်အချိန်မှာ လိုမှာလဲဆိုတာ မသိတော့ဘူး။\n"  # English? (but contains Myanmar punctuation) – language detection will consider it Myanmar since it has Myanmar characters.
        "It was probably the first thing I remember from being a little kid about, ah, especially about something that I'd done wrong.\n"  # English (valid)
        "He is from Greece and he is from a small village in Greece called Tokalleka and he came to America and I believe it was 1969 or 1970 and he shortly got married.\n"  # English (valid)
        "သူကတော့ ဂရိ က လာတာဖြစ်ပြီး ဂရိ က Tokalleka ဆိုတဲ့ ရွာငယ်လေးကပါ ဒီနောက် သူအမေရိကားကို လာခဲ့တာ အဲဒါ ၁၉၆၉ သို့မဟုတ် ၁၉၇၀ ကလို့ ငါထင်တယ် ဒီနောက် မကြာခင်မှာပဲ သူအိပ်ထောင်ပြုခဲ့တယ်။\n"  # Myanmar (valid)
        "#5/5\n"                        # marker (remove)
        "Nobody knew where they went.\n"  # English (valid)
        "And they couldn't stay in the Augusta area because people knew that they had tried to do something that was really taboo and try to pass for white.\n"  # Myanmar? (language detection: English only characters so will be considered English)
        "We were watching something on TV.\n"  # English (valid)
        "ကျွန်တော်တို့ တီဗီမှာ တခုခု ကြည့်နေခဲ့သည်။\n"  # Myanmar (valid)
        "ဒီလို အသေးအမွှားလေးတွေက ငါလုပ်နေတာတွေပေါ် အကြီးအကျယ် ပြောင်းလဲစေခဲ့တယ်။\n"  # English? Actually, contains Myanmar characters so considered Myanmar.
        "\"Um, and she said, she said, she said, Baby, she said, You don't understand about life the way I understand about life.\"\n"  # English (valid)
        "သူပြောတယ်၊ သူပြောတယ်၊ သူပြောတယ်၊ ကလေးရယ် တဲ့၊  သူပြောတယ်၊ ဘဝအ‌‌‌ကြောင်းကို ငါနားလည်သလို မင်းနားမလည်ဘူး။\n"  # Myanmar (valid)
        "စီအိုင်အေက ဒီရုပ်ရှင်ကို ဖြုတ်ချပြီး နောက်တနေ့မှာ ကုလသမဂ္ဂဆီကို ယူသွားလိုက်တယ်။\n"  # English? Contains Myanmar letters so treated as Myanmar.
        "\"So I went to, I went to Washington D.C. and I didn't go directly to, uh, that, uh, they had told me to on my orders.\"\n"  # English (valid)
        "သူကတော့ အသားနဲနဲဖြူတဲ့ လူမဲတစ်ယောက်ပါ။\n"  # Myanmar (valid)
        "\"So anyway, Dad goes and makes this nice big glass of chocolate milk for me.\"\n"  # English (valid)
        "\"Um, and so they just left town, and she, she never did see her sister again, never saw her sister again.\"\n"  # English (valid)
        "ဒီတော့ သူတို့ မြို့ကနေ ပြောင်းခဲ့ကြပြီး၊ သူ သူ့ရဲ့ ညီမကို ဘယ်တော့မှ ပြန်မတွေ့ခဲ့ဘူ။ သူ့ ညီမကို ဘယ်တော့မှ ပြန်မတွေ့ခဲ့ဘူး။\n"  # Myanmar (valid)
        "\"OK, can you hear me?\"\n"  # English (valid)
    )
    
    with open("my_01.txt", 'w', encoding='utf-8') as f:
        f.write(sample_data)
    print("Sample data written to my_01.txt.")

def pair_and_clean(filtered_lines):
    """
    Pair English and Myanmar sentences based on language detection,
    then clean the sentences:
      - Remove punctuation from both languages.
      - Additionally remove spaces from Myanmar sentences.
    
    Returns a list of [clean_english, clean_myanmar] pairs.
    """
    pairs = pair_lines(filtered_lines)
    cleaned_pairs = []
    for eng, myan in pairs:
        clean_eng = clean_english(eng)
        clean_myan = clean_myanmar(myan)
        cleaned_pairs.append([clean_eng, clean_myan])
    return cleaned_pairs

def main():
    all_data = []  # List to hold all [English, Myanmar] pairs
    df = []
    # Process files my_01.txt to my_100.txt.
    for i in range(1, 101):
        file_name = f"dataset/translation/my_{i:02d}.txt"  # e.g. my_01.txt, my_02.txt, etc.
        try:
            with open(file_name, 'r', encoding='utf-8') as f:
                raw_lines = f.readlines()
                # Filter out blank lines and marker lines.
                filtered = filter_lines(raw_lines)
                # Pair and clean the lines.
                cleaned_pairs = pair_and_clean(filtered)
                all_data.extend(cleaned_pairs)
        except FileNotFoundError:
            print(f"File {file_name} not found. Skipping.")
            continue
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
            continue

    # Write the collected, cleaned pairs to a CSV file.
    output_file = 'dataset/translation/dataset.csv'
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(all_data)
            df = pd.DataFrame(all_data, columns=['en', 'my'])
        print(f"Dataset successfully written to {output_file}")
    except Exception as e:
        print(f"Error writing dataset: {e}")
    
    return df

if __name__ == "__main__":
    # For demonstration, write sample data to my_01.txt.
    write_sample_data()
    df = main()

Sample data written to my_01.txt.
Dataset successfully written to dataset/translation/dataset.csv


In [14]:
df

Unnamed: 0,en,my
0,Everybody gets champagne and some people dont ...,လူတိုင်းကရှန်ပိန်တွေရပြီးတချို့လူတွေကမသောက်ကြဘ...
1,I mean that was the whole point,ငါဆိုလိုတာကဒါကအဓိကဘဲလေ။
2,He is from Greece and he is from a small villa...,သူကတော့ဂရိကလာတာဖြစ်ပြီးဂရိကဆိုတဲ့ရွာငယ်လေးကပါဒ...
3,We were watching something on TV,ကျွန်တော်တို့တီဗီမှာတခုခုကြည့်နေခဲ့သည်။
4,Um and she said she said she said Baby she sai...,သူပြောတယ်၊သူပြောတယ်၊သူပြောတယ်၊ကလေးရယ်တဲ့၊သူပြေ...
...,...,...
9907,Davidson shouldnt talk in a way where bone and...,သည်နှင့်ကိုအသံထွက်ညီသည့်နည်းလမ်းတစ်ခုဖြင့်စကား...
9908,It would be better if Davidson rhymed the word...,သည်နှင့်ဟူသောစကားလုံးများကိုကာရံမိလျှင်ပိုကောင...
9909,A 200000 word novel at 25 is a fair price,စကားလုံး၂၀၀၀၀၀ရှိသောဝတ္ထုတစ်ပုဒ်ကို၂၅ဒေါ်လာဆို...
9910,A 200000 word novel for 25 is 4000 words per d...,စကားလုံး၂၀၀၀၀၀ရှိသည့်ဝတ္ထုတစ်ပုဒ်သည်၂၅ဒေါ်လာရှ...


In [42]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Split the dataframe into train, test, and validation sets
train_df, temp_df = train_test_split(df, test_size=0.02, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Reset the index to avoid '__index_level_0__' column
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Convert the dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [43]:
from huggingface_hub import HfApi, create_repo, login

# the huggingface write token has been removed for security restrictions
# Save the dataset to disk
dataset_dict.save_to_disk('dataset/npu_a3_en_my')

Saving the dataset (1/1 shards): 100%|██████████| 9713/9713 [00:00<00:00, 1489553.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 99/99 [00:00<00:00, 41494.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 25003.30 examples/s]


In [45]:
# Create the repository if it does not exist
repo_id = 'st125338/npu_a3_en_my'
create_repo(repo_id, repo_type='dataset', private=False)

# Push the dataset to Hugging Face
dataset_dict.push_to_hub(repo_id)

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 794.04ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.41s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 733.40ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 677.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/st125338/npu_a3_en_my/commit/913dfc44311d4c4c322de86b820cd81f5613fa9b', commit_message='Upload dataset', commit_description='', oid='913dfc44311d4c4c322de86b820cd81f5613fa9b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/st125338/npu_a3_en_my', endpoint='https://huggingface.co', repo_type='dataset', repo_id='st125338/npu_a3_en_my'), pr_revision=None, pr_num=None)