# 1. Import libraries

In [None]:
!pip install -q transformers datasets evaluate sentencepiece accelerate google-generativeai sacrebleu
!pip install -q pandas matplotlib seaborn

In [None]:

import os
import pickle

import pandas as pd
from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
from google.colab import drive


# Reached only if every import above succeeded.
print("‚úÖ Setup complete!")


‚úÖ Setup complete!


# 2. Load data

In [None]:
# Mount Drive
drive.mount('/content/drive')

# Project folder on the mounted Drive (matches the Drive folder layout);
# the dataset paths used in this notebook are built from it.
PROJECT_DIR = "/content/drive/MyDrive/NLP_final/NLP_Translation_Project"

print(" Loading datasets...")

Mounted at /content/drive
 Loading datasets...


In [None]:
print("üì• Loading data...")

# Load the train/validation splits stored under PROJECT_DIR.
train_dataset = load_from_disk(f"{PROJECT_DIR}/phomt_cleaned_train")
val_dataset = load_from_disk(f"{PROJECT_DIR}/phomt_validation")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles you produced yourself.
with open(f"{PROJECT_DIR}/mtet_test_data.pkl", 'rb') as f:
    test_data = pickle.load(f)

print(f"Original size: Train={len(train_dataset):,}, Val={len(val_dataset):,}")

# SAMPLE up to 10K train (min() keeps select() in range on smaller datasets)
train_dataset = train_dataset.shuffle(seed=42).select(range(min(10000, len(train_dataset))))

# SAMPLE up to 1K val
val_dataset = val_dataset.shuffle(seed=42).select(range(min(1000, len(val_dataset))))

# Convert the unpickled test data to a Dataset, whichever container it came in.
# `Dataset` is imported at the top of the notebook so every branch can use it
# (previously it was imported only inside the dict branch).
if isinstance(test_data, dict):
    test_dataset = Dataset.from_dict(test_data)
elif isinstance(test_data, list):
    test_dataset = Dataset.from_list(test_data)
else:
    # Anything else is assumed to be a pandas DataFrame -- TODO confirm
    test_dataset = Dataset.from_pandas(test_data)

# Sample up to 1K test
test_dataset = test_dataset.shuffle(seed=42).select(range(min(1000, len(test_dataset))))

print(f"\n‚úÖ Sampled: Train={len(train_dataset):,}, Val={len(val_dataset):,}, Test={len(test_dataset):,}")
print(f"Train columns: {train_dataset.column_names}")


# Show sample
print(f"\nüìã Sample train data:")
print(train_dataset[0])


üì• Loading data...
Original size: Train=498,193, Val=18,720

‚úÖ Sampled: Train=10,000, Val=1,000, Test=1,000
Train columns: ['vi', 'en']

üìã Sample train data:
{'vi': 'nh∆∞ng t·ª´ t·∫≠n ƒë√°y tr√°i tim m√¨nh t√¥i xin c√¥ th·ª© l·ªói cho t√¥i v√¨ ƒë√£ kh√¥ng ƒë·∫øn ƒë∆∞·ª£c.', 'en': 'But I beg you from the bottom of my heart to forgive me for not being there.'}


# 3. Format data

In [None]:
# ===== CELL 3: STANDARDIZE COLUMN NAMES =====

# Check and rename columns so that every split uses the same names
# Expected: 'source', 'target' or 'en', 'vi'

def standardize_columns(dataset):
    """Rename known column-name variants to the canonical 'vi' / 'en'.

    Args:
        dataset: Object exposing ``column_names`` and
            ``rename_column(old, new)`` (the latter must return the renamed
            dataset).

    Returns:
        The dataset after every recognised alias present in the original
        column list has been renamed; the input itself when no alias is
        present.
    """
    # (alias, canonical) pairs, applied in this order.
    aliases = (
        ('vi_test', 'vi'),
        ('en_test', 'en'),
        ('en_text', 'en'),
        ('vi_text', 'vi'),
    )

    # Snapshot the columns once; membership is checked against this snapshot,
    # not against the progressively renamed dataset.
    present = dataset.column_names

    for alias, canonical in aliases:
        if alias in present:
            dataset = dataset.rename_column(alias, canonical)

    return dataset

# Apply the same renaming to all three splits so they share column names.
train_dataset = standardize_columns(train_dataset)
val_dataset = standardize_columns(val_dataset)
test_dataset = standardize_columns(test_dataset)  # test split goes through the same renaming

print("‚úÖ Columns standardized!")
print(f"Train: {train_dataset.column_names}")
print(f"Test: {test_dataset.column_names}")

‚úÖ Columns standardized!
Train: ['vi', 'en']
Test: ['vi', 'en']


# 4. CREATE BIDIRECTIONAL DATA



In [None]:


def create_bidirectional_from_dataset(dataset):
    """Expand parallel sentence pairs into both translation directions.

    Each input row yields two records, in this order: Vi->En
    (``direction='vi2en'``) followed by En->Vi (``direction='en2vi'``).

    Args:
        dataset: Iterable of mapping-like rows. A row must contain either
            'en' and 'vi' keys, or 'source' and 'target' keys (in which case
            'source' is taken to be Vietnamese and 'target' English).
            Rows matching neither schema are skipped.

    Returns:
        list[dict]: Records with 'source', 'target' and 'direction' keys.
    """
    records = []

    for row in dataset:
        if 'en' in row and 'vi' in row:
            en_side, vi_side = row['en'], row['vi']
        elif 'source' in row and 'target' in row:
            # Assume source is Vi, target is En
            vi_side, en_side = row['source'], row['target']
        else:
            # Unrecognised schema: skip the row.
            continue

        records.extend((
            {'source': vi_side, 'target': en_side, 'direction': 'vi2en'},
            {'source': en_side, 'target': vi_side, 'direction': 'en2vi'},
        ))

    return records

print("üîÑ Creating bidirectional data...")

# Expand each (vi, en) pair into one record per translation direction.
train_bi_list = create_bidirectional_from_dataset(train_dataset)
val_bi_list = create_bidirectional_from_dataset(val_dataset)

# Convert to Dataset (`Dataset` comes from the notebook's import cell)
train_bi = Dataset.from_list(train_bi_list)
val_bi = Dataset.from_list(val_bi_list)

# Shuffle so the two directions of a pair are no longer adjacent
train_bi = train_bi.shuffle(seed=42)
val_bi = val_bi.shuffle(seed=42)

print(f"‚úÖ Bidirectional data created!")
print(f"   Train: {len(train_bi):,} (doubled from {len(train_dataset):,})")
print(f"   Val: {len(val_bi):,}")

# Check sample: next() stops at the first matching row instead of building a
# list of every matching row just to take element [0]; None if nothing matches.
print(f"\nüîç Sample Vi‚ÜíEn:")
sample_vi2en = next((item for item in train_bi if item['direction'] == 'vi2en'), None)
print(sample_vi2en)

print(f"\nüîç Sample En‚ÜíVi:")
sample_en2vi = next((item for item in train_bi if item['direction'] == 'en2vi'), None)
print(sample_en2vi)


üîÑ Creating bidirectional data...
‚úÖ Bidirectional data created!
   Train: 20,000 (doubled from 10,000)
   Val: 2,000

üîç Sample Vi‚ÜíEn:
{'source': 'V·∫£ l·∫°i, chuy·ªán c√°c ng∆∞·ªùi b√πm l·∫´n nhau kh√¥ng d√≠nh d√°ng g√¨ t·ªõi ch√∫ng t√¥i.', 'target': "Besides, it's none of our business if you guys wanna bump each other off.", 'direction': 'vi2en'}

üîç Sample En‚ÜíVi:
{'source': '- I can take Carl.', 'target': '- T·ªõ s·∫Ω nh·∫≠n Carl.', 'direction': 'en2vi'}


# 5. HANDLE TEST DATA

In [None]:


print("üîÑ Converting test data...")

# Check test_dataset columns
print(f"Test columns: {test_dataset.column_names}")
print(f"Test size: {len(test_dataset)}")

# Use same bidirectional function as for train/validation, so the test split
# gets the same 'source' / 'target' / 'direction' records.
# Note: unlike train_bi / val_bi, test_bi is not shuffled in this cell.
test_bi_list = create_bidirectional_from_dataset(test_dataset)
test_bi = Dataset.from_list(test_bi_list)

print(f"‚úÖ Test data converted: {len(test_bi):,}")
print(f"üîç Sample: {test_bi[0]}")


üîÑ Converting test data...
Test columns: ['vi', 'en']
Test size: 1000
‚úÖ Test data converted: 2,000
üîç Sample: {'source': 'Nh∆∞ng n·∫øu kh√¥ng ai cho ch√∫ng th·∫•y vi·ªác ƒë√≥ , n·∫øu ch√∫ng kh√¥ng ƒë∆∞·ª£c th·∫•y nh·ªØng ·∫£nh h∆∞·ªüng c·ªßa th·ª±c ph·∫©m t·ªõi tr√≠ n√£o v√† c∆° th·ªÉ , ch√∫ng m√π qu√°ng s·∫Ω ƒÉn b·∫•t c·ª© c√°i g√¨ ƒë∆∞·ª£c ƒë·∫∑t tr∆∞·ªõc m·∫∑t .', 'target': "But when none of this is presented to them , if they 're not shown how food affects the mind and the body , they blindly eat whatever the hell you put in front of them .", 'direction': 'vi2en'}


# 6. CREATE BASELINE DATASET & SAVE


In [None]:

# Bundle the three bidirectional splits under the conventional split names.
baseline_datasets = DatasetDict({
    'train': train_bi,
    'validation': val_bi,
    'test': test_bi
})

print("‚úÖ Baseline dataset created:")
print(baseline_datasets)

# Save to Drive; the quality-check cell reloads it from this same path.
baseline_datasets.save_to_disk(f"{PROJECT_DIR}/dataset_baseline")
print(f"\nüíæ Saved to {PROJECT_DIR}/dataset_baseline")


‚úÖ Baseline dataset created:
DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'direction'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['source', 'target', 'direction'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['source', 'target', 'direction'],
        num_rows: 2000
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]


üíæ Saved to /content/drive/MyDrive/NLP_final/NLP_Translation_Project/dataset_baseline


# 7. Check data quality

In [None]:
# ===== CHECK DATA QUALITY (FIXED) =====
# Reloads the saved baseline dataset from disk and prints sanity statistics.
# Columns are read once per check (data['source']) rather than indexing row by
# row (data[i]['source']), which fetched a complete row for every single lookup.

import random

from datasets import load_from_disk
import pandas as pd

PROJECT_DIR = "/content/drive/MyDrive/NLP_final/NLP_Translation_Project"
baseline = load_from_disk(f"{PROJECT_DIR}/dataset_baseline")


def _is_blank(text):
    """Return True when text is None, empty, or whitespace-only."""
    return not text or text.strip() == ''


print("="*80)
print("üîç DATA QUALITY CHECK")
print("="*80)

# 1. Check sizes
print(f"\nüìä Dataset sizes:")
for split in baseline:
    print(f"   {split}: {len(baseline[split]):,}")

# 2. Check for NaN/empty
print(f"\nüßπ Checking for NaN/empty values:")
for split in ['train', 'validation', 'test']:
    data = baseline[split]
    empty_source = sum(1 for text in data['source'] if _is_blank(text))
    empty_target = sum(1 for text in data['target'] if _is_blank(text))
    print(f"   {split}: Empty source={empty_source}, Empty target={empty_target}")

# 3. Check length distribution (whitespace-separated words, first 1000 rows at most)
print(f"\nüìè Length statistics (words):")
train_data = baseline['train']
length_sample = train_data[:min(1000, len(train_data))]  # slice -> dict of column lists
source_lens = [len(text.split()) for text in length_sample['source']]
target_lens = [len(text.split()) for text in length_sample['target']]

print(f"   Source - Min:{min(source_lens)}, Max:{max(source_lens)}, Avg:{sum(source_lens)/len(source_lens):.1f}")
print(f"   Target - Min:{min(target_lens)}, Max:{max(target_lens)}, Avg:{sum(target_lens)/len(target_lens):.1f}")

# 4. Check duplicates (exact-match duplicates of the source text only)
print(f"\nüîÑ Checking duplicates in train:")
sources = list(baseline['train']['source'])
unique_sources = len(set(sources))
print(f"   Total: {len(sources):,}, Unique: {unique_sources:,}, Duplicates: {len(sources) - unique_sources:,}")

# 5. Check direction balance
print(f"\n‚öñÔ∏è Direction balance:")
for split in ['train', 'validation', 'test']:
    data = baseline[split]
    directions = list(data['direction'])
    vi2en = directions.count('vi2en')
    en2vi = directions.count('en2vi')
    print(f"   {split}: Vi‚ÜíEn={vi2en:,}, En‚ÜíVi={en2vi:,}")

# 6. Show samples (fixed seed so the same rows are shown on every run)
print(f"\nüìù Random samples:")
random.seed(42)
for idx in random.sample(range(len(baseline['train'])), 3):
    sample = baseline['train'][idx]
    # The label is the 1-based row position; text is truncated to 100 characters.
    print(f"\n{idx+1}. [{sample['direction']}]")
    print(f"   Source: {sample['source'][:100]}...")
    print(f"   Target: {sample['target'][:100]}...")

print("\n" + "="*80)
print("‚úÖ Quality check complete!")


üîç DATA QUALITY CHECK

üìä Dataset sizes:
   train: 20,000
   validation: 2,000
   test: 2,000

üßπ Checking for NaN/empty values:
   train: Empty source=0, Empty target=0
   validation: Empty source=0, Empty target=0
   test: Empty source=0, Empty target=0

üìè Length statistics (words):
   Source - Min:3, Max:94, Avg:14.3
   Target - Min:3, Max:121, Avg:14.9

üîÑ Checking duplicates in train:
   Total: 20,000, Unique: 19,994, Duplicates: 6

‚öñÔ∏è Direction balance:
   train: Vi‚ÜíEn=10,000, En‚ÜíVi=10,000
   validation: Vi‚ÜíEn=1,000, En‚ÜíVi=1,000
   test: Vi‚ÜíEn=1,000, En‚ÜíVi=1,000

üìù Random samples:

3649. [en2vi]
   Source: There's a protein found in milk called casein that acts like a detergent against capsaicin....
   Target: Protein casein c√≥ trong s·ªØa ho·∫°t ƒë·ªông nh∆∞ m·ªôt ch·∫•t t·∫©y ƒë·ªëi v·ªõi capsaicin....

820. [vi2en]
   Source: Priestley t√≠ch h·ª£p nhi·ªÅu y·∫øu t·ªë c·ªßa truy·ªÅn thuy·∫øt "Rogue Trader" v√†o Warhammer 40.000, ch·ªß y·∫øu l√† nh.