In [1]:
import pandas as pd
import random
from tqdm import tqdm

# Lower-case Vietnamese letters mapped to their unaccented ASCII base letter.
_VIETNAMESE_LOWER_TO_ASCII = {
    'đ': 'd',
    'á': 'a', 'à': 'a', 'ả': 'a', 'ã': 'a', 'ạ': 'a',
    'ă': 'a', 'ắ': 'a', 'ằ': 'a', 'ẳ': 'a', 'ẵ': 'a', 'ặ': 'a',
    'â': 'a', 'ấ': 'a', 'ầ': 'a', 'ẩ': 'a', 'ẫ': 'a', 'ậ': 'a',
    'é': 'e', 'è': 'e', 'ẻ': 'e', 'ẽ': 'e', 'ẹ': 'e',
    'ê': 'e', 'ế': 'e', 'ề': 'e', 'ể': 'e', 'ễ': 'e', 'ệ': 'e',
    'í': 'i', 'ì': 'i', 'ỉ': 'i', 'ĩ': 'i', 'ị': 'i',
    'ó': 'o', 'ò': 'o', 'ỏ': 'o', 'õ': 'o', 'ọ': 'o',
    'ô': 'o', 'ố': 'o', 'ồ': 'o', 'ổ': 'o', 'ỗ': 'o', 'ộ': 'o',
    'ơ': 'o', 'ớ': 'o', 'ờ': 'o', 'ở': 'o', 'ỡ': 'o', 'ợ': 'o',
    'ú': 'u', 'ù': 'u', 'ủ': 'u', 'ũ': 'u', 'ụ': 'u',
    'ư': 'u', 'ứ': 'u', 'ừ': 'u', 'ử': 'u', 'ữ': 'u', 'ự': 'u',
    'ý': 'y', 'ỳ': 'y', 'ỷ': 'y', 'ỹ': 'y', 'ỵ': 'y',
}

# Translation table built once at import time. Upper-case entries are derived
# from the lower-case ones so that e.g. 'À' -> 'A' just like 'Đ' -> 'D'.
_VIETNAMESE_TO_ASCII = str.maketrans({
    **_VIETNAMESE_LOWER_TO_ASCII,
    **{
        accented.upper(): plain.upper()
        for accented, plain in _VIETNAMESE_LOWER_TO_ASCII.items()
    },
})


def flatten_sentence(sentence):
    """Remove spaces and strip Vietnamese diacritics from *sentence*.

    Args:
        sentence: Text to flatten.

    Returns:
        The text with every space character (" ") removed and every
        Vietnamese accented letter, lower or upper case, replaced by its
        ASCII base letter of the same case. Characters that are not in the
        mapping are kept unchanged.
    """
    import unicodedata

    # Compose "base letter + combining marks" sequences into single code
    # points first; otherwise decomposed (NFD) input would never match the
    # precomposed keys of the translation table.
    composed = unicodedata.normalize('NFC', sentence)
    return composed.replace(" ", "").translate(_VIETNAMESE_TO_ASCII)

def repeat_characters(sentence):
    """Stretch every alphabetic character of *sentence* by a random factor.

    Each character for which ``str.isalpha()`` is true is repeated between
    3 and 6 times (inclusive, drawn with ``random.randint``); every other
    character is copied through once.

    Args:
        sentence: Text whose letters should be repeated.

    Returns:
        The stretched text.
    """
    stretched = []
    for symbol in sentence:
        if symbol.isalpha():
            # One random draw per letter, in left-to-right order.
            stretched.append(symbol * random.randint(3, 6))
        else:
            stretched.append(symbol)
    return ''.join(stretched)

def add_random_noise(sentence):
    """Insert random lower-case ASCII letters at random positions.

    The number of inserted letters is a random 5-10% of the length of
    *sentence*, truncated to an integer, with a minimum of one letter.

    Args:
        sentence: Text to corrupt.

    Returns:
        A copy of *sentence* with the noise letters inserted.
    """
    fraction = random.uniform(0.05, 0.1)
    amount = int(len(sentence) * fraction)
    if amount < 1:
        # Always insert at least one letter, even for empty/short input.
        amount = 1

    extra_letters = random.choices("abcdefghijklmnopqrstuvwxyz", k=amount)

    characters = list(sentence)
    for letter in extra_letters:
        # The upper bound is inclusive, so a letter may land at the very end.
        characters.insert(random.randint(0, len(characters)), letter)
    return ''.join(characters)

def create_noise(sentence, variations=5):
    """Produce several noisy variants of *sentence*.

    The sentence is flattened once with ``flatten_sentence``; each variant
    is then made by passing that flattened text through
    ``repeat_characters`` followed by ``add_random_noise``.

    Args:
        sentence: Source sentence.
        variations: Number of noisy variants to generate (default 5).

    Returns:
        A list of ``variations`` noisy strings.
    """
    base_text = flatten_sentence(sentence)
    return [
        add_random_noise(repeat_characters(base_text))
        for _ in range(variations)
    ]

# Read the input file: one sentence per line.
input_file = r'..\dataset\vi_sents.txt'  # Input path, relative to the working directory
with open(input_file, 'r', encoding='utf-8') as file:
    # Iterate the file directly instead of materializing readlines(), and
    # skip blank lines: they would otherwise yield rows with an empty
    # 'Original' whose 'Noisy' text is nothing but random letters.
    lines = [text for text in (raw.strip() for raw in file) if text]

# Generate the noisy variants for every line, showing progress with tqdm.
data = []
for line in tqdm(lines, desc="Processing lines"):
    for noisy_line in create_noise(line):
        data.append({'Original': line, 'Noisy': noisy_line})

# Save the (Original, Noisy) pairs to a CSV file.
# Raw string: '\d' and '\o' are invalid escape sequences in a normal literal.
output_file = r'..\dataset\output.csv'
df = pd.DataFrame(data)
df.to_csv(output_file, index=False, encoding='utf-8')

print(f"Output saved to {output_file}")


Processing lines: 100%|██████████| 254090/254090 [00:34<00:00, 7363.32it/s]


Output saved to ..\dataset\output.csv
