# Notebook for exploring different text cleaning and filtering procedures

In [1]:
import unicodedata
import html
from pathlib import Path
from functools import partial

from tqdm.auto import tqdm

import pythainlp
import pandas as pd

# Unicode normalization form used by normalize_unicode(); one of 'NFC', 'NFD',
# 'NFKC', 'NFKD'. Any other value (e.g. 'NONE') disables normalization.
NORM_CODE = "NFKC"

def th_contain_escape_code(lang, text):
    """Tell whether a Thai segment contains known stray C1 control characters.

    The characters checked are U+009E, U+0095, U+0094, U+0093, U+0090 and
    U+0091. The test looks at the characters themselves rather than at
    ``repr(text)``, so ordinary text that merely spells out a backslash
    sequence such as ``\\x93`` is not flagged.

    Args:
        lang: Language code of ``text``; only "th" segments are checked.
        text: Segment to inspect.

    Returns:
        True if ``lang`` is "th" and at least one of the control characters
        occurs in ``text``; False otherwise.
    """
    if lang != "th":
        return False

    control_chars = ('\x9e', '\x95', '\x94', '\x93', '\x90', '\x91')
    # Coerce so that non-string cells (None, NaN, numbers) still yield False
    # instead of raising on the membership test.
    text = str(text)
    return any(char in text for char in control_chars)


def normalize_unicode(lang, text):
    """Apply Unicode normalization and strip two stray control characters.

    The normalization form is read from the module-level ``NORM_CODE``. Any
    value other than 'NFC', 'NFD', 'NFKC' or 'NFKD' (for example 'NONE')
    skips the normalization step.

    Args:
        lang: Language code of ``text``; only "th" gets the extra clean-up.
        text: String to normalize.

    Returns:
        The normalized string. For Thai text the characters U+0099 and
        U+009C are additionally removed.
    """
    # Membership alone decides whether to normalize. The former
    # ``NORM_CODE is not 'NONE'`` check compared identity against a string
    # literal, which is implementation-dependent and triggers a SyntaxWarning
    # on Python 3.8+; it was also redundant with this test.
    if NORM_CODE in ('NFC', 'NFD', 'NFKC', 'NFKD'):
        text = unicodedata.normalize(NORM_CODE, text)
    if lang == "th":
        return text.replace(u'\x99', u'').replace(u'\x9c', u'')
    return text


def str_strip(lang, text):
    """Convert ``text`` to ``str`` and trim leading/trailing whitespace.

    ``lang`` is accepted so all rules share one signature; it is not used.
    """
    as_string = str(text)
    return as_string.strip()


def normalize_text(lang, text):
    """Replace curly quotation marks in ``text`` with straight ASCII quotes.

    “ and ” become ", while ‘ and ’ become '. ``lang`` is not used.
    """
    straight_quotes = str.maketrans({
        '“': '"',
        '”': '"',
        '‘': "'",
        '’': "'",
    })
    return text.translate(straight_quotes)


def html_unescape(lang, text):
    """Decode HTML character references in ``text`` via ``html.unescape``.

    ``lang`` is accepted so all rules share one signature; it is not used.
    """
    decoded = html.unescape(text)
    return decoded


def normalize_thai_text(lang, text):
    """Run PyThaiNLP's text normalizer on Thai segments.

    Text in any other language is returned unchanged. Per the original
    author's note, the normalizer removes redundant tone marks and vowels
    and rewrites a doubled "เ" as "แ" (see the ``pythainlp.util.normalize``
    documentation for the exact rule set).
    """
    return pythainlp.util.normalize(text) if lang == "th" else text

def replace_escape_code(lang, text):
    """Swap selected C1 control characters for their ASCII punctuation.

    U+0093 and U+0094 become a double quote, U+0091 and U+0092 a single
    quote, and U+0096 a hyphen. U+009E, U+0095 and U+0090 are deliberately
    left without a replacement here. ``lang`` is not used, so the rule
    applies to every language.
    """
    replacements = {
        "\x94": "\"",
        "\x93": "\"",
        "\x91": "\'",
        "\x92": "\'",
        "\x96": "-",
    }
    for control_char, ascii_char in replacements.items():
        # str.replace is a no-op when the character is absent.
        text = text.replace(control_char, ascii_char)
    return text

def filter_thai_text_without_thai_chars(lang, text):
    """Flag Thai segments that contain no Thai consonant at all.

    Returns True (segment should be dropped) only when ``lang`` is "th" and
    none of the consonants ก through ฮ occurs in ``text``. Segments in any
    other language are never flagged.
    """
    if lang != "th":
        return False

    thai_consonants = frozenset(
        "กขฃคฅฆงจฉช"
        "ซฌญฎฏฐฑฒณด"
        "ตถทธนบปผฝพ"
        "ฟภมยรฤลฦวศ"
        "ษสหฬอฮ"
    )
    # Disjoint means not a single character of the text is a Thai consonant.
    return thai_consonants.isdisjoint(text)

def filter_blank_text(lang, text):
    """Flag segments whose text is exactly the empty string.

    ``lang`` is accepted so all rules share one signature; it is not used.
    """
    is_blank = text == ""
    return is_blank

# Text-cleaning functions, each called as rule(lang, text) and returning the
# cleaned string. The cleaning loop applies them in list order.
CLEANING_RULES = [
    str_strip,
    html_unescape,
    normalize_unicode,
    normalize_text,
    normalize_thai_text,
    replace_escape_code,
]

# Filtering predicates, each called as rule(lang, text). The cells that try
# them out pick a rule by position and store its boolean result in a
# `<lang>_text_to_drop` column.
FILTERING_RULES = [
    th_contain_escape_code,
    filter_thai_text_without_thai_chars,
    filter_blank_text,
]

  if NORM_CODE is not 'NONE' and NORM_CODE in ['NFC', 'NFD', 'NFKC', 'NFKD']:


# Load CSV and clean text

In [12]:
# Pick the sub-dataset to process (swap the commented line to use KDE4).
csv_file_path = "../data/scb-mt-en-th-2020+mt-opus/mt_opus_opensubtitles.csv"
# csv_file_path = "../data/scb-mt-en-th-2020+mt-opus/mt_opus_kde4.csv"
df = pd.read_csv(csv_file_path, encoding='utf-8')
csv_filename = Path(csv_file_path).stem

# Row count before any cleaning or filtering.
n_before = df.shape[0]

print(f'Begin cleaning, filtering from sub-dataset: {csv_filename}')
print(f'\nNumber of segment pairs (before): {n_before}')

# Run every cleaning rule, in order, over both language columns.
for lang in ['th', 'en']:
    column = f'{lang}_text'

    # Force every cell to str first so the rules can use string methods.
    df[column] = df[column].apply(str)

    for rule in CLEANING_RULES:
        df[column] = df[column].apply(partial(rule, lang))

Begin cleaning, filtering from sub-dataset: mt_opus_opensubtitles

Number of segment pairs (before): 2924561


# Testing out filtering rules

In [13]:
# Try the first filtering rule (FILTERING_RULES[0], th_contain_escape_code)
# on the Thai column; the per-row result goes into `th_text_to_drop`.
lang = "th"
_rule = partial(FILTERING_RULES[0], lang)

df[f'{lang}_text_to_drop'] = df[f'{lang}_text'].apply(_rule)

In [59]:
# Try the third filtering rule (FILTERING_RULES[2], filter_blank_text) on the
# English column; the per-row result goes into `en_text_to_drop`.
lang = "en"
_rule = partial(FILTERING_RULES[2], lang)

df[f'{lang}_text_to_drop'] = df[f'{lang}_text'].apply(_rule)

In [14]:
# Show the en/th text of every pair whose Thai side was flagged for dropping
df[df["th_text_to_drop"]== True][["en_text","th_text"]]

Unnamed: 0,en_text,th_text
215088,"'I'll try you at work. Bye, bye.'","เธเธตเนเธเธทเธญเธเธณเธเธฑเธ""เธชเธดเธ"
215373,"'Hello, pickle. It's me, Mum.",เธญเธขเนเธฒเธเธเนเธญเธขเธเนเนเธเนเธเธ...
215391,One unifying detail seems to be 'that the atta...,'การโจมตีเกิดขึ้น... ' เนเธเธฒเธฐเธญเธธเนเธ...
215482,'People receiving bites have experienced heada...,เธเธฒเธเธเธตเนเธเธฒเธเนเนเธญเธฒเธเธฑเ...
215557,How's that for a slice of fried gold?,เธเธญเนเธฅเนเธง
...,...,...
2390688,"Sorry about the house, sir.",เธฅเธตเธญเธฒเธซเน เธกเธตเนเธเธฃเธซเธฒเธเธธ...
2390693,"- Heroes, every one of them. -Dad!",เนเธฎเน เธเธกเนเธเธตเธขเธเนเธเธฃเธกเธฒเ...
2390695,And they will be remembered.,"เธ""เธต เนเธเนเธกเธฑเธเธขเธธเนเธเธกเธฒเธ"
2390699,"And in this, not only did they fail... they gr...","เนเธเน, เธเธฑเธเนเธเธดเนเธ, เธเธฑเธ, ..."


In [48]:
# Print every flagged pair as "<english> | <thai>", one pair per line.
to_delete_df = df[df["th_text_to_drop"]== True]
# Walk the two columns in lockstep instead of indexing each row by position;
# this avoids building a row Series twice per iteration via .iloc.
for en_text_rm, th_text_rm in zip(to_delete_df["en_text"], to_delete_df["th_text"]):
    print(f"{en_text_rm} | {th_text_rm}")

2003-07-03 | 2003- 07- 03
1 | 1
%1, %2 | % 1,% 2
2 | 2
3 | 3
%1 ISO | ISO% 1
%1 (%2) | % 1 (% 2)
LibKExiv2 | LibKExiv2
LibExiv2 | LibExiv2
x: %1 y: %2 | x:% 1 y:% 2
%1 %2 | % 1% 2
%1 - %2 | % 1% 2
(%1, %2) RGBA: %3, %4, %5, %6 | (% 1,% 2) RGBA:% 3,% 4,% 5,% 6
%1 (rev.: %2) | % 1 (% 2)
(%1, %2)(%3x%4) | (% 1,% 2) (% 3x% 4)
David Hodson | (c) 2004- 2007, Gilles Caulier
(c) 2004-2005, Gilles Caulier (c) 2006-2007, Gilles Caulier and Marcel Wiesweg | (c) 2004- 2008, Gilles Caulier
A digiKam image plugin to simulate infrared film. | (c) 2008, Adrian Schroeter (c) 2008, Gilles Caulier
(C) 2009, Percy Camilo Triveño Aucahuasi (C) 2000, Artur Rataj | (C) 2008, Percy Camilo Triveño Aucahuasi (C) 2000, Artur Rataj
(c) 1997-98 Bernd Johannes Wuebben | (c) 1997- 98 Bernd Johannes Wuebben
(C) 2005-2006 Gaël de Chalendar | (C) 2005- 2006 Gaël de Chalendar
(c) 2005-2006, Gaël de Chalendar & lt; kleag@free. frgt; | (C) 2005- 2006 Gael de ChalendarPage size
(C) 2005-2006 Gael de Chalendar | (C) 2005- 2

In [24]:
# English side of the first pair flagged for dropping
df[df["th_text_to_drop"]== True].iloc[0]["en_text"]

'A hacker called the "Puppet Master" has begun to infiltrate terminals throughout our network.'

In [25]:
# Thai side of the first pair flagged for dropping
df[df["th_text_to_drop"]== True].iloc[0]["th_text"]

'นักเจาะระบบฉายา \x93นักเชิดหุ่น\x94 ได้เริ่มแทรกซึม เข้าสู่ระบบเครือข่ายของเรา'

In [49]:
# Keep only the two text columns; any other column in df is discarded
df = df[["en_text","th_text"]]


Unnamed: 0,en_text,th_text
0,The & dolphin; Handbook,คู่มือ & dolphin;
1,& dolphin; is the default file manager for & k...,& dolphin; คือโปรแกรมจัดการแฟ้มสำหรับ & kde; ซ...
2,Filemanager,โปรแกรมจัดการแฟ้ม
3,Introduction,บทนำ
4,& dolphin; is & kde; 's default file manager. ...,& dolphin; คือโปรแกรมจัดการแฟ้มสำหรับ & kde; ซ...
...,...,...
52497,fade curve,เส้นโค้งกำกับการหรี่เสียง
52498,current volume,ระดับเสียงปัจจุบัน
52499,volume to fade to,ระดับเสียงที่จะให้หรี่ไป
52500,fade time in milliseconds,ระยะเวลาในการหรี่เสียงในหน่วยมิลลิวินาที


In [51]:
# Write the current frame to test.csv without the index column
df.to_csv("test.csv", index=False)