# Step 1. Preparing Dataset
In this notebook, we clean Bengali sentences. Before running it, add a `fold` column to the given `train.csv` and save the result as `train_with_fold.csv`; that is the file this notebook reads.

# Libraries

In [None]:
# Install the third-party packages that the import cell below relies on.
!pip install joblib
!pip install indic-nlp-library
!pip install bnunicodenormalizer
# Download bn.txt into the current directory; get_clean_data() opens it by that name.
!wget -P . https://objectstore.e2enetworks.net/ai4b-public-nlu-nlg/indic-corp-frozen-for-the-paper-oct-2022/bn.txt

In [None]:
import re
import argparse
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from bnunicodenormalizer import Normalizer
from indicnlp.tokenize.indic_tokenize import trivial_tokenize

# Process Raw Text

In [None]:
def make_pattern_from_dict():
    """Return the regex character class matching every character to be blanked out.

    The class is negated: it matches anything that is NOT in the Bengali
    Unicode block (U+0980-U+09FF), a space, or one of the four punctuation
    marks kept by this notebook (',', '।', '?', '-').

    Returns:
        str: the pattern source, suitable for ``re.sub(pattern, ' ', text)``.
    """
    # The \uXXXX escapes must be resolved by Python, while the backslash-escaped
    # punctuation must reach the regex engine verbatim. Splitting the literal
    # into a normal part and a raw part yields exactly the same string as
    # before, without relying on invalid escape sequences such as '\,' or '\?'
    # (a SyntaxWarning on recent Python versions).
    return '[^\u0980-\u09FF' r'\ \,\।\?\-]'

def find_punctuation_count(line):
    """Return the sentence followed by its per-mark punctuation counts.

    Args:
        line (str): a cleaned sentence.

    Returns:
        list: ``[line, n_comma, n_danda, n_question, n_hyphen]`` where the
        counts are the number of occurrences of ',', '।', '?' and '-'.
    """
    # Count characters directly. The previous implementation collected
    # punctuation *runs* with re.findall and then used list.count(ch), which
    # only matched runs that were exactly one character long, so marks inside
    # a run such as '??' or ',-' were silently counted as zero.
    return [line] + [line.count(ch) for ch in (',', '।', '?', '-')]

def process_sent(sent):
    """Normalize a sentence and re-join its tokens with single spaces.

    Relies on the module-level ``normalize`` callable, which is installed by
    ``get_clean_data()``; calling this before that function has run raises
    NameError.
    """
    # Tokenize the normalized text as Bengali ('bn'), then space-separate.
    tokens = trivial_tokenize(normalize(sent), 'bn')
    return ' '.join(tokens)

def filter_line(line):
    """Clean one raw line, or reject it.

    Reads the module-level globals ``punc``, ``pattern`` and ``regex`` (set by
    ``get_clean_data()``) and calls ``process_sent``.

    A line is kept only if it
      * contains at least one of the tracked punctuation marks,
      * does not start with a punctuation mark once cleaned, and
      * has no two punctuation marks adjacent after spaces are ignored.

    Args:
        line (str): raw input line.

    Returns:
        str | None: the cleaned, single-spaced sentence, or None when the line
        is rejected or nothing is left after cleaning. Callers only test the
        result for truthiness.
    """
    if not re.search(punc, line):
        return None

    processed = process_sent(line.strip())
    # Blank out every character outside the allowed set, then collapse the
    # whitespace *before* inspecting the first character. Previously the check
    # looked at the un-collapsed text, so a line whose leading characters had
    # just been replaced by spaces (e.g. "A, ...") slipped through and produced
    # an output sentence that began with a punctuation mark.
    cleaned = ' '.join(re.sub(pattern, ' ', processed).split())

    if cleaned and cleaned[0] in (',', '।', '?', '-'):
        return None
    # Reject sentences where two or more marks are adjacent, ignoring spaces.
    if regex.search(cleaned.replace(' ', '')):
        return None
    return cleaned or None

def _unify_terminators(sentence):
    """Normalize sentence-ending marks in one raw sentence.

    '!' becomes '।'; any run of '।'/'?' that contains a '?' becomes a single
    '?'; remaining repeated '।' are squeezed to one.
    """
    sentence = re.sub('!', '।', sentence)
    sentence = re.sub(r'[\।\?]*\?[\।\?]*', '?', sentence)
    return re.sub('।+', '।', sentence)


def _write_processed(line_list, out_path):
    """Filter ``line_list`` and write the kept sentences with their counts as a TSV.

    Args:
        line_list: iterable of raw sentences/lines.
        out_path (str): destination file; it is overwritten.
    """
    # Explicit UTF-8 so Bengali text is written correctly regardless of the
    # platform default encoding; `with` guarantees the handle is closed even
    # if processing raises.
    with open(out_path, 'w', encoding='utf-8') as outfile:
        print("sentence\tcomma_count\tend_count\tqm_count\thyp_count", file=outfile)

        # NOTE(review): filter_line runs inside the generator that feeds
        # Parallel, so it appears to execute in the parent process; only
        # find_punctuation_count is dispatched to workers - confirm before
        # relying on n_jobs for speed.
        gen = (filter_line(line) for line in line_list)
        out = Parallel(n_jobs=-1)(delayed(find_punctuation_count)(line) for line in tqdm(gen) if line)

        for line in tqdm(out):
            print(*line, sep="\t", file=outfile)


def get_clean_data():
    """Run the full cleaning pipeline and write all processed TSV files.

    Side effects:
      * sets the module-level globals ``pattern``, ``punc``, ``regex`` and
        ``normalize`` that ``filter_line`` / ``process_sent`` read;
      * reads ``train_with_fold.csv`` (columns ``sentence`` and ``fold``) and
        writes ``processed_given_train.tsv`` (fold != 0) and
        ``processed_given_valid.tsv`` (fold == 0);
      * reads ``bn.txt`` and writes ``processed_indiccorpv2_0.tsv`` ..
        ``processed_indiccorpv2_3.tsv``: three parts of 10,000,000 lines each
        and a fourth part holding every remaining line.
    """
    global pattern, punc, regex, normalize
    from itertools import islice

    # Raw strings produce the same pattern text as before without invalid
    # escape sequences.
    punc = r'[\,\।\?\-]+'
    regex = re.compile(r'[\,\।\?\-]{2,}')
    bnorm = Normalizer()
    def normalize(sentence):
        # Normalize word by word; words for which the normalizer yields None are dropped.
        word = [bnorm(word)['normalized'] for word in sentence.split()]
        return " ".join([w for w in word if w is not None])
    pattern = make_pattern_from_dict()

    # The CSV is read once; the terminator clean-up is row-wise, so applying it
    # before the fold split gives the same sentences as doing it per split.
    given_df = pd.read_csv('train_with_fold.csv', usecols=['sentence', 'fold'])
    given_df['sentence'] = given_df['sentence'].apply(_unify_terminators)
    is_valid_fold = given_df['fold'] == 0

    print("--------------------------------Processing (given train)----------------------------------")
    _write_processed(given_df.loc[~is_valid_fold, 'sentence'].tolist(), 'processed_given_train.tsv')

    print("--------------------------------Processing (given valid)----------------------------------")
    _write_processed(given_df.loc[is_valid_fold, 'sentence'].tolist(), 'processed_given_valid.tsv')

    n_parts = 4
    lines_per_part = 10000000
    # Stream the corpus in a single pass: each islice call continues where the
    # previous one stopped, so only one part is held in memory at a time and
    # the file is not re-read in full for every part.
    with open("bn.txt", encoding='utf-8') as inpfile:
        for part in range(n_parts):
            print(f"--------------------------------Processing (IndicCorp v2) {part + 1}/{n_parts}----------------------------------")
            # The last part is unbounded and takes whatever lines remain.
            limit = lines_per_part if part < n_parts - 1 else None
            line_list = list(islice(inpfile, limit))
            print(len(line_list))
            _write_processed(line_list, f'processed_indiccorpv2_{part}.tsv')

In [None]:
# Run the whole pipeline; it writes processed_given_train.tsv, processed_given_valid.tsv
# and processed_indiccorpv2_0.tsv .. processed_indiccorpv2_3.tsv to the working directory.
get_clean_data()

--------------------------------Processing (IndicCorp v2) 4/4----------------------------------
11004792


2568264it [3:30:34, 170.93it/s]