# Step 2. Splitting Dataset
This notebook converts the preprocessed Bengali sentences into word-level punctuation labels and writes them out as training and validation CSV files.

In [None]:
import csv
import os

import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
def fix_for_first_ch_punc(line):
    """Remove a punctuation mark sitting at the very first character of ``line``.

    Only the first character is inspected. When it is one of ``,``, ``।``,
    ``?`` or ``-`` it is dropped and the remaining text is re-joined with
    single spaces. Any other input (including an empty or falsy value) is
    returned untouched, whitespace and all.

    Args:
        line: The sentence text to clean.

    Returns:
        The cleaned sentence, or ``line`` itself when nothing had to change.
    """
    # Guard clause: nothing to do for empty input or a non-punctuation start.
    if not line or line[0] not in (',', '।', '?', '-'):
        return line

    remainder = line[1:]
    # split()/join collapses runs of whitespace and trims both ends.
    return ' '.join(remainder.split())

def split_sen_with_label(line):
    """Turn a whitespace-tokenised sentence into words and per-word labels.

    The sentence is first passed through ``fix_for_first_ch_punc`` and then
    split on whitespace. Every token that is not a standalone punctuation
    mark becomes a word labelled ``'blank'``. A standalone ``,``, ``।``,
    ``?`` or ``-`` is not kept as a word; instead it overwrites the label of
    the word before it with ``'comma'``, ``'end'``, ``'qm'`` or ``'hyp'``.
    When several marks follow the same word, the last one wins.

    A punctuation token that has no preceding word (for example when the
    sentence starts with whitespace followed by a mark, or with two marks in
    a row) is dropped rather than raising ``IndexError``.

    Args:
        line: The sentence text, with punctuation separated by spaces.

    Yields:
        Exactly one ``(words, labels)`` pair of equally long lists. The
        function is a generator, so callers fetch the pair with ``next()``.
    """
    punct_to_label = {',': 'comma', '।': 'end', '?': 'qm', '-': 'hyp'}

    line = fix_for_first_ch_punc(line)
    words, labels = [], []
    for token in line.split():
        label = punct_to_label.get(token)
        if label is None:
            # Ordinary word: starts out with no punctuation after it.
            words.append(token)
            labels.append('blank')
        elif labels:
            # Attach the mark to the word that precedes it.
            labels[-1] = label
        # else: a mark with nothing before it carries no label; skip it.

    yield words, labels

_BANNER = "--------------------------------Processing {}----------------------------------"


def _label_sentence(ix, line):
    """Build one output row ``[1-based index, words, labels]`` for a sentence."""
    words, labels = next(split_sen_with_label(line))
    if len(words) == len(labels):
        return [ix + 1, " ".join(words), " ".join(labels)]
    # Mismatched lengths cannot be aligned word-for-label; signal "skip".
    return None


def _load_unique_sentences(tsv_path):
    """Read a tab-separated file and drop duplicate rows, logging both sizes."""
    df = pd.read_csv(tsv_path, sep='\t')
    print("Before CSV length:", len(df))
    df = df.drop_duplicates().reset_index(drop=True)
    print("After CSV length:", len(df))
    return df


def _write_labelled_csv(line_list, out_path):
    """Label every sentence in parallel and write the rows to ``out_path``."""
    rows = Parallel(n_jobs=-1)(
        delayed(_label_sentence)(ix, line)
        for ix, line in tqdm(enumerate(line_list), total=len(line_list))
    )

    # The file is opened only after labelling succeeded, and ``with`` closes
    # it even if writing fails. UTF-8 is forced because the text is Bengali
    # and the platform default encoding may not be able to represent it.
    with open(out_path, 'w', encoding='utf-8', newline='') as outfile:
        # csv.writer quotes any sentence that contains ',' or '"', which the
        # previous print(..., sep=',') did not, so columns stay aligned.
        writer = csv.writer(outfile, lineterminator='\n')
        writer.writerow(['sentence_index', 'sentence', 'label'])
        for row in tqdm(rows):
            if row is not None:
                writer.writerow(row)


def transform_data():
    """Convert every processed TSV file into labelled CSV files.

    Inputs are read from, and outputs written to, bare file names:

    * ``processed_given_train.tsv`` -> ``given_train.csv``
    * ``processed_given_valid.tsv`` -> ``given_valid.csv``
    * ``processed_indiccorpv2_{0..3}.tsv`` -> ``indiccorpv2_{n}_train.csv``
      and ``indiccorpv2_{n}_valid.csv``; each shard is split with
      ``train_test_split(test_size=0.01, random_state=42)``.

    Each input is de-duplicated first and its ``sentence`` column is
    labelled with ``split_sen_with_label``. Every output has the header
    ``sentence_index,sentence,label``; ``sentence_index`` is the 1-based
    position of the sentence within the list written to that file.

    Returns:
        None. The function only prints progress and writes files.
    """
    for split in ('train', 'valid'):
        print(_BANNER.format('(given {})'.format(split)))
        df = _load_unique_sentences('processed_given_{}.tsv'.format(split))
        _write_labelled_csv(df['sentence'].tolist(), 'given_{}.csv'.format(split))

    shard_count = 4
    for shard in range(shard_count):
        print(_BANNER.format('(IndicCorp v2) {}/{}'.format(shard + 1, shard_count)))
        df = _load_unique_sentences('processed_indiccorpv2_{}.tsv'.format(shard))
        df_train, df_valid = train_test_split(df, test_size=0.01, random_state=42)

        for part, suffix in ((df_train, 'train'), (df_valid, 'valid')):
            _write_labelled_csv(
                part['sentence'].tolist(),
                'indiccorpv2_{}_{}.csv'.format(shard, suffix),
            )

In [None]:
# Run the whole conversion. The input and output paths used inside
# transform_data() are bare file names, so they resolve against the current
# working directory.
transform_data()