## Importing Useful Packages and Mounting Google Drive

In [1]:
import pandas as pd
import string
from google.colab import data_table

In [2]:
# mount Google Drive at /content/drive; every path used below is built
# underneath '/content/drive/My Drive'
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [3]:
# language pair handled by this notebook:
#   *_name -> language name as it appears in the evaluation-set file names
#   *_code -> short code used in the project folder name and output file extensions
source_name, target_name = 'English', 'IsiNdebele'
source_code, target_code = 'eng', 'nde'

In [4]:
# loading data
def read_segments(path):
  """Return the lines of the UTF-8 text file at `path` as a list of strings.

  Line terminators are stripped, so each list item is one segment.
  """
  with open(path, 'r', encoding='utf-8') as f:
    return f.read().splitlines()


main_path = f'/content/drive/My Drive/Research/{source_code}-to-{target_code}'
eval_dir = f'{main_path}/Autshumato-Evaluation-Set'

# the source file name is derived from source_name, mirroring how the
# reference file names are derived from target_name
gtest_path = f'{eval_dir}/Autshumato.EvaluationSet.{source_name}.Translator1.txt'
ref1_path = f'{eval_dir}/Autshumato.EvaluationSet.{target_name}.Translator1.txt'
ref2_path = f'{eval_dir}/Autshumato.EvaluationSet.{target_name}.Translator2.txt'
ref3_path = f'{eval_dir}/Autshumato.EvaluationSet.{target_name}.Translator3.txt'
ref4_path = f'{eval_dir}/Autshumato.EvaluationSet.{target_name}.Translator4.txt'

# source segments followed by the four reference translations
gtest = read_segments(gtest_path)
ref1 = read_segments(ref1_path)
ref2 = read_segments(ref2_path)
ref3 = read_segments(ref3_path)
ref4 = read_segments(ref4_path)

In [5]:
# wrap the source segments in a single-column pandas.DataFrame whose column is
# named after the source language (the reference translations stay as plain lists)
df_gtest = pd.DataFrame({source_name:gtest})

In [6]:
# visualise dataframe through google.colab's DataTable, with the index column
# included and 5 rows per page
data_table.DataTable(df_gtest, include_index=True, num_rows_per_page=5)

Unnamed: 0,English
0,<Doc01>
1,South African Social Security Agency
2,Customer Care Charter
3,Welcome to our Client Charter
4,"We want you, our client, to judge us according..."
...,...
509,It could take up to 6 months.
510,The service is free.
511,Forms to be filled in.
512,Application forms for provider accreditation a...


## Cleaning Test Set

This involves removing the rows that only indicate which document the sentences were obtained from (i.e., `<Doc..>` marker rows that have nothing to do with the translation task).

In [7]:
# collect the positions of the document-marker rows in the global test set,
# i.e. the segments whose text begins with '<Doc'
remove_rows = [
  position
  for position, segment in enumerate(df_gtest.iloc[:, 0])
  if segment.startswith('<Doc')
]

# preview the marker rows that will be removed (not translated) from gtest
df_gtest.iloc[remove_rows, :]

Unnamed: 0,English
0,<Doc01>
82,<Doc02>
116,<Doc03>
153,<Doc04>
205,<Doc05>
234,<Doc06>
246,<Doc07>
270,<Doc08>
291,<Doc09>
345,<Doc10>


In [8]:
# dropping the document-marker rows in gtest
# The rows are selected by content ('<Doc' prefix, the same rule used to build
# remove_rows) instead of by position, and the result is re-bound rather than
# mutated in place. A positional in-place drop is only correct the first time:
# once the index has been reset, running it again would delete real segments
# that now sit at those positions. This version is a no-op when re-run.
is_doc_marker = df_gtest.iloc[:, 0].str.startswith('<Doc')
df_gtest = df_gtest.loc[~is_doc_marker].reset_index(drop=True)

In [9]:
# dropping rows in ref1, ..., ref4
def drop_rows(segments, rows):
  """Return a new list holding `segments` without the items at the positions in `rows`."""
  skipped = set(rows)  # constant-time membership test
  return [segment for position, segment in enumerate(segments) if position not in skipped]


refs_clean = [drop_rows(ref, remove_rows) for ref in (ref1, ref2, ref3, ref4)]

# Each cleaned reference must line up one-to-one with the cleaned source
# segments in df_gtest. The check runs before ref1..ref4 are re-bound, so a
# misaligned input file, or a second run of this cell on already-cleaned
# lists, raises instead of silently discarding real segments.
for ref_number, ref_clean in enumerate(refs_clean, start=1):
  if len(ref_clean) != len(df_gtest):
    raise ValueError(
        f'ref{ref_number} has {len(ref_clean)} segments after cleaning but the '
        f'source test set has {len(df_gtest)}; the files are misaligned or the '
        f'rows were already dropped'
    )

ref1, ref2, ref3, ref4 = refs_clean

## Write Cleaned Datasets to Text Files

In [10]:
def write_segments(path, segments):
  """Write `segments` to the text file at `path`, one segment per line.

  The file is written as UTF-8, the same encoding the evaluation-set files
  are read with, so the output does not depend on the platform's default
  encoding.
  """
  with open(path, 'w', encoding='utf-8') as f:
    f.writelines(segment + '\n' for segment in segments)


cleaned_dir = f'{main_path}/cleaned-data'

# cleaned source segments
write_segments(f'{cleaned_dir}/clean_gtest.{source_code}', df_gtest[source_name])

# cleaned reference translations: ref1.<target_code> ... ref4.<target_code>
for ref_number, ref_segments in enumerate((ref1, ref2, ref3, ref4), start=1):
  write_segments(f'{cleaned_dir}/ref{ref_number}.{target_code}', ref_segments)