# Block 1 & 2 (Merge Output CSVs) 

In [1]:
import re
import pandas as pd
import os

# Dataset slug: names the output sub-folder and the "<index>-<slug>.csv" files matched by collect_csvs().
base_file_name = 'flan-zsopt'  # Base file name to match
folder_path = f'outputs/{base_file_name}'  # Folder containing the CSV files


def collect_csvs():
    """Return the output CSV file names found in ``folder_path``, ordered by numeric prefix.

    Only names of the exact form ``<digits>-<base_file_name>.csv`` are kept. Ordering uses
    the integer value of the leading digits, so ``10-...`` comes after ``9-...``.
    """
    shard_name = re.compile(rf"^(\d+)-{re.escape(base_file_name)}\.csv$")

    def shard_index(name):
        # Leading digits are guaranteed by the pattern above.
        return int(re.findall(r'^\d+', name)[0])

    matching = []
    for entry in os.listdir(folder_path):
        if shard_name.match(entry):
            matching.append(entry)
    return sorted(matching, key=shard_index)


# Resolved once here; combine_csv_files() reads this module-level list.
csv_files = collect_csvs()


def combine_csv_files():
    """Read every file listed in ``csv_files`` from ``folder_path`` and stack them into one frame.

    ``na_filter=False`` turns off pandas' missing-value detection while reading, so empty
    cells stay empty strings. The combined frame gets a fresh 0..n-1 index.
    """
    frames = []
    for name in csv_files:
        shard_path = os.path.join(folder_path, name)
        frames.append(pd.read_csv(shard_path, na_filter=False))
    return pd.concat(frames, ignore_index=True)

In [2]:
merged_df = combine_csv_files()
# The default sort algorithm is not stable, which would leave it undefined which of several
# rows sharing an Id survives drop_duplicates (keep='first'). A stable sort keeps equal Ids
# in concatenation order, so the row from the lowest-numbered CSV is retained every time.
merged_df = (
    merged_df
    .sort_values('Id', kind='mergesort')
    .drop_duplicates(subset=['Id'])
    .reset_index(drop=True)
)

In [3]:
# Preview of the merged frame (pandas abbreviates the display of large frames).
merged_df

Unnamed: 0,Id,Original Input,Translated Input,Original Target,Translated Target
0,0,Briefly summarize this sentence: montenegro 's...,මෙම වාක්‍යය කෙටියෙන් සාරාංශ කරන්න: සඳුදා ප්‍රක...,montenegro 's coalition wins election,මොන්ටිනිග්‍රෝ සන්ධානය මැතිවරණයෙන් ජය ගනී
1,1,Why is technology a good thing?\nIs technology...,තාක්ෂණය හොඳ දෙයක් වන්නේ ඇයි?\nතාක්ෂණය හොඳ දෙයක...,1).,1)
2,2,Had dinner here last night and overall it was ...,ඊයේ රාත්‍රියේ මෙහි රාත්‍රී ආහාරය ගත් අතර සමස්ත...,negative,සෘණ
3,3,What is the most logical completion of this ne...,මෙම පුවත් කතාවේ වඩාත්ම තාර්කික සම්පූර්ණ කිරීම ...,Apple remains the top consumer electronics bra...,පන්තිකාමරවල සහ ඉලෙක්ට්‍රොනිකව පවත්වන ලද යෞවනයන...
4,4,@zparminter I have three and only the last one...,@zparminter මට තුනක් ඇති අතර මෙතෙක් මා මත මිය ...,positive,ධනාත්මක
...,...,...,...,...,...
1153898,1153898,"Read this: Up until the mid-14th century, Euro...",මෙය කියවන්න: 14 වන සියවසේ මැද භාගය දක්වා යුරෝප...,unanswerable,පිළිතුරු දිය නොහැකි
1153899,1153899,See the multi-choice question below:\n\nSenten...,පහත බහුවරණ ප්‍රශ්නය බලන්න:\n\nවාක්‍ය 1: අද Ibi...,(II).,(II).
1153900,1153900,Multi-choice question: What is the sentiment o...,බහුවරණ ප්‍රශ්නය: පහත ට්වීට් එකෙහි හැගීම කුමක්ද...,positive,ධනාත්මක
1153901,1153901,Ahsan loses Finnish Open final KARACHI: Pakist...,අහ්සන්ට ෆින්ලන්ත විවෘත අවසන් තරගය අහිමි වෙයි ක...,No,නැත


# Block 3 (Compare Dataset Length with Original)

In [4]:
# Load Original Dataset to Verify
import utils
from termcolor import colored

original_dataset_name = 'flan-zsopt'
# Kept in its own variable: rebinding `folder_path` here would point collect_csvs() and
# combine_csv_files() at the dataset folder if the merge cells are re-run after this cell.
original_dataset_path = f'datasets/{original_dataset_name}'  # Folder containing the original dataset
dataset = utils.load_dataset(original_dataset_path)


def check_dataset_length(df: pd.DataFrame, dataset: pd.DataFrame) -> bool:
    """Report whether ``df`` and ``dataset`` contain the same number of rows.

    Prints a green confirmation when the lengths agree, otherwise a red message
    showing both row counts.

    Returns:
        True when the lengths are equal, False otherwise.
    """
    translated_rows = len(df)
    original_rows = len(dataset)
    if translated_rows != original_rows:
        print(colored(f"Length check failed. Translated: {translated_rows}, Original: {original_rows}", "red"))
        return False
    print(colored("Length check passed.", "green"))
    return True


# Show both row counts, then keep the pass/fail result of the length check.
print(len(merged_df), len(dataset))
length_check = check_dataset_length(merged_df, dataset)

1153903 1153903
[32mLength check passed.[0m


# Block 4 (Verify Row Continuity of the Translated Dataset)

In [5]:
# Results of the most recent check_row_continuity() call. They are module-level so that
# the re-translation cell can pick up the ids that still need translating.
missing_ids = []
extra_ids = []


def check_row_continuity(df: pd.DataFrame, id_column: str = 'Id') -> bool:
    """Verify that ``df[id_column]`` is exactly ``0 .. len(dataset) - 1``, unique and ascending.

    Side effects:
        Rebinds the module-level ``missing_ids`` (expected ids absent from ``df``) and
        ``extra_ids`` (ids in ``df`` outside the expected range), both sorted.
        Prints a coloured report of whatever was found.

    Args:
        df: Frame holding the translated rows.
        id_column: Name of the column holding the row ids.

    Returns:
        True only when no id is missing or extra, no id is repeated and the rows are
        sorted by id. Later cells pair rows with the original dataset by position, so
        order and uniqueness matter as much as presence.
    """
    global missing_ids, extra_ids

    ids = df[id_column]
    present_ids = set(ids)
    # Expected ids come from the length of the notebook-level original `dataset`.
    expected_ids = set(range(len(dataset)))

    extra_ids = sorted(present_ids - expected_ids)
    missing_ids = sorted(expected_ids - present_ids)

    if missing_ids:
        print(colored(f"Missing IDs: {missing_ids}", "red"))

    if extra_ids:
        print(colored(f"Extra IDs: {extra_ids}", "red"))

    if missing_ids or extra_ids:
        return False

    # The set comparison above cannot see repeated ids or a shuffled frame.
    if ids.duplicated().any():
        print(colored("Duplicate IDs found.", "red"))
        return False

    if not ids.is_monotonic_increasing:
        print(colored("Rows are not sorted by ID.", "red"))
        return False

    print(colored("All rows are present and in order.", "green"))
    return True


# Also refreshes the module-level missing_ids / extra_ids lists read by the re-translation cell.
db_continuity = check_row_continuity(merged_df)

[32mAll rows are present and in order.[0m


([], [])

In [53]:
# Translate any missing entries

from concurrent.futures import wait, FIRST_EXCEPTION
from termcolor import colored
from deep_translator.exceptions import RequestError
from errors import InvalidOutputError, MissingTranslationError, GeneralError, ReachedMaxRetriesError
import csv
import concurrent.futures
import utils
from multi_thread_handler import mth

dataset_name = 'flan-zsopt'
output_folder = f'outputs/{dataset_name}'
# Text file storing an integer; translate_specific_ids() writes next_file_index + 1 back to it.
next_file_index_file_path = f'{output_folder}/next-file-index.txt'

# Read once when this cell runs; the variable is not refreshed after the file is updated.
next_file_index = utils.read_integer_from_file(next_file_index_file_path)

# NOTE(review): translate_specific_ids() builds its own path from the same arguments, and
# this module-level value is not read by any other visible cell - confirm it is still needed.
file_name = utils.get_output_csv_path(output_folder, next_file_index, dataset_name, 'csv')


def process_row(args):
    """Translate the input and target text of a single dataset row.

    Args:
        args: ``(index, row)`` pair, where ``row`` supports ``row['inputs']`` and ``row['targets']``.

    Returns:
        ``(index, input_text, translated_input, target_text, translated_target)``.

    Raises:
        InvalidOutputError: If the translation call does not return exactly two items.
    """
    row_id, record = args
    source_input, source_target = record['inputs'], record['targets']

    mth.safe_print(f"Processing Row: {row_id}")
    translations = utils.choose_translation_method_and_translate(
        mth.rate_limited_translate,
        mth.sdk_translate,
        row_id,
        [source_input, source_target],
    )
    # One translation is expected per submitted text.
    if len(translations) != 2:
        raise InvalidOutputError

    translated_input, translated_target = translations[0], translations[1]

    mth.safe_print(f"Queued Translation: {row_id}")
    return row_id, source_input, translated_input, source_target, translated_target


def translate_specific_ids(ids):
    """Translate the dataset rows with the given ids and write them to a new output CSV.

    The CSV path comes from ``utils.get_output_csv_path`` using the module-level
    ``next_file_index``; the persisted index file is bumped to ``next_file_index + 1``
    as soon as the header row has been written. Rows are translated concurrently
    with ``process_row`` and written in the order of ``ids`` once all have finished.

    Args:
        ids: Iterable of integer positions into the notebook-level ``dataset`` frame.
            It is iterated twice, so pass a list rather than a one-shot iterator.

    Raises:
        ReachedMaxRetriesError: A worker failed with ``RequestError`` (chained as the cause).
        MissingTranslationError: An id has no result after all workers finished.
        Exception: Any other worker failure is re-raised unchanged.

    NOTE(review): the module-level ``next_file_index`` is not refreshed here, so a second
    call in the same kernel presumably resolves to the same path and overwrites it
    (mode 'w') - confirm against ``utils.get_output_csv_path``.
    """
    file_name = utils.get_output_csv_path(output_folder, next_file_index, dataset_name, 'csv')

    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Id', 'Original Input', 'Translated Input', 'Original Target', 'Translated Target'])

        utils.update_integer_in_file(next_file_index_file_path, next_file_index + 1)

        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = {executor.submit(process_row, (i, dataset.iloc[i])): i for i in ids}
            results = {}

            done, not_done = wait(futures.keys(), return_when=FIRST_EXCEPTION)

            # wait() only returns with unfinished futures when some task has raised, in
            # which case this call aborts below. Cancel what has not started yet so the
            # pool does not keep translating rows whose results would be thrown away.
            for pending in not_done:
                pending.cancel()

            for future in done:
                try:
                    i, input_text, input_result, target_text, target_result = future.result()
                    results[i] = (input_text, input_result, target_text, target_result)
                except RequestError as e:
                    mth.safe_print(colored(f"[Network Error - Automatic Retry]: {e}", 'red'))
                    # Chain the cause so the underlying request failure stays in the traceback.
                    raise ReachedMaxRetriesError from e
                except Exception as e:
                    mth.safe_print(colored(f"[Non-Network Error]: {e}", 'red'))
                    raise

            # Write the results
            for i in ids:
                if i in results:
                    writer.writerow([i] + list(results[i]))
                else:
                    raise MissingTranslationError(i)

# Re-translate only when the continuity check recorded gaps.
if not missing_ids:
    print(colored("No missing IDs to translate.", "green"))
else:
    translate_specific_ids(missing_ids)

[32mNo missing IDs to translate.[0m


# Block 5,6,7 (Column Data Comparison)

In [6]:
# The column comparison below needs both frames to have the same number of rows.
assert check_dataset_length(merged_df, dataset)


def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame, df1_col_name: str, df2_col_name: str):
    """Compare one column of ``df1`` with one column of ``df2`` element by element.

    Prints a green success line when every entry matches, otherwise a red warning;
    both are tagged with ``df2_col_name``.

    Returns:
        A frame with the two compared columns plus a boolean ``match`` column,
        restricted to the rows whose values differ (empty when everything matches).
    """
    left = df1[df1_col_name]
    right = df2[df2_col_name]

    comparison = pd.DataFrame({
        df1_col_name: left,
        df2_col_name: right,
        'match': left == right,
    })
    mismatches = comparison.loc[~comparison['match']]

    if mismatches.empty:
        print(colored(f"[{df2_col_name}] Data comparison Successful.", "green"))
    else:
        print(colored(f"[{df2_col_name}] Some entries didn't match with the originals", "red"))

    return mismatches


def validate_columns():
    """Check that the original text columns of ``merged_df`` still equal those of ``dataset``.

    Runs both comparisons ('Original Input' vs 'inputs', then 'Original Target' vs
    'targets') so each prints its own report, and returns True only when neither
    produced a mismatching row.
    """
    column_pairs = (('Original Input', 'inputs'), ('Original Target', 'targets'))
    # Built eagerly so the second comparison runs even if the first one found mismatches.
    mismatch_counts = [
        len(compare_columns(merged_df, dataset, translated_col, original_col))
        for translated_col, original_col in column_pairs
    ]
    return all(count == 0 for count in mismatch_counts)


# Run both column comparisons; the boolean result is displayed as the cell output.
validate_columns()

[32mLength check passed.[0m
[32m[inputs] Data comparison Successful.[0m
[32m[targets] Data comparison Successful.[0m


True

In [21]:
# Show the rows (if any) whose 'Original Input' differs from the dataset's 'inputs'.
compare_columns(merged_df, dataset, 'Original Input', 'inputs')

[32m[inputs] Data comparison Successful.[0m


Unnamed: 0,Original Input,inputs,match


In [7]:
# Show the rows (if any) whose 'Original Target' differs from the dataset's 'targets'.
compare_columns(merged_df, dataset, 'Original Target', 'targets')

[32m[targets] Data comparison Successful.[0m


Unnamed: 0,Original Target,targets,match


# Block 8 (Combine Metadata Records)

In [8]:
# Adding metadata to the merged dataset
# Metadata = every column of the original dataset except the two text columns.
dataset_metadata = dataset.loc[:, ~dataset.columns.isin(['inputs', 'targets'])]
print(f"Metadata length: {len(dataset_metadata)}")

# Guards for the column-wise concat below: every id is present, the original text
# columns match the dataset, and both frames have the same number of rows.
assert check_row_continuity(merged_df)
assert validate_columns()
assert len(dataset_metadata) == len(merged_df)

# axis=1 places the metadata columns next to the translated columns, pairing rows by index label.
merged_df_with_metadata = pd.concat([merged_df, dataset_metadata], axis=1)

merged_df_with_metadata

Metadata length: 1153903
[32mAll rows are present and in order.[0m
[32m[inputs] Data comparison Successful.[0m
[32m[targets] Data comparison Successful.[0m


Unnamed: 0,Id,Original Input,Translated Input,Original Target,Translated Target,_template_idx,_task_source,_task_name,_template_type
0,0,Briefly summarize this sentence: montenegro 's...,මෙම වාක්‍යය කෙටියෙන් සාරාංශ කරන්න: සඳුදා ප්‍රක...,montenegro 's coalition wins election,මොන්ටිනිග්‍රෝ සන්ධානය මැතිවරණයෙන් ජය ගනී,1,Flan2021,gigaword:1.2.0,zs_opt
1,1,Why is technology a good thing?\nIs technology...,තාක්ෂණය හොඳ දෙයක් වන්නේ ඇයි?\nතාක්ෂණය හොඳ දෙයක...,1).,1),0,Flan2021,glue/qqp:2.0.0,zs_opt
2,2,Had dinner here last night and overall it was ...,ඊයේ රාත්‍රියේ මෙහි රාත්‍රී ආහාරය ගත් අතර සමස්ත...,negative,සෘණ,0,Flan2021,yelp_polarity_reviews:0.2.0,zs_opt
3,3,What is the most logical completion of this ne...,මෙම පුවත් කතාවේ වඩාත්ම තාර්කික සම්පූර්ණ කිරීම ...,Apple remains the top consumer electronics bra...,පන්තිකාමරවල සහ ඉලෙක්ට්‍රොනිකව පවත්වන ලද යෞවනයන...,8,Flan2021,super_glue/record:1.0.2,zs_opt
4,4,@zparminter I have three and only the last one...,@zparminter මට තුනක් ඇති අතර මෙතෙක් මා මත මිය ...,positive,ධනාත්මක,1,Flan2021,sentiment140:1.0.0,zs_opt
...,...,...,...,...,...,...,...,...,...
1153898,1153898,"Read this: Up until the mid-14th century, Euro...",මෙය කියවන්න: 14 වන සියවසේ මැද භාගය දක්වා යුරෝප...,unanswerable,පිළිතුරු දිය නොහැකි,8,Flan2021,squad/v2.0:3.0.0,zs_opt
1153899,1153899,See the multi-choice question below:\n\nSenten...,පහත බහුවරණ ප්‍රශ්නය බලන්න:\n\nවාක්‍ය 1: අද Ibi...,(II).,(II).,4,Flan2021,glue/mnli:2.0.0,zs_opt
1153900,1153900,Multi-choice question: What is the sentiment o...,බහුවරණ ප්‍රශ්නය: පහත ට්වීට් එකෙහි හැගීම කුමක්ද...,positive,ධනාත්මක,4,Flan2021,sentiment140:1.0.0,zs_opt
1153901,1153901,Ahsan loses Finnish Open final KARACHI: Pakist...,අහ්සන්ට ෆින්ලන්ත විවෘත අවසන් තරගය අහිමි වෙයි ක...,No,නැත,0,Flan2021,anli/r3:0.1.0,zs_opt


# Block 9 (Save as Parquet)

In [9]:
import os

parquet_file_name = "flan-zsopt"  # Name of the parquet file to save

# Re-run the integrity checks immediately before exporting.
assert check_row_continuity(merged_df_with_metadata)
assert validate_columns()

# The 'Id' column is dropped from the exported frame.
updated_df = merged_df_with_metadata.drop(columns='Id')


def save_as_parquet(df: pd.DataFrame, file_name: str, overwrite: bool = False) -> str:
    """Write ``df`` to ``translated_datasets/<file_name>.parquet`` and return that path.

    Args:
        df: Frame to export; written without its index.
        file_name: File name without the ``.parquet`` extension.
        overwrite: When False (the default) an existing file is left untouched and only
            a notice is printed. Pass True to replace a stale export.

    Returns:
        The relative path of the parquet file, whether or not it was (re)written.
    """
    parquet_path = f"translated_datasets/{file_name}.parquet"
    if os.path.exists(parquet_path) and not overwrite:
        print(f"File {parquet_path} already exists. Not overwriting.")
        return parquet_path
    # Create the output folder on first use so the write does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(parquet_path), exist_ok=True)
    df.to_parquet(parquet_path, index=False)
    print(f"Data saved as {parquet_path}")
    return parquet_path


# The returned path is reused by the upload cells to reload the saved file.
parquet_path = save_as_parquet(updated_df, parquet_file_name)

[32mAll rows are present and in order.[0m
[32m[inputs] Data comparison Successful.[0m
[32m[targets] Data comparison Successful.[0m
Data saved as translated_datasets/flan-zsopt.parquet


# Block 10 (Upload to Hugging Face)
<em>(Make sure to log in with `huggingface-cli` before running this block)</em>

In [14]:
# Optional override: uncomment both lines below to load a different parquet file
# than the one produced by the save cell.
# parquet_file_name = "flan-zsopt" # Uncomment if you want to change the parquet file name
# parquet_path = f"translated_datasets/{parquet_file_name}.parquet"
print(f"Loading the saved parquet file: {parquet_path}")
# Reload from disk so the upload uses exactly what was written to the parquet file.
saved_df = pd.read_parquet(parquet_path)

Loading the saved parquet file: translated_datasets/flan-zsopt.parquet


In [15]:

from datasets import Dataset
import os

repo_id = "0xAIT/sinhala-flan"
subset_name = "flan_zsopt"  # Make sure to change this to the subset name (Use underscores instead of hyphens e.g. "flan_zsopt")

# NOTE(review): these checks run on the in-memory merged_df, while the upload below sends
# saved_df as read from the parquet file. save_as_parquet() keeps an existing file, so
# confirm the file on disk was produced from the current merged_df.
assert check_row_continuity(merged_df)
assert validate_columns()

# Size the upload chunks from the parquet file size on disk.
chunk_size_in_mb = 250 # MB
# Bytes -> MiB.
file_size = os.path.getsize(parquet_path) / (1024 * 1024)
# Number of chunks needed so that each is at most about chunk_size_in_mb of the file.
chunks_count = int(file_size / chunk_size_in_mb) + 1
# Rows are split evenly across the chunks; the +1 rounds up so no rows are left over.
rows_per_chunk = int(len(saved_df) / chunks_count) + 1
print(f"Parquet file size: {file_size:.2f} MB, Total chunks: {chunks_count}, Rows per chunk: {rows_per_chunk:}")


def upload_in_chunks(df, chunk_size, repo_id, dataset_name):
    """Push ``df`` to the Hugging Face Hub as consecutive row chunks, after confirmation.

    The user must type ``yes`` (case-insensitive, surrounding whitespace ignored) at
    the prompt; any other answer, or a cancelled prompt, returns without uploading.
    Each chunk of ``chunk_size`` rows is pushed as split ``<dataset_name>_chunk_<k>``
    of config ``dataset_name`` in ``repo_id``.

    Args:
        df: Frame to upload.
        chunk_size: Number of rows per uploaded split; must be positive.
        repo_id: Target Hub repository id.
        dataset_name: Config (subset) name, also used as the split-name prefix.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Validate up front: zero would crash range() after the prompt, and a negative
    # value would silently upload nothing.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    try:
        # Name the subset that is actually being uploaded (the function's own argument).
        user_input = input(f"Type yes to upload {dataset_name} subset: ")
    except EOFError:
        print("Input was canceled. Exiting without uploading.")
        return

    if user_input.strip().lower() != 'yes':
        print("Exiting without uploading.")
        return

    for chunk_index, start in enumerate(range(0, len(df), chunk_size)):
        chunk = df.iloc[start:start + chunk_size]
        # Named so it does not shadow the notebook-level `dataset` frame.
        chunk_dataset = Dataset.from_pandas(chunk)
        split_name = f"{dataset_name}_chunk_{chunk_index}"
        chunk_dataset.push_to_hub(repo_id, config_name=dataset_name, split=split_name)
        print(f"Uploaded chunk {chunk_index}")


# Uploads the frame reloaded from the parquet file, in chunks of rows_per_chunk rows.
upload_in_chunks(saved_df, rows_per_chunk, repo_id, subset_name)

Parquet file size: 1612.20 MB, Total chunks: 7, Rows per chunk: 164844


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Uploaded chunk 0


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Uploaded chunk 1


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Uploaded chunk 2


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

Uploaded chunk 3


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Uploaded chunk 4


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Uploaded chunk 5


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/83 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Uploaded chunk 6
