In [1]:
# Loading Dataset
import utils

# Relative path of the folder holding the dataset to translate.
folder_path = 'datasets/cot-fsopt'
# Load the dataset through the project helper. The preview rendered below shows
# a tabular object with (among others) the `inputs` and `targets` columns that
# the translation cell reads.
dataset = utils.load_dataset(folder_path)
# Bare last expression so the notebook renders a preview of the loaded data.
dataset

Unnamed: 0,inputs,targets,_template_idx,_task_source,_task_name,_template_type
0,The man be showing his toys to adults and not ...,"Premise: ""Woman skates in possession of puck.""...",1,CoT,cot_esnli_ii,fs_opt
1,"[QUESTION] If ""A brown dog is on the ground gr...",A man is break dancing and collecting a lot of...,0,CoT,cot_esnli,fs_opt
2,Jax: Which of the following sentences is nonse...,Chain of thought: A watch is not worn on one's...,1,CoT,cot_sensemaking,fs_opt
3,"Student asked: Given the sentence ""A boy weari...",Let's think. A group is on the river regardles...,7,CoT,cot_esnli,fs_opt
4,**Q**\nIs the following sentence factually cor...,no\nLaughter is not a response of fear but to ...,1,CoT,cot_creak,fs_opt
...,...,...,...,...,...,...
182899,"Q: Premise: ""A boy and girl are placing a red ...",The couple is enjoying the wedding but it is n...,2,CoT,cot_esnli,fs_opt
182900,"[Of the below sentences, which one does *not* ...",My step-by-step solution first: Rude people ar...,7,CoT,cot_sensemaking,fs_opt
182901,Just because the gymnast is flipping does not ...,"If ""A large boat drives through the harbor."" d...",2,CoT,cot_esnli_ii,fs_opt
182902,Test for natural language inference.\nPremise:...,A man walking in front of a Heal's store is no...,3,CoT,cot_esnli,fs_opt


In [2]:
# Translate using unofficial API
from concurrent.futures import wait, FIRST_EXCEPTION
from termcolor import colored
from deep_translator.exceptions import RequestError
from errors import InvalidOutputError, MissingTranslationError, GeneralError, ReachedMaxRetriesError
import csv
import concurrent.futures
import utils
from multi_thread_handler import mth

# --- Run configuration -------------------------------------------------------
# Everything below is read as a module-level global by `process_row` /
# `translate_dataset`.
dataset_name = 'cot-fsopt'
output_folder = f'outputs/{dataset_name}'
# Small text files that persist progress between runs: the index of the next
# row to translate and the index of the next output CSV to create.
start_pointer_file_path = f'{output_folder}/start-pointer.txt'
next_file_index_file_path = f'{output_folder}/next-file-index.txt'
# Number of rows submitted to the thread pool per batch.
batch_size = 20

# Resume from the progress recorded by a previous run.
start_pointer = utils.read_integer_from_file(start_pointer_file_path)
next_file_index = utils.read_integer_from_file(next_file_index_file_path)
# Manual override: uncomment (and adjust) the two lines below to resume from a
# specific row / output file instead of the values stored on disk.
# start_pointer = 9461
# next_file_index = 9

# Output CSV for this run; its path is derived from the current file index.
file_name = utils.get_output_csv_path(output_folder, next_file_index, dataset_name, 'csv')
# Total number of rows in the dataset loaded by the previous cell.
content_len = len(dataset)

# Module-level flag that `translate_dataset` resets at the start of every batch
# and sets to True when a batch hits an error.
error_occurred = False


def process_row(args):
    """Translate the ``inputs`` and ``targets`` texts of a single dataset row.

    Args:
        args: A ``(row_id, row)`` pair; ``row`` must support lookup by the
            keys ``'inputs'`` and ``'targets'``.

    Returns:
        A 5-tuple ``(row_id, original_input, translated_input,
        original_target, translated_target)``.

    Raises:
        InvalidOutputError: If the translation helper does not return exactly
            two items (one per submitted text).
    """
    row_id, record = args
    source_input, source_target = record['inputs'], record['targets']

    mth.safe_print(f"Processing Row: {row_id}")

    # Both texts go out in a single call so they come back as one pair.
    translated = utils.choose_translation_method_and_translate(
        mth.rate_limited_translate,
        row_id,
        [source_input, source_target],
    )
    if len(translated) != 2:
        raise InvalidOutputError

    mth.safe_print(f"Queued Translation: {row_id}")
    return row_id, source_input, translated[0], source_target, translated[1]


def _cancel_pending(pending):
    """Request cancellation of every future in ``pending``, logging each one."""
    for future in pending:
        mth.safe_print(colored(f"Cancelling unsubmitted futures: {future}", 'yellow'))
        future.cancel()


def translate_dataset(block_after: int = None):
    """Translate dataset rows batch by batch and write them to the output CSV.

    Starting at the module-level ``start_pointer``, rows are taken in chunks of
    ``batch_size``, translated concurrently through ``process_row`` and written
    to ``file_name`` in row order. After each successful batch the file is
    flushed and the new start pointer is persisted, so a later run can resume.

    Args:
        block_after: Maximum number of rows to translate in this call. ``None``
            (default) means "up to the end of the dataset". Values that would
            run past the dataset are clamped to its length.

    Raises:
        ReachedMaxRetriesError: After the third network (``RequestError``)
            failure during this call.
        MissingTranslationError: If a batch finished without producing a result
            for one of its rows.
        Exception: Any non-network error raised by a worker is logged and
            re-raised unchanged.
    """
    global error_occurred

    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Id', 'Original Input', 'Translated Input', 'Original Target', 'Translated Target'])

        utils.update_integer_in_file(next_file_index_file_path, next_file_index + 1)

        # Clamp to the dataset length: an overshooting `block_after` would
        # otherwise make the write loop look for rows that do not exist.
        if block_after is None:
            end_pointer = content_len
        else:
            end_pointer = min(start_pointer + block_after, content_len)
        current_batch_start = start_pointer
        # NOTE(review): this counter is never reset after a successful batch,
        # so three network errors anywhere in the run abort it - confirm that
        # a per-run (rather than per-batch) budget is intended.
        connection_retries = 0
        start_time = utils.get_current_time()

        while current_batch_start < end_pointer and connection_retries < 3:
            current_batch_end = min(current_batch_start + batch_size, end_pointer)

            error_occurred = False

            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                batch = dataset.iloc[current_batch_start:current_batch_end]
                # Key every row by its position in the dataset (not by its
                # index label) so the ids match the positional write loop below
                # and the persisted start pointer.
                futures = {
                    executor.submit(process_row, (position, row)): position
                    for position, (_, row) in enumerate(batch.iterrows(), start=current_batch_start)
                }
                results = {}

                # Returns as soon as any worker raises, or when all are done.
                done, not_done = wait(futures.keys(), return_when=FIRST_EXCEPTION)

                for future in done:
                    try:
                        i, input_text, input_result, target_text, target_result = future.result()
                        results[i] = (input_text, input_result, target_text, target_result)
                    except RequestError as e:
                        mth.safe_print(colored(f"[Network Error - Automatic Retry]: {e}", 'red'))
                        connection_retries += 1
                        _cancel_pending(not_done)
                        if connection_retries >= 3:
                            raise ReachedMaxRetriesError
                        error_occurred = True
                        break
                    except Exception as e:
                        mth.safe_print(colored(f"[Non-Network Error]: {e}", 'red'))
                        error_occurred = True
                        _cancel_pending(not_done)
                        raise

                if error_occurred:
                    # Network error below the retry limit: redo the same batch.
                    continue

                # Validate the whole batch before writing anything, so a
                # missing row never leaves a partially written batch behind.
                batch_positions = range(current_batch_start, current_batch_end)
                for i in batch_positions:
                    if i not in results:
                        raise MissingTranslationError(i)

                # Write the results of this batch
                for i in batch_positions:
                    print(f"Writing row {i}")
                    writer.writerow([i] + list(results[i]))

                # Flush first, then persist the pointer: the recorded progress
                # must never be ahead of what the flush has handed to the OS.
                file.flush()
                utils.update_integer_in_file(start_pointer_file_path, current_batch_end)

                current_batch_start = current_batch_end
                current_time = utils.get_current_time()
                # Rows completed so far in this call; used for both metrics.
                processed = current_batch_end - start_pointer
                speed = utils.get_speed(processed, start_time, current_time)
                estimated_time = utils.get_estimated_time(content_len - start_pointer,
                                                          processed, start_time,
                                                          current_time)

                mth.safe_print(
                    colored(
                        f"Moving to next batch. Translated {current_batch_end} of {content_len}, Elapsed (Secs): {current_time - start_time}, Estimated (Hrs): {estimated_time}, Speed: {speed}",
                        "green"))


# Run the translation with no `block_after` limit, i.e. from the stored start
# pointer through the end of the dataset.
translate_dataset()

Processing Row: 0
Processing Row: 1
Processing Row: 2
Processing Row: 3
Sent a request on 1722694261.371
Sent a request on 1722694261.372
Processing Row: 4
Sent a request on 1722694261.374
Processing Row: 5
Processing Row: 6
Sent a request on 1722694261.376
Processing Row: 7
Processing Row: 8
Processing Row: 9
Processing Row: 10
Processing Row: 11
Sent a request on 1722694261.381
Processing Row: 12
Processing Row: 13
Processing Row: 14
Sent a request on 1722694261.394
Processing Row: 15
Sent a request on 1722694261.397
Processing Row: 16
Processing Row: 17
Processing Row: 18
Processing Row: 19
Sent a request on 1722694261.404
Sent a request on 1722694261.405
Sent a request on 1722694261.407
Sent a request on 1722694261.407
Sent a request on 1722694261.408
Sleeping for 1 seconds/s
Translated by blob for index 3, Time: 1722694261
Queued Translation: 3
Translated by blob for index 7, Time: 1722694261
Queued Translation: 7
Translated by blob for index 9, Time: 1722694261
Queued Translation

KeyboardInterrupt: 