In [118]:
import re
import pandas as pd

# Chunk CSVs are expected to be named '<number>-<base_file_name>.csv' inside `folder_path`.
base_file_name = 'cot-zsopt'  # Base file name to match
folder_path = 'outputs-cot-zsopt'  # Folder containing the CSV files


def collect_csvs(folder=None, base_name=None):
    """Return the chunk CSV file names found in a folder, ordered by numeric prefix.

    A file is selected when its name has the form ``<digits>-<base_name>.csv``.
    The result is sorted by the integer value of the prefix, so ``2-...``
    comes before ``10-...``.

    Args:
        folder: Directory to scan. Defaults to the module-level ``folder_path``.
        base_name: Base file name to match. Defaults to the module-level
            ``base_file_name``.

    Returns:
        List of matching file names (not full paths) in ascending prefix order.
    """
    # Imported locally: the notebook's `import os` lives in a later cell, so on a
    # fresh kernel this function would otherwise fail with NameError.
    import os

    if folder is None:
        folder = folder_path
    if base_name is None:
        base_name = base_file_name

    pattern = re.compile(rf"^(\d+)-{re.escape(base_name)}\.csv$")

    numbered = []
    for name in os.listdir(folder):
        match = pattern.match(name)
        if match:
            # Reuse the captured prefix instead of running a second regex.
            numbered.append((int(match.group(1)), name))

    # Sort on the prefix only so files with equal prefixes keep listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [name for _, name in numbered]


# Discover the chunk files once; combine_csv_files() reads this module-level list.
csv_files = collect_csvs()


def combine_csv_files(files=None, folder=None):
    """Read the chunk CSVs and concatenate them into one DataFrame.

    Args:
        files: File names to read, in the order they should be stacked.
            Defaults to the module-level ``csv_files``.
        folder: Directory containing the files. Defaults to the module-level
            ``folder_path``.

    Returns:
        A single DataFrame with the rows of every file, re-indexed from 0.

    Raises:
        ValueError: If there are no files to combine.
    """
    # Imported locally: the notebook's `import os` lives in a later cell, so on a
    # fresh kernel this function would otherwise fail with NameError.
    import os

    if files is None:
        files = csv_files
    if folder is None:
        folder = folder_path

    files = list(files)
    if not files:
        # pd.concat([]) also raises ValueError, but without saying where we looked.
        raise ValueError(f"No CSV files to combine in {folder!r}.")

    frames = [pd.read_csv(os.path.join(folder, name)) for name in files]
    return pd.concat(frames, ignore_index=True)

In [119]:
# Concatenate every chunk CSV into one DataFrame and display it.
merged_df = combine_csv_files()
merged_df

Unnamed: 0,Id,Original Input,Translated Input,Original Target,Translated Target
0,0,Give some stream of consciousness and then the...,"යම් ප්‍රවාහයක් ලබා දෙන්න, ඉන්පසු පිළිතුර දෙන්න...","To answer this question, we should know that: ...","මෙම ප්‍රශ්නයට පිළිතුරු සැපයීම සඳහා, අප දැනගත ය..."
1,1,The student realized that he left his writing ...,ඔහු තම ලේඛන උපකරණය ඔහුගේ අවසාන අධ්‍යයන ස්ථානයේ...,Classroom is the place of study for students. ...,පන්ති කාමරය යනු සිසුන් සඳහා අධ්‍යයන ස්ථානයයි. ...
2,2,Let's think step by step! How do fungi get nut...,පියවරෙන් පියවර සිතමු! දිලීර පෝෂණය ලබා ගන්නේ කෙ...,"Fungi lack chlorophyll, so they cannot make fo...",දිලීර වලට හරිතප්‍රද නොමැති බැවින් ශාක වලට හැකි...
3,3,Denny asked: Test for natural language inferen...,ඩෙනී ඇසුවා: ස්වභාවික භාෂා අනුමාන සඳහා පරීක්ෂණය...,A snowy mountain is outdoors so if a man is ma...,"හිම සහිත කන්දක් එළිමහනේ, ඒ නිසා මිනිසෙක් හිම ක..."
4,4,Please answer the following question by reason...,කරුණාකර පියවරෙන් පියවර තර්ක කිරීමෙන් පහත ප්‍රශ...,A group of men stand in no particular pattern ...,මිනිසුන් පිරිසක් කතා නොකර එකිනෙකාගෙන් ඈත් වී ව...
...,...,...,...,...,...
95565,95565,"Given the sentence ""A low wave is forming in t...","""අළු අහසකට එරෙහිව සාගරයේ පහත් තරංගයක් නිර්මාණය...",The ocean would not be located in the same pla...,පිරිමි ළමයෙකු යහනක වාඩි වී රූපවාහිනිය නරඹන ස්ථ...
95566,95566,Consider the question. How much 60% of 50 is g...,ප්රශ්නය සලකා බලන්න. 50 න් 60% 30 න් 50% ට වඩා ...,(60/100) * 50 – (50/100) * 30\n30 - 15 = 15,(60/100) * 50 - (50/100) * 30\n30 - 15 = 15
95567,95567,Use reasoning to lead to the answer of the fol...,පහත ප්‍රශ්නයට පිළිතුරු දීමට හේතු දැක්වීම භාවිත...,Car runs at speed that saves time of travellin...,මෝටර් රථය වේගයෙන් ධාවනය වන අතර එමඟින් ගමනේ කාල...
95568,95568,Which of these sentences doesn't make sense?\n...,මේ වාක්‍යවලින් තේරුමක් නැති වාක්‍ය මොනවාද?\nවි...,Rats have sharp teeth that can penetrate most ...,මීයන්ට බොහෝ ද්‍රව්‍ය විනිවිද යාමට හැකි තියුණු ...


In [120]:
# Load the original dataset so the merged output can be verified against it.
import utils

# Kept under its own name: `folder_path` is still read by collect_csvs() and
# combine_csv_files(), so rebinding it here would make a re-run of those cells
# look in the wrong folder.
original_dataset_path = 'cot-zsopt'  # Folder containing the original dataset
dataset = utils.load_dataset(original_dataset_path)

# Row counts of the merged output and the original dataset, shown side by side.
len(merged_df), len(dataset)

(95570, 95570)

In [121]:
def check_row_continuity(df: pd.DataFrame, id_column: str = 'Id') -> bool:
    """Check that `id_column` holds exactly 0..len(df)-1, in row order.

    Prints the IDs that are missing from the frame and the IDs that are present
    but fall outside the expected range, then reports whether the frame is
    complete and ordered.

    Args:
        df: Frame to validate.
        id_column: Name of the column holding the row IDs.

    Returns:
        True when every ID in ``range(len(df))`` appears exactly once and the
        rows are sorted by that ID; False otherwise.
    """
    # tolist() yields plain Python scalars, which keeps the printed lists readable.
    ids = df[id_column].tolist()
    expected_ids = set(range(len(df)))
    actual_ids = set(ids)

    # Expected but not found in the frame.
    missing_ids = expected_ids - actual_ids
    # Found in the frame but outside 0..len(df)-1.
    extra_ids = actual_ids - expected_ids

    if missing_ids:
        print(f"Missing IDs: {sorted(missing_ids)}")

    if extra_ids:
        print(f"Extra IDs: {sorted(extra_ids)}")

    if missing_ids or extra_ids:
        return False

    # The sets match and the lengths agree, so each ID occurs once; what is left
    # to verify is the order, which set comparison cannot see.
    if ids != list(range(len(df))):
        print("All IDs are present but rows are out of order.")
        return False

    print("All rows are present and in order.")
    return True


# Result of the ID check on the merged frame; the save and upload cells refuse to run when it is False.
db_continuity = check_row_continuity(merged_df)

All rows are present and in order.


In [123]:
import os

parquet_file_name = "cot-zsopt"  # Name of the parquet file to save

# Abort unless the ID check passed and the merged row count equals the original dataset's row count.
if not db_continuity or len(merged_df) != len(dataset):
    raise ValueError("Data continuity is not maintained.")

# Drop the 'Id' column (the one used by the continuity check) before saving and uploading.
# NOTE(review): the merged frame displayed earlier also has an 'Unnamed: 0' column; it is
# kept here and therefore ends up in the parquet file and the upload — confirm that is intended.
updated_df = merged_df.drop(columns='Id')


def save_as_parquet(df: pd.DataFrame, file_name: str, output_dir: str = "translated_datasets") -> str:
    """Write `df` to ``<output_dir>/<file_name>.parquet`` unless that file already exists.

    Args:
        df: Frame to persist; its index is not written.
        file_name: File name without the ``.parquet`` extension.
        output_dir: Directory to write into. Created if it does not exist.

    Returns:
        The parquet path, whether the file was written now or was already there.
    """
    parquet_path = f"{output_dir}/{file_name}.parquet"
    if os.path.exists(parquet_path):
        # Never clobber an earlier export; the caller still gets the path back.
        print(f"File {parquet_path} already exists. Not overwriting.")
        return parquet_path

    # to_parquet does not create missing directories, so make sure the target exists.
    parent_dir = os.path.dirname(parquet_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_parquet(parquet_path, index=False)
    print(f"Data saved as {parquet_path}")
    return parquet_path


# Persist the merged frame (without 'Id') locally; an existing file is left untouched.
parquet_path = save_as_parquet(updated_df, parquet_file_name)

Data saved as translated_datasets/cot-zsopt.parquet


In [124]:
from huggingface_hub import HfFolder
from pandas import DataFrame
from huggingface_hub.utils import RepositoryNotFoundError
import datasets as hf_datasets

# Refuse to upload unless the ID check passed and the merged row count equals the original dataset's.
if not db_continuity or len(merged_df) != len(dataset):
    raise ValueError("Data continuity is not maintained.")


def upload_subset_to_huggingface(df: DataFrame, repo_id: str, subset_name: str):
    """Publish `df` to the Hub repository `repo_id` as the split ``subset_<subset_name>``.

    The frame is converted to a Hugging Face dataset, wrapped in a one-entry
    DatasetDict and pushed as a public dataset using the token returned by
    ``HfFolder.get_token()``. Progress is printed at each step.

    Args:
        df: Data to upload.
        repo_id: Target repository identifier on the Hugging Face Hub.
        subset_name: Label appended to ``subset_`` to form the split name.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        print(f"Creating dataset from DataFrame with {len(df)} rows.")
        hf_split = hf_datasets.Dataset.from_pandas(df)

        print(f"Creating DatasetDict with subset '{subset_name}'")
        splits = {f"subset_{subset_name}": hf_split}
        bundle = hf_datasets.DatasetDict(splits)

        print(f"Pushing to Hugging Face Hub: {repo_id}")
        bundle.push_to_hub(
            repo_id,
            private=False,
            token=HfFolder.get_token(),
        )

        print(f"File uploaded successfully to {repo_id}")
    except Exception as err:
        # Surface the message in the notebook output, then let the error propagate.
        print(f"An error occurred: {str(err)}")
        raise


# Target Hub repository and subset label; the data is pushed as the split 'subset_<subset_name>'.
repo_id = "0xAIT/sinhala-flan"
subset_name = "cot_zsopt"
upload_subset_to_huggingface(updated_df, repo_id, subset_name)

Creating dataset from DataFrame with 95570 rows.
Creating DatasetDict with subset 'cot_zsopt'
Pushing to Hugging Face Hub: 0xAIT/sinhala-flan


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/96 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/462 [00:00<?, ?B/s]

File uploaded successfully to 0xAIT/sinhala-flan


In [None]:
# Verify that the target repository exists and is accessible with the token
from huggingface_hub import HfApi


def verify_repository(repo_id: str, token: str):
    """Check that the dataset repository `repo_id` can be looked up with `token`.

    Prints a confirmation when the lookup succeeds.

    Args:
        repo_id: Repository identifier on the Hugging Face Hub.
        token: Access token forwarded to the lookup.

    Raises:
        RepositoryNotFoundError: Re-raised after printing a hint when the
            repository cannot be found.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    api = HfApi()
    try:
        # Only success or failure of the lookup matters; the returned info is not needed.
        api.repo_info(repo_id, token=token, repo_type="dataset")
        print(f"Repository {repo_id} exists and is accessible.")
    except RepositoryNotFoundError:
        print(f"Repository {repo_id} does not exist. Please create it or check the repository ID.")
        raise
    except Exception as e:
        print(f"An error occurred while verifying the repository: {str(e)}")
        raise


# NOTE(review): this call relies on `repo_id`, `HfFolder` and `RepositoryNotFoundError` from the
# upload cell above, so it only works after that cell has run — consider verifying before uploading.
verify_repository(repo_id, HfFolder.get_token())