# Block 1 & 2 (Merge Output CSVs) 

In [None]:
import re
import pandas as pd
import os

# Notebook-wide settings read by collect_csvs() and combine_csv_files():
# chunk files are expected to be named "<number>-<base_file_name>.csv" inside folder_path.
base_file_name = 'mmlu-dev'  # Base file name to match
folder_path = f'outputs/{base_file_name}'  # Folder containing the CSV files


def collect_csvs():
    """Return the chunk CSV file names found in ``folder_path``, ordered by numeric prefix.

    Only entries named ``<digits>-<base_file_name>.csv`` are kept. Both
    ``folder_path`` and ``base_file_name`` are read from the notebook globals.
    Names are sorted by the integer value of their leading digits, so
    ``10-...`` comes after ``9-...``.
    """
    chunk_name = re.compile(rf"^(\d+)-{re.escape(base_file_name)}\.csv$")

    def leading_number(name):
        # Every kept name matched ``chunk_name``, so it starts with digits.
        return int(re.findall(r'^\d+', name)[0])

    matching_names = []
    for entry in os.listdir(folder_path):
        if chunk_name.match(entry):
            matching_names.append(entry)

    return sorted(matching_names, key=leading_number)


# Ordered list of chunk file names; combine_csv_files() reads this global.
csv_files = collect_csvs()


def combine_csv_files():
    """Read every file listed in ``csv_files`` and stack them into one DataFrame.

    Files are read from ``folder_path`` in the order given by the global
    ``csv_files`` list. ``na_filter=False`` turns off pandas' missing-value
    detection while parsing. The result gets a fresh 0..n-1 index.
    """
    chunk_frames = []
    for file_name in csv_files:
        chunk_path = os.path.join(folder_path, file_name)
        chunk_frames.append(pd.read_csv(chunk_path, na_filter=False))
    return pd.concat(chunk_frames, ignore_index=True)

In [None]:
# Stack all chunk files, then keep exactly one row per 'Id' and renumber the index.
# The sort is explicitly stable: with the default (non-stable) algorithm the relative
# order of rows sharing an 'Id' is not guaranteed, so which duplicate survives
# drop_duplicates (it keeps the first) could vary. With a stable sort the row from the
# earliest file in csv_files always wins.
merged_df = combine_csv_files()
merged_df = merged_df.sort_values('Id', kind='stable').drop_duplicates(subset=['Id']).reset_index(drop=True)

In [None]:
# Display the merged, de-duplicated frame for visual inspection.
merged_df

In [None]:
import numpy as np


# Collapse the four per-choice columns of each language into a single column that
# holds one numpy array per row, then drop the per-choice columns.
original_choice_cols = ['Original Choice 1', 'Original Choice 2', 'Original Choice 3', 'Original Choice 4']
translated_choice_cols = ['Translated Choice 1', 'Translated Choice 2', 'Translated Choice 3', 'Translated Choice 4']

for combined_col, part_cols in (
    ('Original Choices', original_choice_cols),
    ('Translated Choices', translated_choice_cols),
):
    # One Python list per row first, then convert each list to an ndarray.
    merged_df[combined_col] = merged_df[part_cols].values.tolist()
    merged_df[combined_col] = merged_df[combined_col].apply(np.array)

merged_df = merged_df.drop(columns=original_choice_cols + translated_choice_cols)

In [None]:
# Display the frame after the choice columns were combined.
merged_df

# Block 3 (Compare Dataset Length with Original)

In [None]:
# Load Original Dataset to Verify
import utils
from datasets import load_dataset
from termcolor import colored

# Split of the source dataset that the translated rows are verified against.
original_subset_name = 'dev'

# Load the "all" configuration of cais/mmlu and convert the chosen split to pandas.
# `dataset` is a notebook global: check_row_continuity() and validate_columns() read it.
full_dataset = load_dataset("cais/mmlu", "all")
dataset = full_dataset[original_subset_name].to_pandas()


def check_dataset_length(df: pd.DataFrame, dataset: pd.DataFrame) -> bool:
    """Report whether the translated frame has as many rows as the original.

    Args:
        df: The translated (merged) frame.
        dataset: The original frame it is compared with.

    Returns:
        True when both frames have the same number of rows, otherwise False.
        A coloured pass/fail message is printed either way.
    """
    translated_rows, original_rows = len(df), len(dataset)
    if translated_rows != original_rows:
        print(colored(f"Length check failed. Translated: {translated_rows}, Original: {original_rows}", "red"))
        return False
    print(colored("Length check passed.", "green"))
    return True


# Print the raw row counts, then run the length check and keep its result.
print(len(merged_df), len(dataset))
length_check = check_dataset_length(merged_df, dataset)

# Block 4 (Verify Row Continuity of the Translated Dataset)

In [None]:
# Filled in by check_row_continuity() so the offending IDs can be inspected afterwards.
missing_ids = []
extra_ids = []

def check_row_continuity(df: pd.DataFrame, id_column: str = 'Id') -> bool:
    """Check that ``df`` holds exactly the IDs 0..len(dataset)-1, once each, in ascending order.

    The expected ID range comes from the notebook global ``dataset``. The
    globals ``missing_ids`` and ``extra_ids`` are overwritten with the sorted
    IDs that are absent from ``df`` or outside the expected range.

    Other cells pair rows of ``df`` with rows of ``dataset`` by index, so
    duplicated or unsorted IDs are reported as failures as well; comparing
    the ID sets alone would let them through.

    Args:
        df: Frame whose ID column is checked.
        id_column: Name of the ID column.

    Returns:
        True when every expected ID appears exactly once and the column is
        ascending, otherwise False. Each problem found is printed in red.
    """
    global missing_ids, extra_ids

    id_values = df[id_column]
    present_ids = set(id_values)
    expected_ids = set(range(len(dataset)))

    extra_ids = sorted(present_ids - expected_ids)
    missing_ids = sorted(expected_ids - present_ids)

    if missing_ids:
        print(colored(f"Missing IDs: {missing_ids}", "red"))

    if extra_ids:
        print(colored(f"Extra IDs: {extra_ids}", "red"))

    if missing_ids or extra_ids:
        return False

    # The set comparison cannot see repeated IDs, so check for them explicitly.
    duplicate_ids = sorted(set(id_values[id_values.duplicated()]))
    if duplicate_ids:
        print(colored(f"Duplicate IDs: {duplicate_ids}", "red"))
        return False

    # With no missing, extra or duplicate IDs, ascending order means the
    # column is exactly 0..n-1 in row order.
    if not id_values.is_monotonic_increasing:
        print(colored("Rows are not sorted by ID.", "red"))
        return False

    print(colored("All rows are present and in order.", "green"))
    return True


# Run the ID check on the merged frame; this also refreshes missing_ids / extra_ids.
db_continuity = check_row_continuity(merged_df)

# Block 5,6,7 (Column Data Comparison)

In [None]:
# Stop here if the row counts differ: compare_columns() looks up rows of the
# original frame by the translated frame's index labels.
assert check_dataset_length(merged_df, dataset)


def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame, df1_col_name: str, df2_col_name: str):
    """Compare a column of ``df1`` with a column of ``df2`` row by row.

    For every index label of ``df1`` the value in ``df1_col_name`` is compared
    with the value stored under the same label in ``df2[df2_col_name]`` using
    ``np.array_equal``. A coloured summary line is printed.

    Args:
        df1: Frame that drives the comparison (its index labels are used).
        df2: Frame looked up by the labels of ``df1``.
        df1_col_name: Column of ``df1`` to compare.
        df2_col_name: Column of ``df2`` to compare.

    Returns:
        A frame with both columns and a boolean ``match`` column, restricted
        to the rows that did not match (empty when everything matched).
    """
    # Iterate over the single column instead of DataFrame.apply(axis=1): this
    # avoids building a Series per row and, unlike apply, still yields a
    # boolean Series when df1 has no rows.
    match_flags = pd.Series(
        [
            np.array_equal(value, df2.at[label, df2_col_name])
            for label, value in df1[df1_col_name].items()
        ],
        index=df1.index,
        dtype=bool,
    )
    comparison_df = pd.DataFrame({
        df1_col_name: df1[df1_col_name],
        df2_col_name: df2[df2_col_name],
        'match': match_flags,
    })
    mismatched_df = comparison_df[~comparison_df['match']]
    if mismatched_df.empty:
        print(colored(f"[{df2_col_name}] Data comparison Successful.", "green"))
    else:
        print(colored(f"[{df2_col_name}] Some entries didn't match with the originals", "red"))

    return mismatched_df


def validate_columns():
    """Check that the original-language columns of ``merged_df`` equal the source data.

    Runs compare_columns() for the question pair and for the choices pair
    (each call prints its own summary) against the global ``dataset``.

    Returns:
        True when neither comparison produced a mismatching row, else False.
    """
    column_pairs = (
        ('Original Question', 'question'),
        ('Original Choices', 'choices'),
    )
    # Build the full list first so both comparisons always run and print.
    mismatch_frames = [
        compare_columns(merged_df, dataset, translated_col, source_col)
        for translated_col, source_col in column_pairs
    ]
    return all(frame.empty for frame in mismatch_frames)


# Run both column comparisons; the cell output shows the resulting boolean.
validate_columns()

In [None]:
# Show the rows whose 'Original Question' differs from the source 'question' column.
# This is the same column pair validate_columns() checks; the previous
# 'Original Input' / 'inputs' names belong to a different dataset layout.
compare_columns(merged_df, dataset, 'Original Question', 'question')

In [None]:
# Show the rows whose 'Original Choices' differ from the source 'choices' column.
# This is the same column pair validate_columns() checks; the previous
# 'Original Target' / 'targets' names belong to a different dataset layout.
compare_columns(merged_df, dataset, 'Original Choices', 'choices')

In [None]:
# Flag rows where the number of translated choices differs from the number of
# original choices. Lengths are computed once per column with Series.map, which
# (unlike a row-wise DataFrame.apply) also produces a valid boolean mask when
# merged_df has no rows.
original_lengths = merged_df['Original Choices'].map(len)
translated_lengths = merged_df['Translated Choices'].map(len)
length_mismatch = original_lengths != translated_lengths
mismatched_length_df = merged_df[length_mismatch]

if not mismatched_length_df.empty:
    print(colored("Some entries have mismatched lengths between Original Choices and Translated Choices", "red"))
    for index, original_len, translated_len in zip(
        mismatched_length_df.index,
        original_lengths[length_mismatch],
        translated_lengths[length_mismatch],
    ):
        print(f"Row {index}: Original Choices Length = {original_len}, Translated Choices Length = {translated_len}")
else:
    print(colored("Length comparison between Original Choices and Translated Choices is successful.", "green"))


# Block 8 (Combine Metadata Records)

In [None]:
# Adding metadata to the merged dataset
# Attach every source column except the translated ones (question / choices)
# to the merged frame as metadata.
translated_source_cols = ['question', 'choices']
is_metadata_col = ~dataset.columns.isin(translated_source_cols)
dataset_metadata = dataset.loc[:, is_metadata_col]
print(f"Metadata length: {len(dataset_metadata)}")

# The column-wise concat below aligns rows by index, so make sure both frames
# describe the same rows before joining them.
assert check_row_continuity(merged_df)
assert validate_columns()
assert len(dataset_metadata) == len(merged_df)

merged_df_with_metadata = pd.concat((merged_df, dataset_metadata), axis=1)

merged_df_with_metadata

# Block 9 (Save as Parquet)

In [None]:
import os

parquet_file_name = "mmlu-dev"  # Name of the parquet file to save

# Re-run the ID and column checks right before saving.
assert check_row_continuity(merged_df_with_metadata)
assert validate_columns()

# 'Id' was only needed for merging and verification; it is not written to the parquet file.
updated_df = merged_df_with_metadata.drop(columns='Id')


def save_as_parquet(df: pd.DataFrame, file_name: str) -> str:
    """Write ``df`` to ``translated_datasets/<file_name>.parquet`` unless it already exists.

    Args:
        df: Frame to save; written without its index.
        file_name: File name without the ``.parquet`` extension.

    Returns:
        The relative path of the parquet file, whether it was written now or
        was already present (an existing file is never overwritten).
    """
    parquet_path = f"translated_datasets/{file_name}.parquet"
    if os.path.exists(parquet_path):
        print(f"File {parquet_path} already exists. Not overwriting.")
        return parquet_path
    # Create the output directory on first use; writing would fail without it.
    os.makedirs(os.path.dirname(parquet_path), exist_ok=True)
    df.to_parquet(parquet_path, index=False)
    print(f"Data saved as {parquet_path}")
    return parquet_path


# Save the final frame; parquet_path is the returned file location.
parquet_path = save_as_parquet(updated_df, parquet_file_name)

# Block 10 (Upload to Hugging Face)
<em>(Make sure to log in with `huggingface-cli login` before running this block)</em>

In [None]:
import pandas as pd

# NOTE: this reassigns parquet_file_name and parquet_path from Block 9, so the file
# loaded here can differ from the one just saved.
parquet_file_name = "mmlu-auxiliary_train" # Comment this line out to keep the parquet file name set in Block 9
parquet_path = f"translated_datasets/{parquet_file_name}.parquet"
print(f"Loading the saved parquet file: {parquet_path}")
saved_df = pd.read_parquet(parquet_path)

In [None]:
# Display the frame that was read back from the parquet file.
saved_df

In [None]:

from datasets import Dataset

repo_id = "0xAIT/sinhala-MMLU"
subset_name = "all" # Make sure to change this to the subset name (Use underscores instead of hyphens e.g. "flan_zsopt")
split_name = "auxiliary_train" # Make sure to change this to the split name (Use underscores instead of hyphens)

# NOTE(review): this rebinds the global `dataset`, which earlier cells
# (check_row_continuity, validate_columns) use as the original pandas frame.
# Re-run the dataset-loading cell before re-running those checks.
dataset = Dataset.from_pandas(saved_df)
# Upload the data as split `split_name` of config `subset_name` in `repo_id`.
dataset.push_to_hub(repo_id, config_name=subset_name, split=split_name)