<a href="https://colab.research.google.com/github/theonlyamos/datasets/blob/main/English_to_Ghanaian_Languages_Dataset_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
filenames = ['https://fisd-dataset.s3.amazonaws.com/fisd-ga-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-ga-10p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-fanti-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-fanti-10p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-akuapim-twi-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-akuapim-twi-10p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-10p.zip']

In [2]:
!mkdir raw_data

In [3]:
import os
from pathlib import Path

main_dir = Path.cwd()
raw_data_dir = main_dir / 'raw_data'

os.chdir(raw_data_dir)

for filename in filenames:
    os.system(f'wget {filename}')
    os.system(f'unzip {filename.split("/")[-1]}')

os.chdir(main_dir)

In [4]:
!rm raw_data/*.zip

In [6]:
!mkdir -p processed_data/audios

In [7]:
# prompt: load all the data.csv in the folders into one dataframe

import os
import pandas as pd
from pathlib import Path

main_dir = Path.cwd()
raw_data_dir = main_dir / 'raw_data'
process_data_dir = main_dir / 'processed_data'
audios_dir = process_data_dir / 'audios'

all_data = pd.DataFrame()

for root, dirs, files in os.walk(raw_data_dir):
    for file in files:
        if file == 'data.csv':
            filepath = os.path.join(root, file)
            try:
                df = pd.read_csv(filepath,  on_bad_lines='skip', encoding='utf-8', sep='\t')
                all_data = pd.concat([all_data, df], ignore_index=True)
            except pd.errors.ParserError as e:
                print(f"Error reading {filepath}: {e}")
                # Handle the error appropriately, e.g., skip the file or log the error
        elif file.endswith('.ogg'):
          filepath = os.path.join(root, file)
          new_filename = os.path.join(audios_dir, file)
          os.rename(filepath, new_filename)

all_data = all_data.drop(columns=all_data.columns[0])
all_data['Language'] = all_data['Audio Filepath'].str.split('/').str[1]
all_data['Audio Filepath'] = 'audios/' + all_data['Audio Filepath'].str.split('/').str[-1]
all_data['Gender'] = all_data['Audio Filepath'].str.split('-').str[0].str[-4].str.lower()
all_data['Age'] = all_data['Audio Filepath'].str.split('-').str[0].str[-2:]

print(all_data.shape)

(115925, 6)


In [27]:
# Save DataFrame as csv file
all_data.to_csv("processed_data/english_to_multi_ghanaian_languages_audio_to_text_dataset.csv", index=False)

# Create a new DataFrame without the 'Audio Filepath' column
new_data = all_data.drop_duplicates(subset=['Translation'])

# Display the new dataset
new_data = new_data.drop(columns=['Audio Filepath', 'Gender', 'Age'])

# Save the new dataset to a new CSV file
new_data.to_csv("processed_data/english_to_multi_ghanaian_languages_text_to_text_dataset.csv", index=False)

# Rename the columns
# new_data['English'] = new_data['Translation']

# Display the new dataset


# Remove duplicate rows based on the 'Translation' column
# new_data = new_data.drop_duplicates(subset=['Translation'])

languages = new_data['Language'].unique()

for language in languages:
  language_dataset = new_data[new_data['Language'] == language]
  language_dataset = language_dataset.rename(columns={'Transcription': language.title(), 'Translation': 'English'})
  language_dataset = language_dataset.drop(columns=['Language'])
  language_dataset.to_csv(f"processed_data/{language}_to_english_text_to_text_dataset.csv", index=False)

# Display the shape of the new dataset
# fanti_dataset.head(10)

# You can save this new dataset to a CSV file if needed

In [46]:
!mkdir dataset_repo

dataset  dataset_repo  processed_data  raw_data  sample_data


In [52]:
from huggingface_hub import HfApi, Repository

from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')
REPO_ID = "theonlyamos/ghanaian_languages_to_english_translation_and_transcription_dataset"

api = HfApi()
repo_url = api.create_repo(repo_id=REPO_ID, token=HF_TOKEN, exist_ok=True)
repo = Repository(local_dir='dataset_repo', clone_from=repo_url, use_auth_token=HF_TOKEN)
repo.git_pull()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/dataset_repo is already a clone of https://huggingface.co/theonlyamos/ghanaian_languages_to_english_translation_and_transcription_dataset. Make sure you pull the latest changes with `repo.git_pull()`.


In [48]:
!cp -r processed_data/* dataset_repo/


In [55]:
from pathlib import Path
import os
from tqdm import tqdm
from huggingface_hub import notebook_login, HfApi

# notebook_login() # Authenticate

api = HfApi() # Initialize the Hugging Face API

main_dir = Path.cwd()
dataset_repo_dir = main_dir / 'dataset_repo'

# Get the total number of files to process
total_files = sum([len(files) for _, _, files in os.walk(dataset_repo_dir)])

# Wrap the loop with tqdm to show progress
with tqdm(total=total_files, desc="Uploading files") as pbar:
    for root, _, files in os.walk(dataset_repo_dir):
        for file in files:
            file_path = os.path.join(root, file)

            # Upload the file using api.upload_file
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=os.path.relpath(file_path, dataset_repo_dir),  # Preserve directory structure
                repo_id=REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN
            )

            pbar.update(1)  # Update the progress bar after uploading each file

print("Dataset uploaded successfully!")

Uploading files:   0%|          | 6/235013 [00:03<59:52:00,  1.09it/s]

english_to_multi_ghanaian_languages_audio_to_text_dataset.csv:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

Uploading files:   0%|          | 128/235013 [00:47<24:11:51,  2.70it/s]


HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/theonlyamos/ghanaian_languages_to_english_translation_and_transcription_dataset/commit/main (Request ID: Root=1-6746129d-115ab92567e217ed0c22d1d1;7802dd54-a77d-4886-b4b6-bdf16e104ed4)

You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.