<a href="https://colab.research.google.com/github/theonlyamos/ghana-languages-dataset/blob/main/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
filenames = ['https://fisd-dataset.s3.amazonaws.com/fisd-ga-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-ga-10p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-fanti-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-fanti-10p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-akuapim-twi-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-akuapim-twi-10p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-90p.zip',
             'https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-10p.zip']

In [3]:
!mkdir -p datasets

In [6]:
import pandas as pd
from pathlib import Path
import os

# create data frame
data = pd.DataFrame(columns=['Audio Filepath', 'Transcription', 'Translation'])

for url in filenames:
  filename = url.split('/')[-1]
  folder = filename.split('.')[0]
  language = folder.split('-')[1]
  # download dataset using wget
  # os.system(f"wget {url}")
  # unzip file
  # os.system(f"unzip {filename}")
  # create dataframe
  df = pd.read_csv(Path(folder, 'data.csv'), encoding='latin-1', on_bad_lines='skip')
  df[['index', 'Audio Filepath', 'Transcription', 'Translation']] = df['\tAudio Filepath\tTranscription\tTranslation'].str.split('\t', expand=True)

  # Remove leading/trailing whitespace from the newly created columns
  df['Audio Filepath'] = df['Audio Filepath'].str.strip()
  df['Transcription'] = df['Transcription'].str.strip()
  df['Translation'] = df['Translation'].str.strip()
  df['Language'] = language

  # Drop the original combined column
  df = df.drop('index', axis=1)
  df = df.drop('\tAudio Filepath\tTranscription\tTranslation', axis=1)

  # add df to main dataframe
  data = pd.concat([data, df])


In [7]:
data.drop_duplicates(inplace=True)

In [8]:
# do a check for all rows with language as fanti
data[data['Language'] == 'asanti'].head(10)

Unnamed: 0,Audio Filepath,Transcription,Translation,Language
0,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
1,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
2,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
3,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
4,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
5,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
6,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
7,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
8,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti
9,lacuna-audios-train/asanti-twi/audios/AsantiTw...,Maa wÉn ho yÉÉ huam,For them to smell good,asanti


In [9]:
# create new data frame containing only transcription and translation
transcription_translation_df = data[['Language', 'Transcription', 'Translation']]

In [10]:
# rename transcription column name to Phrase
transcription_translation_df.rename(columns={'Transcription': 'Phrase'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_translation_df.rename(columns={'Transcription': 'Phrase'}, inplace=True)


In [11]:
transcription_translation_df.head(10)

Unnamed: 0,Language,Phrase,Translation
0,ga,Bo ni otswa mi aloo?,Did you call me
1,ga,Bo ni otswa mi aloo?,Did you call me
2,ga,Bo ni otswa mi aloo?,Did you call me
3,ga,Bo ni otswa mi aloo?,Did you call me
4,ga,Bo ni otswa mi aloo?,Did you call me
5,ga,Bo ni otswa mi aloo?,Did you call me
6,ga,Bo ni otswa mi aloo?,Did you call me
7,ga,Bo ni otswa mi aloo?,Did you call me
8,ga,Bo ni otswa mi aloo?,Did you call me
9,ga,Bo ni otswa mi aloo?,Did you call me


In [12]:
# save transcription_translation data frame
transcription_translation_df.to_csv(Path('datasets','multi_to_english.csv'), index=False)

In [13]:
!ls datasets/

multi_to_english.csv


In [14]:
# prompt: I need to upload fanti-dataset.csv to huggingface

from huggingface_hub import HfApi, HfFolder, CommitOperationAdd
from huggingface_hub import notebook_login

notebook_login()

api = HfApi()

api.upload_file(
    path_or_fileobj="datasets/multi_to_english.csv",
    path_in_repo="multi_to_english.csv",
    repo_id="theonlyamos/ghana-languages",
    repo_type="dataset",
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

CommitInfo(commit_url='https://huggingface.co/datasets/theonlyamos/ghana-languages/commit/bf87ee34d22f54160c8df2b032224011f1d54764', commit_message='Upload multi_to_english.csv with huggingface_hub', commit_description='', oid='bf87ee34d22f54160c8df2b032224011f1d54764', pr_url=None, pr_revision=None, pr_num=None)