<a href="https://colab.research.google.com/github/tmontaj/Text-AudioDatasets/blob/main/Librispeech/data_download_and_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading and preparing Librispeech dataset

##### Required libraries

In [3]:
import pandas as pd
import tarfile
import os, sys
import shutil


!pip install wget
import wget




##### Downloading and extracting Librispeech 

In [46]:
def download_librispeech(out, splits):
  """
  Download LibriSpeech dataset splits from openslr.org.

  Every split is fetched as "<split_name>.tar.gz" through wget.download.
  A "Split: ..." line is written to stdout before each download and a
  "Downloading: ..." progress line is rewritten in place while it runs.

  Arguments:
  out -- path to save the dataset on (passed to wget.download as `out`)
  splits -- list of splits needed to be downloaded. splits are:
                  [dev-clean,
                  dev-other,
                  test-clean,
                  test-other,
                  train-clean-100,
                  train-clean-360,
                  train-other-500]
  """
  def _splits_url(split_name):
    # Archive URL of a single split.
    return "https://www.openslr.org/resources/12/"+split_name+".tar.gz"

  # Progress callback handed to wget.download through its `bar` argument.
  # `width` is accepted because wget passes it, but it is not used here.
  def _bar_progress(current, total, width=80):
    if total and total > 0:
      progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    else:
      # Total size unknown or zero: a percentage cannot be computed, and
      # dividing by it would raise ZeroDivisionError, so report bytes only.
      progress_message = "Downloading: %d bytes" % current
    # Don't use print() as it will print in new line every time.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

  def _splits_progress(split_name, split_number, splits_count):
    progress_message = "Split: %s [%d / %d]" % (split_name, split_number, splits_count)
    # Leading "\r" returns to the start of the current line, which may still
    # hold the previous download's progress text; the trailing "\n" keeps
    # this message on its own line.
    sys.stdout.write("\r" + progress_message+"\n")
    sys.stdout.flush()

  splits_count = len(splits)

  # Splits are numbered from 1 for display purposes.
  for split_number, split_name in enumerate(splits, start=1):
    _splits_progress(split_name, split_number, splits_count)
    wget.download(_splits_url(split_name), out=out, bar=_bar_progress)

In [5]:
# download_librispeech("", ["dev-clean", "dev-other"])

Downloading: dev-clean [1 / 2]
Downloading: dev-other [2 / 2]
Downloading: 100% [314305928 / 314305928] bytes

In [9]:
def unzip_librispeech(out, extract_path):
  """
  Extract every ".tar.gz" archive found directly inside `out`.

  Each archive is unpacked into extract_path/<name>, where <name> is the
  archive file name up to its first dot ("dev-clean.tar.gz" -> "dev-clean").
  Entries of `out` that do not end with ".tar.gz" are ignored.

  Arguments:
  out -- path of the downloaded tar files
  extract_path -- path to extract the files on
  """
  for file_name in os.listdir(out):
    if not file_name.endswith('.tar.gz'):
      continue

    # os.listdir returns bare names, so the archive must be opened through
    # its full path; otherwise it is only found when `out` is the cwd.
    archive_path = os.path.join(out, file_name)
    target_dir = os.path.join(extract_path, file_name.split('.')[0])

    # The with-block closes the archive even if extraction fails.
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only use this on archives from a trusted source.
    with tarfile.open(archive_path, "r:gz") as tar:
      tar.extractall(target_dir)
  

In [34]:
# unzip_librispeech(".", "tst")

##### Organize directories 

In [36]:
def organize_dirs(extract_path, organized_path):
  """
  Gather the extracted splits into a single directory.

  For every entry <split> of `extract_path`, the directory
  extract_path/<split>/LibriSpeech/<split> is moved into `organized_path`.
  Whatever then remains in the first split's "LibriSpeech" directory is
  moved into `organized_path` as well.

  Arguments:
  extract_path -- path the archives were extracted on
  organized_path -- path to organize the files in (created if missing)
  """
  # Sorted so that the split supplying the remaining files is deterministic.
  split_names = sorted(os.listdir(extract_path))
  if not split_names:
    # Nothing was extracted, so there is nothing to organize.
    return

  # shutil.move renames the source to the destination when the destination
  # does not exist; creating it first makes every split land inside it.
  os.makedirs(organized_path, exist_ok=True)

  for split_name in split_names:
    split_dir = os.path.join(extract_path, split_name, 'LibriSpeech', split_name)
    shutil.move(split_dir, organized_path)

  # The split directory itself has been moved out above, so what is left
  # here are the remaining entries of the first split's "LibriSpeech" folder.
  common_files_path = os.path.join(extract_path, split_names[0], "LibriSpeech")
  for entry in os.listdir(common_files_path):
    shutil.move(os.path.join(common_files_path, entry), organized_path)


In [35]:
# organize_dirs ("./tst", "./tst2")

dev-clean
dev-other


In [37]:
def _remove(dir_path):
  """
  extracting librispeech data

  Arguments:
  dir_path -- path to dirctory or file to remove  
  """
  os.system('rm -R %s' %dir_path)

In [41]:
def download_and_extract(out, splits, extract_path, organized_path, clean_organized_path=False):
  """
  Run the whole preparation pipeline: download, extract, then organize.

  Arguments:
  out -- path the archives are downloaded to and read back from
  splits -- list of split names to download
  extract_path -- path the archives are extracted on
  organized_path -- path the extracted files are organized in
  clean_organized_path -- when truthy, `organized_path` is removed before
                          the files are organized into it
  """
  # Fetch the archives, then unpack them.
  download_librispeech(out=out, splits=splits)
  unzip_librispeech(out=out, extract_path=extract_path)

  # Optionally start from a clean target directory.
  if clean_organized_path:
    _remove(dir_path=organized_path)

  organize_dirs(extract_path=extract_path, organized_path=organized_path)

In [43]:
def load_metadata(data_path):
  """
  Load the speakers table shipped with the dataset.

  Reads data_path/SPEAKERS.TXT as a "|"-separated table, skipping the
  first 11 rows of the file.

  Author's notes on edits to be done in the data before loading it:
    - edit line 45: remove "||"
    - edit ";id" to be "id"

  Arguments:
  data_path -- path to dataset

  Returns:
  pandas DataFrame with the parsed speakers table
  """
  speakers_file = "/".join([data_path, 'SPEAKERS.TXT'])
  return pd.read_csv(speakers_file, sep="|", skiprows=11)

In [45]:
load_metadata("./tst2")

Unnamed: 0,ID,SEX,SUBSET,MINUTES,NAME
0,14,F,train-clean-360,25.03,Kristin LeMoine
1,16,F,train-clean-360,25.11,Alys AtteWater
2,17,M,train-clean-360,25.04,Gord Mackenzie
3,19,F,train-clean-100,25.19,Kara Shallenberg
4,20,F,train-other-500,30.07,Gesine
...,...,...,...,...,...
2479,8975,F,train-clean-100,25.11,Daisy Flaim
2480,9000,M,train-other-500,27.26,Ramon Escamilla
2481,9022,F,train-clean-360,25.17,Claire M
2482,9023,F,train-clean-360,25.19,P. J. Morgan


In [None]:
import pandas as pd

# Reads `filename` into a frame with a single column named 'code', splits each
# value at its first space into 'code' and 'name_of_code', then strips
# leading/trailing "-" characters from 'name_of_code'.
# NOTE(review): `filename` is not defined in any cell visible here; it must be
# set before this cell is run.
df = pd.read_csv(filename,names=['code'])
# `n` must be passed by keyword: pandas 2.x no longer accepts it positionally.
df[['code','name_of_code']] = df["code"].str.split(" ", n=1, expand=True)
df["name_of_code"] = df["name_of_code"].str.strip("-")
print(df)