<a href="https://colab.research.google.com/github/tmontaj/Text-AudioDatasets/blob/main/Librispeech/data_download_and_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading and preparing Librispeech dataset

##### needed libraries 

In [22]:
import pandas as pd
import tarfile
import os, sys
import shutil


!pip install wget
import wget


Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=d23f1decf8ce620d876c938939336ca143e387c6f6670714092646263c35f5fd
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
# Progress callback: passed to wget.download as `bar=` and invoked by wget during each download.

def _bar_progress(current, total, width=80):
  progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
  # Don't use print() as it will print in new line every time.
  sys.stdout.write("\r" + progress_message)
  sys.stdout.flush()

##### Downloading and extracting Librispeech 

In [None]:
def download_librispeech(out, splits):
  """
  Download the requested LibriSpeech split archives.

  Each split is fetched as "<split>.tar.gz" from
  https://www.openslr.org/resources/12/ using wget.download, with
  _bar_progress as the byte-level progress callback.

  Arguments:
  out -- path to save the downloaded archives on (forwarded to wget.download)
  splits -- list of splits needed to be downloaded. splits are:
                  [dev-clean,
                  dev-other,
                  test-clean,
                  test-other,
                  train-clean-100,
                  train-clean-360,
                  train-other-500]
  """
  base_url = "https://www.openslr.org/resources/12/"
  total_splits = len(splits)

  for position, split in enumerate(splits, start=1):
    # Announce the split on its own line; sys.stdout.write is used (rather than
    # print) so the leading "\r" first resets the current progress line.
    header = "Split: %s [%d / %d]" % (split, position, total_splits)
    sys.stdout.write("\r" + header + "\n")
    sys.stdout.flush()

    archive_url = base_url + split + ".tar.gz"
    wget.download(archive_url, out=out, bar=_bar_progress)

In [None]:
# download_librispeech("", ["dev-clean", "dev-other"])

Downloading: dev-clean [1 / 2]
Downloading: dev-other [2 / 2]
Downloading: 100% [314305928 / 314305928] bytes

In [None]:
def unzip_librispeech(out, extract_path):
  """
  Extract every ".tar.gz" archive found in a directory.

  Each archive "<name>.tar.gz" is extracted into "<extract_path>/<name>",
  where <name> is the part of the file name before its first ".".
  Entries in `out` that do not end with ".tar.gz" are ignored.

  Arguments:
  out -- path of the directory holding the downloaded tar files
  extract_path -- path to extract the files on
  """
  print("Start extracting ...")

  for file_name in os.listdir(out):
    if not file_name.endswith('.tar.gz'):
      continue

    # Open the archive through its full path so extraction also works when
    # `out` is not the current working directory.
    archive_path = os.path.join(out, file_name)
    target_name = file_name.split('.')[0]

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(archive_path, "r:gz") as tar:
      tar.extractall(extract_path + '/' + target_name)

  print("... Finished extracting")
    
  

In [None]:
# unzip_librispeech(".", "tst")

##### Organize directories 

In [None]:
def organize_dirs (extract_path, organized_path):
  """
  Flatten the extracted archives into one dataset directory.

  For every entry <split> of `extract_path`, the directory
  "<extract_path>/<split>/LibriSpeech/<split>" is moved into `organized_path`.
  Afterwards, whatever is left inside the first split's "LibriSpeech"
  directory is moved into `organized_path` as well.

  Arguments:
  extract_path -- path the archives were extracted on
  organized_path -- path to organize the files in (created if missing)
  """
  print("Start organize_dirs ...")

  # shutil.move renames the source to the destination when the destination
  # does not exist, so the target directory must exist before moving into it.
  os.makedirs(organized_path, exist_ok=True)

  # Sorted so the split used for the leftover files is deterministic.
  split_names = sorted(os.listdir(extract_path))
  for split_name in split_names:
    shutil.move(extract_path+ '/'+ split_name+ '/' + 'LibriSpeech/'+ split_name , organized_path)

  if split_names:
    # Only the first split's leftovers are moved; the other splits' leftovers
    # stay in extract_path.
    common_files_path = extract_path + '/' + split_names[0]+'/' + "LibriSpeech"
    for file_name in os.listdir( common_files_path ):
      shutil.move(common_files_path+'/'+ file_name , organized_path)

  print("... Finished organize_dirs")



In [None]:
# organize_dirs ("./tst", "./tst2")

dev-clean
dev-other


In [33]:
def _remove(dir_path):
  """
  thin wrapper over os.system to remove directory or file 

  Arguments:
  dir_path -- path to dirctory or file to remove  
  """
  os.system('rm -R %s' %dir_path)

In [25]:
def _rename(dir_path, old_name, new_name):
  """
  thin wrapper over os.system to rename directory or file 

  Arguments:
  dir_path -- path to dirctory or file to rename  
  old_name -- old name (original) for directory or file
  new_name -- new name for directory or file
  """
  os.system('mv %s %s' %(dir_path+"/"+old_name, dir_path+"/"+new_name))

In [28]:
# _rename("./", "SPEAKERS2.TXT", "SPEAKERS3.TXT")

In [73]:
def download_and_extract(out, splits, extract_path, organized_path, remove_organized_path=False):
  """
  download, extract and organize librispeech

  Runs download_librispeech, unzip_librispeech and organize_dirs in that
  order, optionally removing organized_path before organizing.

  Arguments:
  out -- path of the downloaded tar files
  splits -- list of splits needed to be downloaded. splits are:
                    [dev-clean,
                    dev-other,
                    test-clean,
                    test-other,
                    train-clean-100,
                    train-clean-360,
                    train-other-500]
  extract_path -- path to extract the files on
  organized_path -- path to organize the files in
  remove_organized_path -- flag to remove organized_path (and everything in it)
                           before organizing; defaults to False
  """
  download_librispeech(out, splits)
  unzip_librispeech(out, extract_path)
  # Use the declared parameter; the previous name `clean_organized_path`
  # was undefined and raised NameError.
  if remove_organized_path:
    _remove(organized_path)
  organize_dirs (extract_path, organized_path)

In [None]:
# Example run: download two small splits into the current directory,
# extract them under ./tst and organize them under ./tst2.
download_and_extract(out = "./",
                     splits = ["dev-clean", "dev-other"],
                     extract_path = "./tst",
                     organized_path = "./tst2",
                     )

In [74]:
def load(src,splits):
  """
  simple download and extract librispeech

  Downloads the archives to "<src>/out", extracts them to "<src>/tmp" and
  organizes the result in "<src>/dataset".

  Arguments:
  src -- path to dataset directory
  splits -- list of splits needed to be downloaded. splits are:
                    [dev-clean,
                    dev-other,
                    test-clean,
                    test-other,
                    train-clean-100,
                    train-clean-360,
                    train-other-500]
  """
  # NOTE(review): nothing below uses "<src>/librispeech"; it is still created
  # to keep the original side effect -- confirm whether it is needed.
  os.makedirs(src+"/librispeech", exist_ok=True)

  out = src+"/out"
  extract_path = src+"/tmp"
  organized_path = src+"/dataset"

  # The download directory must exist beforehand: unzip_librispeech lists it
  # with os.listdir, which fails if the path is missing or is not a directory.
  os.makedirs(out, exist_ok=True)

  download_and_extract(out=out,
                     splits=splits,
                     extract_path = extract_path, 
                     organized_path = organized_path
                     )
  print("CONGRATS Librispeach is ready to be used at %s" %(organized_path))


In [49]:
def clean_speakers_file(src):
  """
  clean speakers file

  Reads "<src>/SPEAKERS.TXT", rewrites two specific lines and stores the
  result as "<src>/speakers.txt" (written to "SPEAKERS_temp.TXT" first and
  then renamed). The original SPEAKERS.TXT is left in place.

  Arguments:
  src -- path to dataset
  """
  # 1-based line numbers of the two lines that get rewritten.
  header_line_num = 12     # load_metadata skips 11 rows, so this is the row pandas reads as header
  malformed_line_num = 45  # presumably a record with a stray "|" in its last field -- TODO confirm

  # `with` guarantees both files are closed even if processing fails midway.
  with open(src+"/SPEAKERS.TXT", "r") as source_file, \
       open(src+"/SPEAKERS_temp.TXT", "w") as cleaned_file:
    for line_num, line in enumerate(source_file, start=1):
      if line_num == malformed_line_num:
        fields = line.split("|")
        # Merge the last two fields into one (space separated) and drop the
        # field that preceded them.
        fields[-2] = fields[-2]+" "+fields[-1]
        fields.pop(-1)
        fields.pop(-2)
        line = "|".join(fields)

      if line_num == header_line_num:
        # Drop the first character and lowercase the rest of the line.
        line = line[1:].lower()

      cleaned_file.write(line)

  # _remove(src+"/SPEAKERS.TXT")
  _rename(src, "SPEAKERS_temp.TXT", "speakers.txt")

In [50]:
# Produce ./speakers.txt from ./SPEAKERS.TXT in the current directory.
clean_speakers_file(src="./")

In [71]:
def load_metadata(data_path):
  """
  Load dataset metadata (currently only speakers.txt).

  Reads "<data_path>/speakers.txt" as a "|"-separated table, skipping the
  first 11 rows, strips surrounding whitespace from the column names and
  uses the "id" column as the index.

  Arguments:
  data_path -- path to dataset

  Returns:
  pandas DataFrame of speakers indexed by "id"
  """
  speakers_file = "/".join([data_path, 'speakers.txt'])

  # use sep | and skip first 11 rows
  return (
    pd.read_csv(speakers_file, sep="|", skiprows=11)
      .rename(columns=lambda label: label.strip())
      .set_index("id")
  )

In [72]:
# x = load_metadata("./")
# x.columns

# Display the speakers table read from ./speakers.txt.
load_metadata("./")

Unnamed: 0_level_0,sex,subset,minutes,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14,F,train-clean-360,25.03,Kristin LeMoine
16,F,train-clean-360,25.11,Alys AtteWater
17,M,train-clean-360,25.04,Gord Mackenzie
19,F,train-clean-100,25.19,Kara Shallenberg
20,F,train-other-500,30.07,Gesine
...,...,...,...,...
8975,F,train-clean-100,25.11,Daisy Flaim
9000,M,train-other-500,27.26,Ramon Escamilla
9022,F,train-clean-360,25.17,Claire M
9023,F,train-clean-360,25.19,P. J. Morgan


In [None]:
import pandas as pd
# NOTE(review): `filename` is not defined in any visible cell of this notebook;
# it must be set before this cell can run -- confirm the intended input file.
# Each line of the file is read into a single column named "code".
df = pd.read_csv(filename,names=['code'])
# Split on the first space only: the left part stays in "code", the remainder
# becomes "name_of_code". `n` is passed by keyword because newer pandas
# versions no longer accept it positionally.
df[['code','name_of_code']] = df["code"].str.split(" ", n=1, expand=True)
# Strip leading/trailing "-" characters from the name part.
df["name_of_code"] = df["name_of_code"].str.strip("-")
print(df)