<a href="https://colab.research.google.com/github/tmontaj/Text-AudioDatasets/blob/main/Librispeech/data_download_and_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading and preparing Librispeech dataset

##### Needed libraries

In [1]:
import pandas as pd
import tarfile
import os, sys
import shutil


!pip install wget
import wget
!pip install soundfile
import soundfile
from pathlib import Path



In [2]:
# Progress callback that wget invokes automatically while downloading; it is reused by the download code below.

def _bar_progress(current, total, width=80):
  """
  progress callback for wget.download; keeps rewriting one status line

  Arguments:
  current -- number of bytes downloaded so far
  total -- expected size in bytes; zero or negative is treated as "unknown"
  width -- kept so the function matches the callback signature; not used
  """
  if total > 0:
    progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
  else:
    # Without a usable total a percentage would divide by zero (or come out
    # negative), so only report the bytes received so far.
    progress_message = "Downloading: [%d / ?] bytes" % current
  # Don't use print() as it will print in new line every time.
  sys.stdout.write("\r" + progress_message)
  sys.stdout.flush()

##### Downloading and extracting Librispeech 

In [3]:
def download_librispeech(out, splits):
  """
    Fetch the requested librispeech split archives from openslr

    Arguments:
    out -- path handed to wget as the download destination
    splits -- list with the names of the splits to fetch. splits are:
                    [dev-clean
                    dev-other,
                    test-clean, 
                    test-other,
                    train-clean-100,
                    train-clean-360,
                    train-other-500]
  """
  base_url = "https://www.openslr.org/resources/12/"

  def _announce(name, position, count):
    # One line per split; the trailing newline leaves the split line in
    # place so the byte counter of the download is written below it.
    sys.stdout.write("\r" + "Split: %s [%d / %d]" % (name, position, count) + "\n")
    sys.stdout.flush()

  print("Start downloading librispeech ...")
  total_splits = len(splits)

  for position, name in enumerate(splits, start=1):
    _announce(name, position, total_splits)
    archive_url = base_url + name + ".tar.gz"
    wget.download(archive_url, out=out, bar=_bar_progress)

  print("... Finish downloading librispeech")


In [4]:
# download_librispeech("", ["dev-clean", "dev-other"])

In [5]:
def unzip_librispeech(out, extract_path):
  """
  extracting librispeech data

  Every "*.tar.gz" file directly inside out is extracted into
  extract_path/<file name up to its first dot>; other files are ignored.

  Arguments:
  out -- path of the downloaded tar files 
  extract_path -- path to extract the files on  
  """
  print("Start extracting ...")

  for file_name in sorted(os.listdir(out)):
    if not file_name.endswith('.tar.gz'):
      continue
    # Open the archive through its full path: the bare file name only
    # resolves when out happens to be the current working directory.
    archive_path = os.path.join(out, file_name)
    target_dir = os.path.join(extract_path, file_name.split('.')[0])
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(archive_path, "r:gz") as tar:
      tar.extractall(target_dir)

  print("... Finished extracting")

In [6]:
# unzip_librispeech(".", "tst")

##### Organize directories 

In [7]:
def organize_dirs(extract_path, organized_path):
  """
  flatten the extracted archives into one dataset directory

  Every entry <name> of extract_path is expected to contain
  LibriSpeech/<name>; that directory is moved into organized_path.
  Whatever is left in the LibriSpeech directory of the first entry
  (presumably the metadata files shared by all archives) is then moved
  into organized_path as well.

  Arguments:
  extract_path -- path the archives were extracted on
  organized_path -- path to organize the files in (created when missing)
  """
  print("Start organize_dirs ...")

  # shutil.move renames the source to the destination when the destination
  # does not exist, so make sure we always move *into* a real directory.
  os.makedirs(organized_path, exist_ok=True)

  split_names = sorted(os.listdir(extract_path))
  for split_name in split_names:
    shutil.move(os.path.join(extract_path, split_name, 'LibriSpeech', split_name), organized_path)

  # With nothing extracted there is no first entry to take common files from.
  if split_names:
    common_files_path = os.path.join(extract_path, split_names[0], "LibriSpeech")
    for file_name in os.listdir(common_files_path):
      shutil.move(os.path.join(common_files_path, file_name), organized_path)

  print("... Finished organize_dirs")



In [8]:
# organize_dirs ("./tst", "./tst2")

In [9]:
def _remove(dir_path):
  """
  remove a directory (recursively) or a file

  Paths that do not exist are ignored.

  Arguments:
  dir_path -- path to dirctory or file to remove; shell-style wildcards
              such as "some/dir/*" are expanded
  """
  import glob

  # Expanding the pattern in Python instead of passing a string to `rm -R`
  # keeps paths with spaces or shell metacharacters in one piece and does
  # not depend on an `rm` binary being available.
  targets = glob.glob(dir_path)
  if not targets and os.path.lexists(dir_path):
    # a literal path whose name contains wildcard characters, e.g. "data[1]"
    targets = [dir_path]

  for target in targets:
    if os.path.isdir(target) and not os.path.islink(target):
      shutil.rmtree(target)
    else:
      os.remove(target)

In [10]:
def _rename(dir_path, old_name, new_name):
  """
  thin wrapper over shutil.move to rename directory or file 

  Unlike a shell `mv` string this works for names containing spaces and
  raises (FileNotFoundError) when the source is missing instead of failing
  silently.

  Arguments:
  dir_path -- path to the directory holding the directory or file to rename
  old_name -- old name (original) for directory or file
  new_name -- new name for directory or file
  """
  shutil.move(os.path.join(dir_path, old_name), os.path.join(dir_path, new_name))

In [11]:
# _rename("./", "SPEAKERS2.TXT", "SPEAKERS3.TXT")

In [12]:
def download_and_extract(out, splits, extract_path,
                         organized_path, remove_organized_path=False, download=True):
  """
  download and extract librispeech

  Runs the steps in order: optional download, extraction of the archives
  found in out, optional removal of organized_path, then moving the
  extracted splits into organized_path.

  Arguments:
  out -- path of the downloaded tar files 
  extract_path -- path to extract the files on  
  organized_path -- path to organize the files in  
  remove_organized_path -- flag to remove organized_path (recursively) before organizing
  download -- flag to optionaly skip download the dataset
  splits -- list of splits needed to be downloaded. splits are:
                    [dev-clean
                    dev-other,
                    test-clean, 
                    test-other,
                    train-clean-100,
                    train-clean-360,
                    train-other-500]
  """
  if download:
    download_librispeech(out, splits)
  print("----------------------------")
  unzip_librispeech(out, extract_path)
  print("----------------------------")
  if remove_organized_path:
    _remove(organized_path)
  # organize_dirs moves directories *into* organized_path; if the directory
  # is missing (for instance because it was just removed) shutil.move would
  # rename the first split to organized_path instead, so (re)create it.
  os.makedirs(organized_path, exist_ok=True)
  organize_dirs(extract_path, organized_path)
  print("----------------------------")


In [13]:
# download_and_extract(out = "./",
#                      splits = ["dev-clean", "dev-other"],
#                      extract_path = "./tst",
#                      organized_path = "./tst2"
#                      )

In [14]:
# _remove("./tst")
# _remove("./tst2")

In [15]:
def load(src, splits, remove_organized_path=False, download=True):
  """
  simple download and extract librispeech

  Works inside src/librispeech with three sub directories: out (downloaded
  archives), tmp (extracted archives) and data (final dataset). The content
  of tmp and data is always cleared first; out is only cleared when
  download is set, so existing archives are reused otherwise.

  Arguments:
  src -- path to dataset directory 
  remove_organized_path -- passed on to download_and_extract
  download -- flag to optionaly skip download the dataset
  splits -- list of splits needed to be downloaded. splits are:
                    [dev-clean
                    dev-other,
                    test-clean, 
                    test-other,
                    train-clean-100,
                    train-clean-360,
                    train-other-500]
  """
  src = src+"/librispeech"
  out = src+"/out"
  extract_path = src+"/tmp"
  organized_path = src+"/data"

  # os.makedirs instead of a `mkdir -p` shell string: no dependency on the
  # shell and no surprises with spaces or metacharacters in src.
  os.makedirs(src, exist_ok=True)
  if download:
    os.makedirs(out, exist_ok=True)
    _remove(out+"/*")

  os.makedirs(extract_path, exist_ok=True)
  _remove(extract_path+"/*")

  os.makedirs(organized_path, exist_ok=True)
  _remove(organized_path+"/*")

  download_and_extract(out=out,
                     splits=splits,
                     extract_path = extract_path, 
                     organized_path = organized_path,
                     remove_organized_path = remove_organized_path,
                     download = download
                     )
  print("CONGRATS Librispeach is ready to be used at %s" %(organized_path))


In [None]:
# Run the pipeline for the two dev splits with the default flags
# (download=True); load() works under "dataset/librispeech".
load(src="dataset",
     splits=["dev-clean", "dev-other"])

Start downloading librispeech ...
Split: dev-clean [1 / 2]
Split: dev-other [2 / 2]
Downloading: 75% [237928448 / 314305928] bytes

In [None]:
def clean_speakers_file(src):
  """
  clean speakers file

  Copies src/SPEAKERS.TXT to src/speakers.txt (via a temporary file) with
  two fixed-position repairs:
    line 12 -- the first character is dropped and the line is lower-cased
    line 45 -- the last two "|"-separated fields are merged (joined by a
               space) and the field in front of them is dropped
  The original SPEAKERS.TXT is left in place.

  Arguments:
  src -- path to dataset
  """
  header_line_num = 12   # line turned into the lower-case header row
  broken_line_num = 45   # line whose trailing fields are merged

  # `with` closes both files even when reading or writing fails.
  with open(src+"/SPEAKERS.TXT", "r") as source, open(src+"/SPEAKERS_temp.TXT", "w") as dest:
    for line_num, line in enumerate(source, start=1):
      if line_num == broken_line_num:
        fields = line.split("|")
        # fold the last field into the one before it ...
        fields[-2] = fields[-2]+" "+fields[-1]
        fields.pop(-1)
        # ... and drop the field that preceded the merged pair
        fields.pop(-2)
        line = "|".join(fields)

      if line_num == header_line_num:
        line = line[1:].lower()

      dest.write(line)

  _rename(src, "SPEAKERS_temp.TXT", "speakers.txt")

In [None]:
# clean_speakers_file(src="./")

In [None]:
def load_metadata(data_path):
  """
  read the dataset metadata; for now that is only speakers.txt

  Arguments:
  data_path -- path to dataset

  Returns the speakers table as a DataFrame indexed by its "id" column
  """
  speakers_file = data_path+"/"+'speakers.txt'

  # the table is "|"-separated and its header comes after the first 11 rows
  table = pd.read_csv(speakers_file, sep="|", skiprows=11)
  # header cells are padded with blanks, so trim the names before indexing
  table = table.rename(columns=str.strip)
  return table.set_index("id")

In [None]:
# x = load_metadata("./")
# x.columns

# load_metadata("./")

In [None]:
def load_trans(src, split_name="dev-clean"):
  """
  load single file of trans

  Every non-empty line is parsed as "<speaker>-<chapter>-<index> <text>":
  the part before the first blank is the utterance id, the rest is the text.

  Arguments:
  src -- path to the file
  split_name -- name of the split the file belongs to, e.g. "dev-clean"
                (the default is the value that used to be hard-coded)

  Returns a DataFrame with one row per line and the columns
  data, id, text, speaker, chapter, index, split, isClean
  """
  split = split_name.split("-")
  is_clean = len(split) > 1 and split[1] == "clean"

  rows = []
  # Parse the lines by hand: a comma-separated reader would cut a
  # transcript at every comma it contains.
  with open(src, "r") as trans_file:
    for line in trans_file:
      line = line.strip()
      if not line:
        continue
      utt_id, _, text = line.partition(" ")
      speaker, chapter, index = utt_id.split("-")
      rows.append((line, utt_id, text, speaker, chapter, index, split[0], is_clean))

  columns = ["data", "id", "text", "speaker", "chapter", "index", "split", "isClean"]
  return pd.DataFrame(rows, columns=columns)

In [None]:
# load_trans(src)

In [None]:
def load_all_trans(src):
  """
  find every trans file of the dataset

  Searches src recursively for "*.trans.txt" files, prints each path and
  returns them.

  Arguments:
  src -- path to data directory

  Returns the sorted list of the trans file paths (pathlib.Path objects)
  """
  # search below the directory that was passed in, not a literal "src" folder
  trans_files = sorted(Path(src).rglob('*.trans.txt'))

  for path in trans_files:
    print(path)

  return trans_files



In [None]:
#  load_all_trans(src)

In [None]:
def load_wav(src, split, isClean, chapter, speaker, id):
  """
  read the audio of one utterance

  Arguments:
  src -- path to data directory (the one holding the split directories)
  split -- first part of the split name, e.g. "dev" or "test"
  isClean -- True selects the "<split>-clean" directory, False "<split>-other"
  chapter -- chapter id of the utterance
  speaker -- speaker id of the utterance
  id -- utterance id, i.e. the file name without ".flac"

  Returns the tuple (wav, sample_rate) produced by soundfile.read

  NOTE(review): split names with a size suffix (e.g. "train-clean-100")
  cannot be built from split + isClean alone -- confirm how callers are
  expected to address the train splits.
  """
  split = split + ("-clean" if isClean else "-other")
  # LibriSpeech stores audio as <split>/<speaker>/<chapter>/<id>.flac, so the
  # speaker directory comes before the chapter directory. str() lets ids that
  # were parsed as numbers be used as path parts too.
  path = os.path.join(src, split, str(speaker), str(chapter), str(id)+".flac")
  # the file imports the package as `soundfile`; there is no `sf` alias
  wav, sample_rate = soundfile.read(path)

  return wav, sample_rate

In [None]:
# wav, sample_rate = load_wav(src, split, isClean, chapter, speaker, id)
# wav.shape

In [None]:
def load_sample(data, src, split, isClean, chapter, speaker, id):
  """
  load the audio of one sample and attach it to its metadata row(s)

  Arguments:
  data -- table with at least the columns "split", "isClean" and "id"
          (presumably the frame built from the trans files -- TODO confirm)
  src, split, isClean, chapter, speaker, id -- passed unchanged to load_wav

  Returns the wav value obtained from load_wav
  """
  # narrow the table down to the rows matching split, isClean and id
  data = data[data["split"]==split]
  data = data[data["isClean"]==isClean]
  data = data[data["id"]==id]

  wav, sample_rate = load_wav(src, split, isClean, chapter, speaker, id)

  # NOTE(review): this assigns the pair (wav, sample_rate) to two columns of
  # the filtered slice; whether pandas accepts an array plus a scalar here
  # looks version dependent and may raise -- verify.
  data[['wav', "sample_rate"]] = wav, sample_rate

  # NOTE(review): only wav is returned; the filtered and annotated frame
  # built above is discarded -- confirm the intended return value.
  return wav