https://www.kaggle.com/carlfm01/120h-spanish-speech

### Content
Total hours: 120h.

Language: Spanish.

Number of speakers: 17, not counting the collaborative audiobooks.

Type of speech: Clean speech.

A CSV file containing the audio file name and the aligned transcription.

### Inspiration
There is a lack of public-domain Spanish speech data for training, or even for testing, speech models. Test your own model, share your WER, and spot bad transcriptions.

### Provenance
#### Sources
https://librivox.org/

The Spanish book catalog on LibriVox is [here](https://librivox.org/search?primary_key=5&search_category=language&search_page=1&search_form=get_results).

#### Collection methodology

The text was automatically aligned with Windows speech recognition; the alignment was then validated with a Mozilla DeepSpeech model using a few different language models.

### Extra
Collected by: Carlos Fonseca M @ https://github.com/carlfm01, probably using [this tool](https://github.com/carlfm01/librivox-tools)

### License
License : Public Domain

### Downloading 
#### Method 1
Export your cookies from your browser while you are logged in to Kaggle, and put the resulting cookies.txt on your server. Then run:
```
mkdir data

wget -x --load-cookies cookies.txt -P data -nH --cut-dirs=5 https://www.kaggle.com/carlfm01/120h-spanish-speech/download
```
#### Method 2

In [None]:
import glob
import os
#import subprocess
#import tarfile
#import wget

In [None]:
#%matplotlib inline
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import IPython.display as ipd
from ipywidgets import interact
import ipywidgets as widgets
import re

In [None]:
import soundfile as sf
#import IPython.display as ipd
#from ipywidgets import interact
#import ipywidgets as widgets
from pathlib import Path


In [None]:
%ls

In [None]:
#%ls asr-spanish-v1-carlfm01\audios\0000df16-47ea-428f-8367-df2ce365d5c4.wav

In [None]:
#%ls asr-spanish-v1-carlfm01\asr-spanish-v1-carlfm01\audios\0000df16-47ea-428f-8367-df2ce365d5c4.wav

In [None]:
pwd

In [None]:
# Folder holding the decompressed 'CommonVoiceSpanish' data.
# The root is an absolute, machine-specific path.
data_folder_name = os.path.join(*(
    '/gong-asr/kaldi-12/Domain-specific-ESPnet',
    'espnet',
    'egs',
    'spanish_common_voice',
    'asr1',
    'raw_data',
    'CommonVoiceSpanish',
    'decompressed',
))

In [None]:
! ls -l {data_folder_name} | wc -l

In [None]:
# data_folder_name = r'asr-spanish-v1-carlfm01'

In [None]:
# Tab-separated description table stored at the top of the data folder.
description_file_name = os.path.join(data_folder_name, 'train.tsv')
# One specific clip from the 'clips' sub-folder, kept as a reference example.
example_file_name = os.path.join(data_folder_name, 'clips', r'common_voice_es_18306544.mp3')

# example_file_name= os.path.join(data_folder_name, 'clips', r'0000df16-47ea-428f-8367-df2ce365d5c4.wav')
# description_file_name = os.path.join(data_folder_name,'files.csv')

In [None]:
%ls -l {description_file_name}

In [None]:
# df = pd.read_csv(description_file_name, index_col='wav_filename')

# Local import: csv is only needed for the quoting constant used below.
import csv

# Load the tab-separated description table.
# A literal tab separator keeps pandas on its fast C parser; the previous
# two-character pattern '\\t' was treated as a regular expression, which
# forces the slower Python engine and emits a ParserWarning.
# QUOTE_NONE keeps quote characters inside the text as plain data, matching
# how the regex-based split treated them.
df = pd.read_csv(description_file_name, sep='\t', quoting=csv.QUOTE_NONE)

In [None]:
df.head()

In [None]:
# Sanity check: True when no index label of df occurs more than once
# (the number of labels equals the number of distinct labels).
len(df.index) == len(set(df.index))

In [None]:
# Select the row(s) labelled with the first index value of df and display them.
# Despite the name, this is a row of the table, not raw data.
df_raw = df.loc[df.index[0]]
df_raw

In [None]:
#os.path.join(*([data_folder_name]+df_raw.name.split('/')))

In [None]:
def segment_by_df_raw(df_raw):
    """Plot the waveform of one clip, print its sentence and return a player.

    Args:
        df_raw: One row of the description table. Its ``path`` entry names the
            audio file inside ``<data_folder_name>/clips`` and its
            ``sentence`` entry holds the text that is printed.

    Returns:
        An ``IPython.display.Audio`` object built from the loaded samples.
    """
    clip_path = os.path.join(data_folder_name, 'clips', df_raw.path)
    audio, sample_rate = librosa.load(clip_path)

    plt.rcParams['figure.figsize'] = (15, 7)
    # Title the figure with the clip that was actually loaded; the global
    # example file name would be the same for every row.
    plt.title(f'Waveform of Audio Example: {clip_path}')
    plt.ylabel('Amplitude')

    print(df_raw['sentence'])

    # librosa 0.10 removed display.waveplot in favour of display.waveshow;
    # prefer the new function and fall back to the old one on older releases.
    draw_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
    _ = draw_waveform(audio, sr=sample_rate)
    return ipd.Audio(audio, rate=sample_rate)

In [None]:
segment_by_df_raw(df_raw)

In [None]:
# def segment_by_df_raw(df_raw):
#     audio, sample_rate = sf.read(os.path.join(*([data_folder_name]+df_raw.name.split('/'))))
#     plt.rcParams['figure.figsize'] = (15,7)
#     plt.title(f'Waveform of Audio Example: {example_file_name}')
#     plt.ylabel('Amplitude')

#     print(df_raw['wav_filesize'])
#     print(df_raw['transcript'])
#     _ = librosa.display.waveplot(audio)
#     return ipd.Audio(audio, rate=sample_rate)

In [None]:
# segment_by_df_raw(df_raw)

In [None]:
def segment_by_idx(idx):
    """Show the clip stored at positional row ``idx`` of ``df``.

    Looks the row up by position and hands it to ``segment_by_df_raw``,
    returning whatever that function returns.
    """
    selected_row = df.iloc[idx]
    return segment_by_df_raw(selected_row)

In [None]:
# Interactive browser: an integer slider covering every row position of df
# (0 .. number of rows - 1, starting at 10) that calls segment_by_idx on change.
# The trailing semicolon keeps the notebook from echoing interact's return value.
interact(segment_by_idx, idx=widgets.IntSlider(min=0, max=df.shape[0]-1, step=1, value=10));

### Data Preparation

#### "text"

__"text"__ contains the transcriptions of each utterance.<br>
The first element is the utterance-id, which is an arbitrary text string (but if you have speaker information in your setup, you should make the speaker-id a prefix of the utterance-id; this is important for reasons relating to the sorting of these files). The rest of the line is the transcription of each sentence. You don't have to make sure that all words in this file are in your vocabulary; out-of-vocabulary words will get mapped to a word specified in the file data/lang/oov.txt.<br>
Example:
```
s5# head -3 data/train/text
sw02001-A_000098-001156 HI UM YEAH I'D LIKE TO TALK ABOUT HOW YOU DRESS FOR WORK AND
sw02001-A_001980-002131 UM-HUM
sw02001-A_002736-002893 AND IS
```

In [None]:
# Small working sample: the first ten rows of df, displayed for inspection.
sub_df = df.iloc[:10]
sub_df

In [None]:
# Build the Kaldi "text" table: utterance-id -> lower-cased transcription.
# The table loaded from train.tsv keeps the text in 'sentence' and the file
# name in 'path'; the older description file used 'transcript' and carried the
# file name in the index, so both layouts are accepted.
text_column = 'sentence' if 'sentence' in sub_df.columns else 'transcript'
file_names = sub_df['path'] if 'path' in sub_df.columns else sub_df.index

the_series = sub_df[text_column].str.lower()
# The utterance-id is "<stem>_<stem>", where the stem is the file name without
# directory or extension: the first half plays the role of the speaker-id prefix.
the_series.index = [f'{Path(name).stem}_{Path(name).stem}' for name in file_names]
the_series

In [None]:
pwd

In [None]:
# %mkdir -p data

In [None]:
%ls

In [None]:
def save_and_fix(the_df, the_file_name):
    """Write a Series or DataFrame as a space-separated, unquoted text file.

    Each row becomes one line made of the index label followed by the row's
    values, joined by single spaces. No header line is written and no field is
    quoted, even when it contains spaces.

    Args:
        the_df: pandas Series or DataFrame to write.
        the_file_name: Destination path; an existing file is overwritten.
    """
    frame = the_df.to_frame() if isinstance(the_df, pd.Series) else the_df

    def as_field(value):
        # Missing values become empty fields, the way DataFrame.to_csv writes them.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    # The lines are written directly instead of going through to_csv with a
    # placeholder quote character that is stripped afterwards: stripping every
    # '@' also deleted genuine '@' characters from the data, and re-reading the
    # file with the locale encoding could garble non-ASCII (Spanish) text.
    # newline='\n' keeps Unix line endings on every platform.
    with open(the_file_name, 'w', encoding='utf-8', newline='\n') as fw:
        fw.writelines(
            ' '.join(as_field(field) for field in (label, *values)) + '\n'
            for label, values in zip(frame.index, frame.itertuples(index=False, name=None))
        )

In [None]:
save_and_fix(the_series.sort_index(), os.path.join('data','sub_text'))       

#### "wav.scp"

Format:
```
<recording-id> <extended-filename>
```
where "extended-filename" may be an actual filename or a command that extracts a wav-format file. The pipe symbol on the end of the extended-filename specifies that it is to be interpreted as a pipe. If the "segments" file does not exist, the first token on each line of "wav.scp" file is just the utterance id.

In [None]:
# Build the Kaldi "wav.scp" table: recording-id -> audio file name.
# File names come from the 'path' column of the table loaded from train.tsv,
# or from the index for the older description file that was indexed by name.
file_names = list(sub_df['path'] if 'path' in sub_df.columns else sub_df.index)
# The recording-id is "<stem>_<stem>", the stem being the file name without
# directory or extension.
sub_wav_scp_df = pd.DataFrame(
    file_names,
    index=[f'{Path(name).stem}_{Path(name).stem}' for name in file_names],
)
# NOTE(review): the names are written exactly as listed in the description
# table; confirm they resolve from the directory Kaldi is run in.
sub_wav_scp_df

In [None]:
save_and_fix(sub_wav_scp_df.sort_index(), os.path.join('data','sub_wav.scp'))

#### "utt2spk"

Format
```
<utterance-id> <speaker-id>
```
If you have no information at all about the speaker identities, you can just make the speaker-ids the same as the utterance-ids, so the format of the file would be just `<utterance-id> <utterance-id>`.

In [None]:
# Build the Kaldi "utt2spk" table: utterance-id -> speaker-id.
# File names come from the 'path' column of the table loaded from train.tsv,
# or from the index for the older description file that was indexed by name.
file_names = sub_df['path'] if 'path' in sub_df.columns else sub_df.index
# With no speaker information, each file stem doubles as its own speaker-id.
utts = [Path(name).stem for name in file_names]
# The utterance-id is "<speaker-id>_<stem>", so the speaker-id is its prefix.
sub_utt2spk_df = pd.DataFrame(utts, index=[f'{utt}_{utt}' for utt in utts])
sub_utt2spk_df

In [None]:
save_and_fix(sub_utt2spk_df.sort_index(), os.path.join('data','sub_utt2spk'))

In [None]:
# Build the Kaldi "spk2utt" table: speaker-id -> utterance-id, the inverse of
# utt2spk. Each speaker-id in utts maps to the single id "<utt>_<utt>".
# The values are built directly because DataFrame.applymap is deprecated in
# recent pandas releases.
sub_spk2utt_df = pd.DataFrame([f'{utt}_{utt}' for utt in utts], index=utts)
sub_spk2utt_df

In [None]:
save_and_fix(sub_spk2utt_df.sort_index(), os.path.join('data','sub_spk2utt'))

In [None]:
# book_num = '1234'
# chap_num = '123456'

# path = Path(os.path.join(f"{book_num}",f"{chap_num}"))
# path.mkdir(parents=True, exist_ok=True)

In [None]:
# file_path = path.joinpath(f'{book_num}-{chap_num}.trans.txt')

In [None]:
# file_path

In [None]:
# pwd

In [None]:
# data_folder_name = r'asr-spanish-v1-carlfm01'
# description_file_name = os.path.join(data_folder_name,'files.csv')
# df = pd.read_csv(description_file_name, index_col='wav_filename')
# sub_df = df[:10];sub_df

# book_num = '1234'
# chap_num = '123456'
# path = Path(os.path.join(f"{book_num}",f"{chap_num}"))
# path.mkdir(parents=True, exist_ok=True)
# file_path = path.joinpath(f'{book_num}-{chap_num}.trans.txt')

# path = Path(os.path.join(f"{book_num}",f"{chap_num}"))
# path.mkdir(parents=True, exist_ok=True)

# with open(file_path, 'w') as fw:
#     for idx in range(sub_df.shape[0]):        
#         df_raw = sub_df.iloc[idx]
#         #print(df_raw)
#         fw.write(f'{book_num}-{chap_num}-{idx:04}'+' '+df_raw['transcript'].upper())
#         fw.write('\n')
#         source_path = Path(os.path.join(*([data_folder_name]+df_raw.name.split('/'))))
#         #print(source_path)
#         #print(source_path.exists ())
#         destination_path = Path(os.path.join(book_num,chap_num,f'{book_num}-{chap_num}-{idx:04}'+'.wav'))
#         #destination_path.touch()
#         print(destination_path)
#         destination_path.write_bytes(source_path.read_bytes())
#         with open(source_path, 'rb') as src, open(destination_path, 'wb') as dst: dst.write(src.read())

In [None]:
# with open('AWS_SPEAKERS.TXT') as fr:
#     speakers_str = fr.read()

In [None]:
#print(speakers_str)

In [None]:
# Load the space-separated, header-less 'train_utt2dur' file from the current
# working directory into columns 'utt', 'dur' and 'na'.
# NOTE(review): 'na' presumably absorbs an empty field produced by a trailing
# space on each line — confirm against the file.
train_utt2dur_df = pd.read_csv('train_utt2dur', sep= ' ', header = None, names=['utt', 'dur', 'na'])

In [None]:
train_utt2dur_df.head()

In [None]:
type(train_utt2dur_df['na'].values[0])

In [None]:
# Sum of the 'dur' column divided by 3600, i.e. the total duration in hours
# (assumes 'dur' is expressed in seconds — TODO confirm).
train_utt2dur_df['dur'].sum()/3600

In [None]:
# Average value of the 'dur' column across all rows.
train_utt2dur_df['dur'].mean()