# Prepare GECO (German dataset)
1. grab linguistic units from dataset
2. grab dataset statistics

In [None]:
import numpy as np
from glob import glob
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from parallelspaper.speech_datasets import prep_GECO
from parallelspaper.config.paths import DATA_DIR

In [None]:
# Folder searched for the GECO '*.textGrid' annotation files.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
GECO_DIR = '/mnt/cube/Datasets/German/GECO/textgrids/'

### Load data

In [None]:
# Collect every TextGrid file in the GECO folder.
# glob() yields paths in arbitrary, filesystem-dependent order; sorting makes
# the session order (and therefore every per-session list derived from it)
# reproducible across runs and machines.
text_grids = sorted(glob(GECO_DIR+'*.textGrid'))

In [None]:
# Parse all TextGrids in one pass and unpack the seven outputs of prep_GECO:
# durations (track / word / phone / syllable) followed by the symbol
# sequences (words / syllables / phones).
(
    track_durations,
    word_durations,
    phone_durations,
    syll_durations,
    all_words,
    all_sylls,
    all_phones,
) = prep_GECO(text_grids)

### Get dataset statistics

In [None]:
# Flatten the nested annotations once and reuse the results below
# (nesting follows the 'speaker/word/phoneme' and 'speaker/word' levels).
phones_per_word = np.concatenate(all_phones)    # sessions merged, one entry per word
flat_phones = np.concatenate(phones_per_word)   # words merged, one entry per phone
flat_words = np.concatenate(all_words)          # sessions merged, one entry per word

# Token counts.
num_phonemes = len(flat_phones)
num_words = len(flat_words)

# Type (vocabulary) counts.
unique_phones = len(np.unique(flat_phones))
unique_words = len(np.unique(flat_words))

# Durations and lengths.
word_durations_s = word_durations
phone_duration_s = np.concatenate(phone_durations)
word_length_phones = [len(word) for word in phones_per_word]
utterance_length_phones = None  # left unset (None) in this notebook

# Session-level summary.
n_sessions = len(all_words)
session_durations = track_durations
total_duration = np.sum(session_durations)

In [None]:
# Single-row summary table: one column per statistic. Keeping names and
# values side by side in a dict prevents the two lists drifting out of sync.
stats = {
    'num_phonemes': num_phonemes,
    'num_words': num_words,
    'word_durations_s': word_durations_s,
    'word_length_phones': word_length_phones,
    'phone_duration_s': phone_duration_s,
    'unique_phones': unique_phones,
    'unique_words': unique_words,
    'utterance_length_phones': utterance_length_phones,
    'n_sessions': n_sessions,
    'session_durations': session_durations,
    'total_duration': total_duration,
}
stats_df = pd.DataFrame([list(stats.values())], columns=list(stats))
stats_df

In [None]:
# statistics for this language
stats_path = DATA_DIR / 'stats_df/GECO_stats_df.pickle'
# Create the output folder first so saving does not fail when it is missing.
# NOTE(review): relies on DATA_DIR being a pathlib-style path (it is already
# joined with `/` here) -- confirm against parallelspaper.config.paths.
stats_path.parent.mkdir(parents=True, exist_ok=True)
stats_df.to_pickle(stats_path)

### Make sequence dataframes

In [None]:
# Empty table that will hold one row per (language, nesting level) sequence set.
seq_columns = ['language', 'levels', 'data']
seq_df = pd.DataFrame(columns=seq_columns)

In [None]:
# Append one row per level of the hierarchy; each row stores the full nested
# sequence data in its 'data' cell.
for levels, data in (
    ('speaker/word/phoneme', all_phones),
    ('speaker/word', all_words),
    ('speaker/word/sylls', all_sylls),
):
    seq_df.loc[len(seq_df)] = ['german', levels, data]

In [None]:
# Persist the sequence table for this language.
seq_path = DATA_DIR / 'speech_seq_df/GECO_seq_df.pickle'
# Create the output folder first so saving does not fail when it is missing.
# NOTE(review): relies on DATA_DIR being a pathlib-style path (it is already
# joined with `/` here) -- confirm against parallelspaper.config.paths.
seq_path.parent.mkdir(parents=True, exist_ok=True)
seq_df.to_pickle(seq_path)