# Prepare CSJ (Japanese dataset)
1. Grab linguistic units from the dataset
2. Compute dataset statistics

In [None]:
import numpy as np
from glob import glob
from tqdm.autonotebook import tqdm
from parallelspaper.speech_datasets import prep_CSJ
import pandas as pd
from parallelspaper.config.paths import DATA_DIR

In [None]:
# Directory holding the CSJ XML files. The trailing slash matters: the glob
# pattern is appended to this string by plain concatenation.
CSJ_DIR = "/mnt/cube/Datasets/Japanese/XML/BaseXML/core/"

In [None]:
# Collect every XML file directly inside CSJ_DIR. glob() returns matches in
# arbitrary, filesystem-dependent order, so sort them to make the order of the
# files handed to prep_CSJ reproducible across runs and machines.
xml_locs = sorted(glob(CSJ_DIR + '*.xml'))

### Load data

In [None]:
# Parse all CSJ XML files. The call must return exactly twelve values, which
# are unpacked positionally into the names below.
# NOTE(review): `session_lens` appears twice among the targets (7th and 11th),
# so the 7th returned value is silently overwritten by the 11th. Confirm
# against prep_CSJ whether both are the same object or whether one of them
# should be bound to a different name.
(words, pos, mora, phonemes, phones, phone_class, session_lens,
 IPU_lens, phone_lens, word_lens, session_lens, IPU_phonemes) = prep_CSJ(xml_locs)

### Get dataset statistics

In [None]:
# Flatten the nested phoneme and word sequences once and reuse the result for
# both the token count and the unique count, instead of concatenating each
# (potentially large) collection twice.
all_phonemes = np.concatenate(phonemes)
all_words = np.concatenate(words)

# Token counts and inventory sizes.
num_phonemes = len(all_phonemes)
num_words = len(all_words)
# NOTE: despite the name, this counts unique entries of `phonemes`, not `phones`.
unique_phones = len(np.unique(all_phonemes))
unique_words = len(np.unique(all_words))

# Stored as NaN: no word durations are computed in this notebook.
word_durations_s = np.nan
# Length/duration collections are passed through unchanged from prep_CSJ.
word_length_phones = word_lens
phone_duration_s = phone_lens

# Number of phonemes in each IPU, after flattening the outer level of nesting.
utterance_length_phones = [len(ipu) for ipu in np.concatenate(IPU_phonemes)]

# Session-level summaries.
n_sessions = len(phones)
session_durations = [np.sum(lens) for lens in session_lens]
total_duration = np.sum(IPU_lens)

In [None]:
# Keep every statistic next to its column name so values and headers cannot
# drift out of step, then build a single-row DataFrame (one row for this
# dataset) with the columns in insertion order.
csj_stats = {
    'num_phonemes': num_phonemes,
    'num_words': num_words,
    'word_durations_s': word_durations_s,
    'word_length_phones': word_length_phones,
    'phone_duration_s': phone_duration_s,
    'unique_phones': unique_phones,
    'unique_words': unique_words,
    'utterance_length_phones': utterance_length_phones,
    'n_sessions': n_sessions,
    'session_durations': session_durations,
    'total_duration': total_duration,
}
stats_df = pd.DataFrame([list(csj_stats.values())], columns=list(csj_stats))
# Bare expression: displays the table as the cell output.
stats_df

In [None]:
# Persist the single-row statistics table for this language.
stats_path = DATA_DIR / 'stats_df/CSJ_stats_df.pickle'
stats_df.to_pickle(stats_path)

### Make sequence dataframes

In [None]:
# Sequence types returned by prep_CSJ: words, pos, mora, phonemes, phones, phone_class

In [None]:
# Empty table that will receive one row per sequence type.
seq_columns = ['language', 'levels', 'data']
seq_df = pd.DataFrame(columns=seq_columns)

In [None]:
# Append one ['japanese', levels, data] row per sequence type. Each row is
# written at label len(seq_df), i.e. as a new row after the existing ones.
# NOTE(review): the 'levels' strings presumably describe how each sequence is
# nested (e.g. speaker -> word -> phonemes) -- confirm against prep_CSJ.
csj_sequences = (
    ('speaker/IPU/phonemes', IPU_phonemes),
    ('speaker/word', words),
    ('speaker/pos', pos),
    ('speaker/word/mora', mora),
    ('speaker/word/phonemes', phonemes),
    ('speaker/word/phones', phones),
    ('speaker/word/phone_class', phone_class),
)
for seq_levels, seq_data in csj_sequences:
    seq_df.loc[len(seq_df)] = ['japanese', seq_levels, seq_data]

In [None]:
# Persist the sequence table for this language.
seq_path = DATA_DIR / 'speech_seq_df/CSJ_seq_df.pickle'
seq_df.to_pickle(seq_path)