# Prepare AsiCA (Italian dataset)
1. Extract the linguistic units (utterances, words, phonemes) from the dataset's TextGrid files
2. Compute and save the dataset statistics

In [None]:
from glob import glob
import pandas as pd
from tqdm.autonotebook import tqdm
from praatio import tgio
import numpy as np
from parallelspaper.config.paths import DATA_DIR

In [None]:
# Directory that is searched for the AsiCA '*.TextGrid' files. This is a
# machine-specific absolute path; the trailing '/' is required because the
# glob pattern is appended to it by plain string concatenation.
ASICA_DIR = '/mnt/cube/Datasets/Italian/AsiCA-corpus/AsiCa/DATA/'

### Load data

In [None]:
# Paths of all TextGrid files directly inside ASICA_DIR.
# glob() returns matches in a filesystem-dependent order, so the paths are
# sorted to keep the session order (and every per-session list derived from
# it) reproducible across machines and runs.
text_grids = sorted(glob(ASICA_DIR+'*.TextGrid'))
# Name of the FIRST textgrid only: the file name without its directory and
# without the 9-character '.TextGrid' suffix.
# Raises IndexError when no TextGrid file was found.
tg_name = text_grids[0].split('/')[-1][:-9]

In [None]:
# Use the textgrid names to create a dataframe of textgrid info.
# Each file name (directory and '.TextGrid' suffix removed) is decoded by
# character position:
#   [0:3] place, [3] gen, [4] gender, [5] migration experience,
#   [6] interview type, [7] indexing
# Names shorter than 8 characters raise IndexError.
#
# The rows are collected in a plain list and the DataFrame is built once,
# rather than enlarging the frame row by row with .loc, which copies the
# frame on every insert.
grid_columns = ['grid', 'place', 'gen', 'gender', 'mig', 'int', 'indexing']
grid_rows = []
for grid in text_grids:
    tg_name = grid.split('/')[-1][:-9]
    grid_rows.append([
        grid,          # full path of the TextGrid file
        tg_name[:3],   # place
        tg_name[3],    # gen
        tg_name[4],    # gender
        tg_name[5],    # migration experience
        tg_name[6],    # interview type
        tg_name[7],    # indexing
    ])
all_grids = pd.DataFrame(grid_rows, columns=grid_columns)

#### subset spontaneous grids

In [None]:
# Spontaneous interviews are the rows whose interview-type code ('int') is 'D';
# build the boolean row mask first, then select with it.
is_spontaneous = all_grids['int'] == 'D'
spon_grids = all_grids[is_spontaneous]

### Get phonemes from textgrids

In [None]:
def read_text(fname):
    """Read a latin-1 encoded text file with every line stripped.

    Args:
        fname: path of the file to read.

    Returns:
        One string in which each line of the file has had its leading and
        trailing whitespace removed, the lines being joined by a single
        newline character.
    """
    with open(fname, encoding="latin-1") as handle:
        stripped_lines = [line.strip() for line in handle]
    return '\n'.join(stripped_lines)

def flatlist(list_of_lists):
    """Flatten exactly one level of nesting.

    Args:
        list_of_lists: an iterable whose items are themselves iterable.

    Returns:
        A new list holding the items of every inner iterable, in order.
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

In [None]:
def _keep_label(label):
    """Return True when a tier entry label should be kept.

    A label is dropped when it contains ':' without starting with 'I:', or
    when it is exactly '.'.
    """
    other_colon_label = (':' in label) and (label[:2] != 'I:')
    return (not other_colon_label) and (label != '.')


tier_lens = []   # per grid: end of the last entry minus start of the first
phon_list = []   # per grid: kept labels, with a leading 'I:' removed
label_durs = []  # per grid: end - start of every kept entry
for tg in tqdm(spon_grids.grid.values):
    # NOTE(review): _parseNormalTextgrid is a private praatio helper (leading
    # underscore) and may change between praatio versions.
    text_grid = tgio._parseNormalTextgrid(read_text(tg))
    # The tier is looked up under the file name without directory and without
    # the 9-character '.TextGrid' suffix.
    tier_name = tg.split('/')[-1][:-9]
    entries = text_grid.tierDict[tier_name].entryList
    # Filter once so that labels and durations always describe the same entries.
    kept_entries = [entry for entry in entries if _keep_label(entry.label)]
    labels = [
        entry.label[2:] if entry.label[:2] == 'I:' else entry.label
        for entry in kept_entries
    ]
    label_dur = [entry.end - entry.start for entry in kept_entries]
    label_durs.append(label_dur)
    phon_list.append(labels)
    # Span of the whole tier, computed on the unfiltered entries; an empty
    # tier raises IndexError here.
    tier_len = entries[-1].end - entries[0].start
    tier_lens.append(tier_len)

### get dataset statistics

In [None]:
# Per-utterance counts, nested as session -> utterance.
# Phones: every non-space character of an utterance label counts as one phone.
n_phones = [[len(utterance.replace(" ", "")) for utterance in grid] for grid in phon_list]
# Words: non-empty space-separated tokens. Empty tokens produced by repeated
# spaces are not counted, matching the later word split of phon_list, which
# drops zero-length words as well. Counting them made word durations too short.
n_words = [
    [len([word for word in utterance.split(" ") if len(word) > 0]) for utterance in grid]
    for grid in phon_list
]
# Duration of each utterance divided by its number of phones / words, flattened
# over all sessions. An utterance with a zero count yields inf or nan
# (numpy division) rather than raising.
utterance_durs = np.concatenate(label_durs)
avg_phone_lens = utterance_durs / np.concatenate(n_phones)
avg_word_lens = utterance_durs / np.concatenate(n_words)

In [None]:
# Remove spacing: turn every utterance string into a list of words and every
# word into a list of its characters, giving the nesting
# session -> utterance -> word -> character.
def _split_utterance(utterance):
    """Split an utterance on single spaces, dropping empty tokens, and explode each word into characters."""
    return [list(word) for word in utterance.split(' ') if word]


phon_list = [
    [_split_utterance(utterance) for utterance in session]
    for session in phon_list
]

In [None]:
# Length (in phones) of every word in the corpus: flattening twice goes from
# sessions to utterances to words.
phones_per_word = [len(word) for word in flatlist(flatlist(phon_list))]
# Fraction of words that consist of a single phone.
single_phone_fraction = np.sum(np.array(phones_per_word) == 1) / len(phones_per_word)
# Displayed as the cell output: (median word length, single-phone fraction).
np.median(phones_per_word), single_phone_fraction

In [None]:
# Corpus-level statistics.
# phon_list is nested session -> utterance -> word -> character at this point,
# so one flatlist() yields utterances, two yield words and three yield the
# individual phones.
all_words = flatlist(flatlist(phon_list))
all_phones = flatlist(all_words)

# (column name, value) pairs kept side by side so that names and values cannot
# drift out of step.
stats = [
    ('num_phonemes', len(all_phones)),
    # Count words, not utterances: a single flatlist(phon_list) would return
    # the utterances of all sessions.
    ('num_words', len(all_words)),
    ('word_durations_s', avg_word_lens),
    ('word_length_phones', phones_per_word),
    ('phone_duration_s', avg_phone_lens),
    ('unique_phones', len(np.unique(all_phones))),
    ('unique_words', None),             # not computed here
    ('utterance_length_phones', None),  # not computed here
    ('n_sessions', len(phon_list)),
    ('session_durations', tier_lens),
    ('total_duration', np.sum(tier_lens)),
]

# One-row frame: a single list of values with the matching column names.
stats_df = pd.DataFrame(
    [[value for _, value in stats]],
    columns=[name for name, _ in stats],
)
stats_df

In [None]:
# Write the statistics of this language to a pickle file below DATA_DIR.
stats_path = DATA_DIR / 'stats_df/AsiCA_stats_df.pickle'
stats_df.to_pickle(stats_path)

### make sequence dataframes

In [None]:
# Empty frame with the three columns used for the sequence data.
seq_columns = ['language', 'levels', 'data']
seq_df = pd.DataFrame(columns=seq_columns)

In [None]:
# Merge the utterances of every session into one list per session, i.e. one
# flatlist() call per element of phon_list.
word_seqs = list(map(flatlist, phon_list))

In [None]:
# Append one row at the next integer position: the language, a description of
# the nesting levels, and the nested sequences themselves.
# Running this cell again appends another row, because len(seq_df) has grown.
seq_df.loc[len(seq_df)] = ['italian', 'speaker/word/phoneme', word_seqs]

In [None]:
# Write the sequence dataframe to a pickle file below DATA_DIR.
seq_path = DATA_DIR / 'speech_seq_df/AsiCA_seq_df.pickle'
seq_df.to_pickle(seq_path)