In [19]:
import os, re
from collections import Counter
from tqdm import tqdm
from collections import OrderedDict
from razdel import tokenize
from bs4 import BeautifulSoup
import warnings
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning, module='bs4')
from pyaspeller import YandexSpeller
from pyaspeller import Word
speller = YandexSpeller()

### Транслитератор lat2cyr

In [206]:
TRANSLIT_TABLE = {
    "`": "'",
    "_": " ",
    "о": "o",
    "а": "a",
    "а": "a",
    "е": "e",
    "с": "c",
    "р": "p", # конец кир-лат
    "ịa": "я",
    "ː" : "",
    ":" : "",
    "ị" : "i",
    "ọ" : "o",
    "ụ" : "u",
    "ae": "aе",  # кир. е
    "oe": "oе",  # кир. е
    "ue": "uе",  # кир. е
    "ee": "eе",  # кир. е
    "ie": "iе",  # кир. е
    "dʒo" : "дё",
    "dʒe" : "де",
    "dʒi" : "ди",
    "dʒu" : "дю",
    "dʒa" : "дя",
    "d'o" : "дё",
    "d'a" : "дя",
    "d'e" : "де",
    "d'u" : "дю",
    "d'i" : "ди",
    "dʒ" : "дь",
    "di" : "ды",
    "ti" : "ты",
    "ja" : "я",
    "je" : "е",
    "jo" : "ё",
    "ju" : "ю",
    "ńa" : "ня",
    "ńo" : "нё",
    "ńi" : "ни",
    "ńe" : "не",
    "ńu" : "ню",
    "ń" : "нь",
    "n'" : "нь",
    "s'" : "сь",
    "sj" : "сь",
    "t'" : "ть",
    "d'" : "дь",
    "l'u" : "лю",
    "l'a" : "ля",
    "l'e" : "ле",
    "l'i" : "ли",
    "r'u" : "рю",
    "r'a" : "ря",
    "r'e" : "ре",
    "r'i" : "ри",
    "r'" : "рь",
    "l'" : "ль",
    "š'" : "шь",
    "zh" : "ж",
    "a" : "а",
    "b" : "б",
    "č" : "ч",
    "ch" : "ч",
    "c" : "ц",
    "d" : "д",
    "e" : "э",
    "f" : "ф",
    "g" : "г",
    "γ" : "г",
    "h" : "х",
    "i" : "и",
    "j" : "й",
    "k" : "к",
    "l" : "л",
    "m" : "м",
    "n" : "н",
    "ŋ" : "ӈ",
    "o" : "о",
    "ö" : "ё",
    "p" : "п",
    "r" : "р",
    "s" : "с",
    "š" : "ш",
    "t" : "т",
    "u" : "у",
    "ü" : "ю",
    "v" : "в",
    "w" : "в",
    "x" : "х",
    "χ" : "х",
    "y" : "ы",
    "ï" : "ы",
    "ə" : "э",
    "z" : "з",
    "ž" : "ж",
    "гэро": "геро",
    "пэрэд": "перед",
    "звэр": "звер",
    "корэн": "корен",
    "чэв": "чев",
    "цэ": "це",
    "сэло": "село",
    "осхо": "ошо",
    "госты": "гости"}
tr_pairs = OrderedDict(TRANSLIT_TABLE)

In [207]:
def lat2cyr(text):
    text = text.lower().strip()
    for lat, cyr in tr_pairs.items():
        text = text.replace(lat,cyr)
    return text

### Транслитератор cyr2cyr

In [208]:
TRANSLIT_TABLE_cyr = {
    "ŋ" : "ӈ",
    "қ" : "к",
    "" : "г",
    "ҕ" : "г",
    "h" : "х",
    "χ" : "х",
    "дьо" : "дё",
    "дьэ" : "де",
    "дьа" : "дя",
    "дьу" : "ню",
    "ньо" : "нё",
    "ньэ" : "не",
    "ньа" : "ня",
    "ньу" : "ню"
}
tr_pairs_cyr = OrderedDict(TRANSLIT_TABLE_cyr)

def cyr2cyr(sent):
    sent = sent.lower().strip()
    for lat, cyr in tr_pairs_cyr.items():
        sent = sent.replace(lat,cyr)
    correction = re.sub(r'(.*?\w[.,!?:])("?\w.*?)', r'\1 \2', sent)
    correction = re.sub(r'(.*?\w\w\w)-(\w\w\w.*?)', r'\1 - \2', correction)
    return correction

### Проверка русского

In [5]:
# я написала спелчекер с яндексовской библиотекой, но работало не очень
# sent = 'я, я-то в эссо эздыла, вот они здэсь отделно дэлаэтэ приэду отличаэтся мэдвэдь иды по-своэму'
# def rucheck(sent):
#     for word in tokenize(sent):
#         correction = Word(word.text).spellsafe
#         if correction:
#             if word.text.replace('э', '').replace('ы', '').replace('ь', '').replace('и', '') == correction.replace('э', '').replace('е', '').replace('ы', '').replace('и', '').replace('ь', ''):  # otdelno
#                 sent = sent[:word.start] + correction + sent[word.stop:]
#     return sent
# 
# rucheck(sent)

In [6]:
wiki_data = open('data/wiki_data.txt', encoding='utf8').readlines()
lib_rus = open('data/lib_rus_ec_2016_utf.txt', encoding='utf8').readlines()  # Источник: minio.cosyco.ru


KeyboardInterrupt



In [None]:
corpus = wiki_data + lib_rus

In [27]:
tokens = []
for line in tqdm(corpus):
    tokens.extend(re.findall(r'[\w-]+', line.lower()))

 11%|███▍                           | 861910/7892774 [12:41<02:48, 41646.13it/s]IOStream.flush timed out
100%|█████████████████████████████| 7892774/7892774 [1:32:19<00:00, 1424.88it/s]


In [36]:
vocab = Counter(tokens)

In [44]:
with open('tokens.txt', 'w', encoding='utf8') as f:
    for w, k in vocab.items():
        f.write(w + ' ' + str(k)+ '\n')

In [23]:
with open('tokens.txt', encoding='utf8') as f:
    word_num = f.readlines()

In [24]:
vocab = {}
for line in tqdm(word_num):
    word, num = line.split()
    vocab[word] = int(num)

100%|█████████████████████████████| 2081652/2081652 [00:08<00:00, 252053.85it/s]


In [25]:
N = sum(vocab.values())
def P(word, N=N):
    return vocab[word] / N

In [26]:
del vocab['еч']
del vocab['иавь']
del vocab['ечь']
del vocab['нань']
del vocab['бее']
del vocab['беем']
del vocab['радытели']
del vocab['явь']
del vocab['нянь']
del vocab['муть']

In [93]:
vocab['охотничее'] = 1
vocab['радители'] = 1
vocab['нартовые'] = 1
vocab['чумработница'] = 1

In [78]:
# Medved podvel
word_rules = {'эе': 'ээ',
              'эссо': 'эссо'}
def check_word(word):
    vars = [word]
    for var in vars:
        for i, letter in enumerate(var):
            if letter == 'э':
                vars.append(var[:i] + 'е' + var[i+1:])
    for var in vars:
        if 'ты' in var:
            vars.append(var.replace('ты', 'ти', 1))
    for var in vars:
        if 'ды' in var:
            vars.append(var.replace('ды', 'ди', 1))
    for var in vars:
        if 'шч' in var:
            vars.append(var.replace('шч', 'щ', 1))
    for var in vars:
            if 'тс' in var:
                vars.append(var.replace('тс', 'ц', 1))
    for var in vars:
        splits  = [(var[:i], var[i:]) for i in range(len(var) + 1)]
        inserts = [L + 'ь' + R for L, R in splits]
    set_vars = set(vars + inserts)
    # print('set_vars', set_vars)
    vocab_vars = set(w for w in set_vars if w in vocab)
    # print('vocab_vars', vocab_vars)
    if word in word_rules:
        correction = word_rules[word]
    elif vocab_vars:
        correction = max(vocab_vars, key=P)
    else:
        correction = word
    return correction

def check_sent(sent):
    sent = sent.lower().strip()
    for word in tokenize(sent):
        correction = check_word(word.text)
        if correction != word.text:
            sent = sent.replace(word.text, correction, 1)
    return sent

# Форматы -> таблица (аудио, фраза, checked_фраза, start, end)

In [28]:
# Получаем soup файла
def get_soup(filename):
    with open('../audio/'+filename) as f:
        file = f.read()
    soup = BeautifulSoup(file)
    return soup

In [29]:
# Получаем таймкоды границ
def get_timecodes(soup):
    timecodes = soup.find_all('time_slot')
    ts_msec = {}
    for timecode in timecodes:
        if timecode.get('time_value'):
            ts_msec[timecode.get('time_slot_id')] = int(timecode.get('time_value'))
    return ts_msec

### Формат id, tx (латиница)

In [259]:
filenames = ["Bystraja District__Conversations__Adukanova_Ichanga_Managic__Managic.eaf",
             "Bystraja District__Narratives__Personal Narratives__Adukanova_DB_life__Adukanova_Darja_Borisovna_2010_02_01.eaf",
             "Bystraja District__Narratives__Personal Narratives__Adukanova_EG_Managic__Adukanova_EG_Managic.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_crossing_river_140609__Amganov_EI_crossing_river.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_ducks_Esso_140609__Amganov_EI_ducks_Esso_140609.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_eggs_Esso_140609__Amganov_eggs.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_first_tractor__Amganov_EI_first_tractor_2010_01_30.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_food_from_sky_Esso_140609__Amganov_food_from_sky.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_kids_uphill_Esso_140609__Amganov_kids_uphill.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_kino_Esso_140609__Amganov_kino.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_live_duck_Esso_140609__Amganov_EI_live_duck_Esso_140609.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_pugalo_Esso_140609__Amganov_pugalo.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_whip_Esso_140609__Amganov_whip.eaf",
             "Bystraja District__Narratives__Personal Narratives__Ichanga_NF_Managic__Ichanga_Nikolaj_Fomich.eaf",
             "Bystraja District__Procedural texts__Axmetova_VI_scraping_fur__Axmetova_VI_pyzhik_LZ_RM.eaf",
             "Sebjan-Küöl__Conversations__RDA_TPK_birth__RDA_TPK_birth.eaf",
             "Sebjan-Küöl__Conversations__RDA_TPK_death__RDA_TPK_death.eaf",
             "Sebjan-Küöl__Conversations__RDA_TPK_delburge__RDA_TPK_delburge.eaf",
             "Sebjan-Küöl__Conversations__RDA_TPK_hongachan_baldarakan__RDA_TPK_hongachan_baldarakan.eaf",
             "Sebjan-Küöl__Conversations__RDA_TPK_names_origins__RDA_TPK_names_origins.eaf",
             "Sebjan-Küöl__Conversations__RDA_TPK_spirits__RDA_TPK_spirits.eaf",
             "Sebjan-Küöl__Explanatory materials__Daily life__Krivoshapkin_SP_oxota__Krivoshapkin_SP_oxota_260210_Sebjan.eaf",
             "Sebjan-Küöl__Explanatory materials__Social Organization__Burtsev_ID_traditions__Burtsev_ID_traditions_040310.eaf",
             "Sebjan-Küöl__Explanatory materials__Social Organization__Kirillina_KK_history__Kirillina_KK_history_210310_Sebjan.eaf",
             "Sebjan-Küöl__Explanatory materials__Social Organization__Krivoshapkina_AX_Sebjan_history1__Krivoshapkina_AX_Sebjan_history1_240310_Sebjan.eaf",
             "Sebjan-Küöl__Explanatory materials__Social Organization__Stepanova_NA_nomadic_school_110310__Stepanova_NA_kochevaja_shkola_110310_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Kejmetinova_AA_headmistress__Kejmetinova_AA_headmistress_Yakutsk_311010.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Kejmetinova_XA_various__Kejmetinova_XA_various.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Krivoshapkin_Misha_school__Krivoshapkin_Misha_school_220310_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Stepanova_ZA_jubki_Aniwrin__Stepanova_ZA_jubki_Aniwrin_230210_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Suzopova_Olja_Eveny_Tompo__Suzopova_Olja_eveny_Tompo_240310_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Suzopova_Olja_pro_babushku__Suzopova_Olja_pro_babushku_240310_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Suzopova_Olja_stado__Suzopova_Olja_stado_240310_Sebjan.eaf",
             "Topolinoe__Explanatory texts__Social Organization__Top09_ZKM_2.4_Hebdjek_Festival_1__Top09_ZKM_2_4_Hebdjek_1a.eaf",
             "Topolinoe__Narratives__Folklore__Top07_GNM1_Ed'ek__Top07_Golikova1_Edjek.eaf",
             "Topolinoe__Narratives__Folklore__Top07_ZKM_1a_Raven__Top_07_ZKM_1a_Raven.eaf",
             "Topolinoe__Narratives__Folklore__Top07_ZKM_1b_Birds__Top07_ZKM_1b_Birds.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_GNM_3_3-fun-misunderst-dial1__Top09_GNM_3_3_Fun_Misunderst_Dial1-reimport.eaf"]

In [260]:
files_phrases_tc_idtx = []

for filename in tqdm(filenames):
    soup = get_soup(filename)
    ts_msec = get_timecodes(soup)
    wavname = filename[:-4]+'.wav'
    
    # У слотов фраз узнаём границы
    parent_anns = {}
    for tier in soup.find_all('tier'):
        if tier.get('linguistic_type_ref') == 'id' and tier.get('tier_id') != 'id@comment':
            al_anns = tier.find_all('alignable_annotation')
            for al_ann in al_anns:
                ann_id = al_ann.get('annotation_id')
                ref1 = al_ann.get('time_slot_ref1')
                ref2 = al_ann.get('time_slot_ref2')
                parent_anns[ann_id] = [ref1, ref2]
    # Соединяем слова в предложения, помним родительский слот фразы.
    ann_ref = 0
    ann_phrase = {}
    for tier in soup.find_all('tier'):
        if tier.get('linguistic_type_ref') == 'tx' and tier.get('tier_id') != 'tx@comment':
            ref_anns = tier.find_all('ref_annotation')
            for ref_ann in ref_anns:
                ann_ref = ref_ann.get('annotation_ref')
                word = ref_ann.find('annotation_value').text
                if ann_ref in ann_phrase:
                    ann_phrase[ann_ref] += ' ' + word
                else:
                    ann_phrase[ann_ref] = word
    for ann in ann_phrase:
        files_phrases_tc_idtx.append([wavname,
                                      ann_phrase[ann].strip(),
                                      check_sent(lat2cyr(ann_phrase[ann])),
                                      ts_msec[parent_anns[ann][0]],
                                      ts_msec[parent_anns[ann][1]]])

100%|███████████████████████████████████████████| 38/38 [00:28<00:00,  1.34it/s]


In [261]:
idtx = pd.DataFrame(files_phrases_tc_idtx, columns = ['filename', 'phrase',
                                                      'checked', 'start', 'end'])

### Формат ev (кириллица)

In [262]:
filenames = ["Bystraja District__Conversations__Paderina_JI_Egorova_RM_conversation_Esso__Paderina_JI_Egorova_RM_razgovor_070709_Esso.eaf",
             "Topolinoe__Music Sound__Songs Dances__Top09_PAI_1a_songs&life__Top_PAI_1a_Songs-Life.eaf",
             "Topolinoe__Narratives__Folklore__Top09_PAI_1b_nimkan&life__Top_PAI_1b_Nimkan-Life.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_DMP_1a_Life__Top09_DMP_1a-Life.ZOOM.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_DMP_1b_Shaman__Top09_DMP_1b_Schaman.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_GNM_1_1-Childhood__Top_gol.n.m.1.1.eaf",
             "Topolinoe__Procedural texts__Top09_GNM_5_Sewing-Unty__TOP_GNM_5_Unty_Sewing.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_SEV_2_1_childhood_history__Top_SEV_2_1_childhood_history.eaf"]

In [263]:
files_phrases_tc_ev = []

for filename in tqdm(filenames):
    wavname = filename[:-4]+'.wav'
    soup = get_soup(filename)
    ts_msec = get_timecodes(soup)
    # Соединяем слова в предложения, помним родительский слот фразы.
    ann_ref = 0
    ann_phrase = {}
    for tier in soup.find_all('tier'):
        if tier.get('tier_id').startswith('ev@'):
            anns = tier.find_all('alignable_annotation')
            for ann in anns:
                ann_ts_1 = ann.get('time_slot_ref1')
                ann_ts_2 = ann.get('time_slot_ref2')
                phrase = ann.find('annotation_value').text
                if re.search(r'[A-Za-z]', cyr2cyr(phrase)):
                    checked = check_sent(lat2cyr(phrase))
                else:
                    checked = cyr2cyr(phrase)
                files_phrases_tc_ev.append([wavname,
                                    phrase.strip(),
                                    checked,
                                    ts_msec[ann_ts_1],
                                    ts_msec[ann_ts_2]])

100%|█████████████████████████████████████████████| 8/8 [00:01<00:00,  4.20it/s]


In [264]:
ev = pd.DataFrame(files_phrases_tc_ev, columns = ['filename', 'phrase',
                                                  'checked', 'start', 'end'])

### Формат id, tx (латиница), ft объединяет

In [265]:
filenames = ["Topolinoe__Explanatory texts__Reindeer__Top09_GNM_1_2-sacred-reindeer__Top09_GNM_1_2-sacred-reindeer.eaf",
             "Bystraja District__Narratives__Folklore__Egorova_RM_fox_wolf__Egorova_RM_fox_wolf_Esso_240609.eaf",
             "Bystraja District__Narratives__Folklore__Egorova_RM_Uindja__Egorova_RM_Uindja_Esso_240609.eaf",
             "Bystraja District__Narratives__Personal Narratives__Amganov_EI_leaving_Twajan__Amganov_EI_leaving_Twajan_2010_01_30.eaf",
             "Bystraja District__Narratives__Personal Narratives__Bataxaeva_TE_childhood__Bataxaeva_TE_childhood_Esso_14-06-09.eaf",
             "Bystraja District__Narratives__Personal Narratives__Solodikov_AA_bearhunt__Solodikov_AA_bearhunt.eaf",
             "Bystraja District__Narratives__Personal Narratives__Solodikov_AA_muxa_roditeli__Solodikov_AA_muxa_roditeli_NA.eaf",
             "Bystraja District__Narratives__Personal Narratives__Solodikov_AA_Tabun_1__Solodikov_AA_tabun_1.eaf",
             "Bystraja District__Narratives__Personal Narratives__Solodikov_AA_Tabun_2__Solodikov_AA_tabun_2.eaf",
             "Bystraja District__Narratives__Personal Narratives__Solodikov_Artem_Nikolaevich_Managich_180609__Solodikov_Artem_Nikolaevich_Managich_180609.eaf",
             "Bystraja District__Narratives__Folklore__Lomovceva_Grigorieva_Duo__Kam10_Lomovceva_Grigorieva_Duo.eaf"]

In [266]:
files_phrases_tc_ft = []

for filename in tqdm(filenames):
    soup = get_soup(filename)
    ts_msec = get_timecodes(soup)
    phrase_timecodes = OrderedDict()
    phrase = []
    for tier in soup.find_all('tier'):
        if tier.get('tier_id').startswith('tx@') and tier.get('tier_id') != 'tx@comment':
            anns = tier.find_all('alignable_annotation')
            for ann in anns:
                ann_ts_1 = ann.get('time_slot_ref1')
                ann_ts_2 = ann.get('time_slot_ref2')
                word = ann.find('annotation_value').text
                phrase.append(word)
                if ann_ts_1 in ts_msec:
                    start = ts_msec[ann_ts_1]
                if ann_ts_2 in ts_msec:
                    end = ts_msec[ann_ts_2]
                    phrase_timecodes[' '.join(phrase)] = (start, end)
                    phrase = []
    wavname = filename[:-4]+'.wav'
    for phrase, timecodes in phrase_timecodes.items():
        files_phrases_tc_ft.append([wavname,
                                    phrase.strip(),
                                    check_sent(lat2cyr(phrase)),
                                    timecodes[0],
                                    timecodes[1]])

100%|███████████████████████████████████████████| 11/11 [00:06<00:00,  1.74it/s]


In [267]:
ft = pd.DataFrame(files_phrases_tc_ft, columns = ['filename', 'phrase',
                                                  'checked', 'start', 'end'])

### Формат id, tx (латиница), ev (кириллица)

In [268]:
filenames = ["Sebjan-Küöl__Explanatory materials__Social Organization__Krivoshapkina_AX_Even_wedding__Krivoshapkina_AX_svatovstvo_25032009_Sebjan.eaf",
             "Sebjan-Küöl__Music Sound__Songs Dances__3_women_songs__3_women_songs_190310_Sebjan.eaf",
             "Sebjan-Küöl__Music Sound__Songs Dances__Zaxarov_AV_song__Zaxarov_AV_song_180310_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Folklore__Kirillina_KK_emchen__Kirillina_KK_emcheni_220310_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Folklore__Neustroev_PD_Sebjan__Neustroev_PD_nimkan_25032009_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Alekseeva_RD_chuchuna_family__Alekseeva_RD_chuchuna_family_190210.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Alekseeva_RD_lost_tapes_znatoki__Alekseeva_RD_lost_tapes_znatoki_190210.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Alekseeva_RD_old_remains_traditions__Alekseeva_RD_old_remains_traditions_190210.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Alekseeva_RD_shatun__Alekseeva_RD_shatun_190210.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Krivoshapkina_MK_nastavlenie_materi__Krivoshapkina_MK_nastavlenie_materi_Sebjan_11032012.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Stepanova_ZA_arrival_Tashkent__Stepanova_ZA_arrival_Tashkent.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Stepanova_ZA_naled__Stepanova_ZA_naled_23022010_Sebjan.eaf",
             "Sebjan-Küöl__Narratives__Personal Narratives__Zaxarov_AV_indjuk_internat__Zaxarov_AV_indjuk_internat_180310_Sebjan_2.eaf",
             "Topolinoe__Conversations__Top09_GNM_SAT_conversation__Top09_GNM_SAT_conversation.eaf",
             "Topolinoe__Explanatory texts__Flora__Top09_ZTA_2_2_healing-shaman-taboo__Top09_ZTA_2_2_HealingDreamsTaboos.eaf",
             "Topolinoe__Explanatory texts__Reindeer__Top09_GNM_4_1_Reindeer-birds-colours-rituals__TOP09_GNM_4_1_Reindeer_masti_ritual.eaf",
             "Topolinoe__Music Sound__Songs Dances__Top09_GMG_1275-bird.song__Top09_GMG_1275-bird.song.eaf",
             "Topolinoe__Music Sound__Songs Dances__Top09_GMG_1276-chipmunk__Top09_GMG_1276-chipmunk.eaf",
             "Topolinoe__Narratives__Folklore__Top09_GMG_1277-Glove.Love__Top09_GMG_1277.Glove.Love.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_GNM_2_3_shamans and kulaks__Top09_GNM_2_3_shamans_and_kulaks.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_GNM_3_2_Funny Story about Father__Top09_GNM_3_2_Funny_Story_about_Father.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_GNM_4_3_Funny Story about a fawn__Top09_GNM_4_3_FunStory_Angacha.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_LVA_LSM_1_1a_Shaman__TOP09_LVA_LSM_1_1a_Shaman-reimport.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_LVA_LSM_1_1b_Shaman__Top09_LVA_LSM_1_1b_Shaman-reimport.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_TMA_1_2_Hunt__Top09_TMA_1_2_Hunt-reimport.eaf",
             "Topolinoe__Narratives__Ritual texts__TOP09_GNM_6_3_Dead Spirit Baby__Top09_GNM_6_3_Dead_Spirit_Baby.eaf",
             "Topolinoe__Narratives__Personal Narratives__Top09_LVA_LSM_1_2_Shaman__Top09_LVA_LSM_1_2_Shaman.eaf"]

In [269]:
files_phrases_tc_idtxev = []

for filename in tqdm(filenames):
    soup = get_soup(filename)
    ts_msec = get_timecodes(soup)

    # У слотов фраз узнаём границы
    parent_anns = {}
    for tier in soup.find_all('tier'):
        if tier.get('linguistic_type_ref') == 'id' and tier.get('tier_id') != 'id@comment':
            al_anns = tier.find_all('alignable_annotation')
            for al_ann in al_anns:
                ann_id = al_ann.get('annotation_id')
                ref1 = al_ann.get('time_slot_ref1')
                ref2 = al_ann.get('time_slot_ref2')
                parent_anns[ann_id] = [ref1, ref2]
    # Соединяем слова в предложения, помним родительский слот фразы.
    ann_ref = 0
    ann_phrase = {}
    for tier in soup.find_all('tier'):
        if tier.get('linguistic_type_ref') == 'tx' and tier.get('tier_id') != 'tx@comment':
            ref_anns = tier.find_all('ref_annotation')
            for ref_ann in ref_anns:
                ann_ref = ref_ann.get('annotation_ref')
                word = ref_ann.find('annotation_value').text
                if ann_ref in ann_phrase:
                    ann_phrase[ann_ref] += ' ' + word
                else:
                    ann_phrase[ann_ref] = word
    wavname = filename[:-4]+'.wav'
    for ann in ann_phrase:
        files_phrases_tc_idtxev.append([wavname,
                                       ann_phrase[ann].strip(),
                                       check_sent(lat2cyr(ann_phrase[ann])),
                                       ts_msec[parent_anns[ann][0]],
                                       ts_msec[parent_anns[ann][1]]])

100%|███████████████████████████████████████████| 27/27 [00:20<00:00,  1.33it/s]


In [270]:
idtxev = pd.DataFrame(files_phrases_tc_idtxev, columns = ['filename', 'phrase',
                                               'checked', 'start', 'end'])

# Итого

In [271]:
spans = pd.concat([idtx, ev, ft, idtxev])
len(spans)

9145

### Очистка от комментариев и пунктуационных проблем

In [282]:
comments = [r'\(русс\.\)', r'\(unclear\)', 'russian', 'Russian', 'laughter', 'stammers', r'\(R\)', r'\?\?\?', r'\[BP:', 'sound', 'incomprehensible']
for comment in comments:
    spans = spans.drop(spans[spans['phrase'].str.contains(comment)].index)
spans = spans.drop(spans[spans['phrase']==''].index)
len(spans)

8681

In [283]:
spans

Unnamed: 0,filename,phrase,checked,start,end,length,place
0,Bystraja District__Conversations__Adukanova_Ic...,Teːleŋni,тэлэӈни,18090,21025,2935,Bystraja District
1,Bystraja District__Conversations__Adukanova_Ic...,V armiju služili...,в армию служили,21025,24065,3040,Bystraja District
2,Bystraja District__Conversations__Adukanova_Ic...,Ọrọttịč teːleŋni,ороттыч тэлэӈни,34415,35575,1160,Bystraja District
3,Bystraja District__Conversations__Adukanova_Ic...,Tugenidu tụrkị...,тугэниду турки,108325,110685,2360,Bystraja District
4,Bystraja District__Conversations__Adukanova_Ic...,"Nu, a sobak obučat', kak vy eto delaete?",ну а собак обучать как вы это делаете,395194,403985,8791,Bystraja District
...,...,...,...,...,...,...,...
2540,Topolinoe__Narratives__Personal Narratives__To...,Tačịn upe ukčeːneddin.,тачин упе укчэнэддын,233362,235870,2508,Topolinoe
2541,Topolinoe__Narratives__Personal Narratives__To...,Tara.,тара,238521,239317,796,Topolinoe
2542,Topolinoe__Narratives__Personal Narratives__To...,Tar.,тар,241800,242230,430,Topolinoe
2543,Topolinoe__Narratives__Personal Narratives__To...,Ewedič.,эвэдыч,58520,59340,820,Topolinoe


In [284]:
def punct_clear(sent):
    chars_to_remove_regex = r'[,\?\.\!\-\;\:\"\“\%\‘\”\�\'\»\«\„\‐\–_]'
    sent = re.sub(chars_to_remove_regex, ' ', sent)
    sent = re.sub(' +', ' ', sent)
    return sent

In [285]:
spans['checked'] = spans['checked'].apply(punct_clear)

In [286]:
spans.to_csv('../spans.csv', index=False) 

### Статистика

In [287]:
spans['length'] = spans['end'] - spans['start']
# столько звучащих минут
spans['length'].sum() / 60000

677.2684666666667

In [288]:
# столько звучащих часов
spans['length'].sum() / 60000 / 60

11.287807777777777

In [289]:
spans['place'] = spans['filename'].apply(lambda x: x.split('__')[0])
spans.head()

Unnamed: 0,filename,phrase,checked,start,end,length,place
0,Bystraja District__Conversations__Adukanova_Ic...,Teːleŋni,тэлэӈни,18090,21025,2935,Bystraja District
1,Bystraja District__Conversations__Adukanova_Ic...,V armiju služili...,в армию служили,21025,24065,3040,Bystraja District
2,Bystraja District__Conversations__Adukanova_Ic...,Ọrọttịč teːleŋni,ороттыч тэлэӈни,34415,35575,1160,Bystraja District
3,Bystraja District__Conversations__Adukanova_Ic...,Tugenidu tụrkị...,тугэниду турки,108325,110685,2360,Bystraja District
4,Bystraja District__Conversations__Adukanova_Ic...,"Nu, a sobak obučat', kak vy eto delaete?",ну а собак обучать как вы это делаете,395194,403985,8791,Bystraja District


In [290]:
spans.groupby('place')['length'].sum() / 60000 / 60

place
Bystraja District    3.479447
Sebjan-Küöl          4.233915
Topolinoe            3.574446
Name: length, dtype: float64

In [291]:
spans.groupby('place')['length'].sum() / 60000

place
Bystraja District    208.766833
Sebjan-Küöl          254.034900
Topolinoe            214.466733
Name: length, dtype: float64