In [1]:
from database import ParserDBHandler

In [2]:
from pathlib import Path

In [3]:
falignment_folder = Path('..').resolve().parent.joinpath('forced_alignment')
db_path = Path('../website/math_corpus_database.db').resolve()

In [4]:
class DBHandlerFAlignment(ParserDBHandler):
    
    def get_text_sentences(self, text_name):
        self.cur.execute('''SELECT sents.id, sents.sent
                            FROM sents
                            LEFT JOIN texts
                            ON texts.id = sents.text_id
                            WHERE texts.name = (?)''', (text_name,))
        sentences = self.cur.fetchall()
        return sentences
    
    def add_timecodes(self, timecodes):
        self.cur.executemany('''UPDATE sents
                            SET timecode = (?)
                            WHERE id = (?)''', (timecodes))
        self.conn.commit()

In [45]:
db = DBHandlerFAlignment(db_path)

In [46]:
text_name = 'Производная.Начало'

In [47]:
sentences = db.get_text_sentences(text_name)

In [48]:
result_folder = falignment_folder.joinpath(text_name)
if not result_folder.exists():
    result_folder.mkdir()

In [49]:
txt_filepath = result_folder.joinpath('text.txt')
audio_filepath = result_folder.joinpath('video.mp4')
output_filepath = result_folder.joinpath('alignment.txt')
elan_filepath = result_folder.joinpath('subtitles.csv')

Создаем текстовый файл, который потом скормим элайнеру

In [50]:
with open(txt_filepath, 'w', encoding='utf-8') as newf:
    newf.write('\n'.join(str(s[0]) + '|' + s[1] for s in sentences))

Создаем команду для элайнера, которую надо будет написать в терминале и запустить

In [51]:
f'python -m aeneas.tools.execute_task {audio_filepath} {txt_filepath} "task_language=eng|is_text_type=parsed|os_task_file_format=tsv" {output_filepath}'

'python -m aeneas.tools.execute_task C:\\Users\\Kra$0t04ka\\Desktop\\me\\study\\3d_year\\workshops\\forced_alignment\\Производная.Начало\\video.mp4 C:\\Users\\Kra$0t04ka\\Desktop\\me\\study\\3d_year\\workshops\\forced_alignment\\Производная.Начало\\text.txt "task_language=eng|is_text_type=parsed|os_task_file_format=tsv" C:\\Users\\Kra$0t04ka\\Desktop\\me\\study\\3d_year\\workshops\\forced_alignment\\Производная.Начало\\alignment.txt'

Создадим csv файл для elan

In [52]:
import csv

In [53]:
sent_dict = {str(sent[0]): sent[1] for sent in sentences}

In [54]:
with open(output_filepath, 'r') as f:
    with open(elan_filepath, 'w', encoding='utf-8') as newf:
        csv_reader = csv.reader(f, delimiter='\t')
        csv_writer = csv.writer(newf, delimiter='\t')
        for row in csv_reader:
            csv_writer.writerow([row[0], row[1], sent_dict[row[2]]])

Теперь запишем таймкоды в базу данных

In [27]:
timecodes = []
video_start = 26  # секунда по которой обрезали видео
with open(output_filepath, 'r') as f:
    csv_reader = csv.reader(f, delimiter='\t')
    for row in csv_reader:
        timecodes.append((' '.join([str(float(row[0]) + video_start), str(float(row[1]) + video_start)]), int(row[2])))
timecodes

[('26.0 28.8', 59),
 ('28.8 31.96', 60),
 ('31.96 34.24', 61),
 ('34.24 46.0', 62),
 ('46.0 49.239999999999995', 63),
 ('49.239999999999995 53.480000000000004', 64),
 ('53.480000000000004 56.32', 65),
 ('56.32 64.68', 66),
 ('64.68 65.32', 67),
 ('65.32 66.16', 68),
 ('66.16 67.4', 69),
 ('67.4 70.08', 70),
 ('70.08 70.72', 71),
 ('70.72 73.84', 72),
 ('73.84 82.68', 73),
 ('82.68 86.28', 74),
 ('86.28 98.36', 75),
 ('98.36 102.64', 76),
 ('102.64 103.88', 77),
 ('103.88 110.08', 78),
 ('110.08 110.72', 79),
 ('110.72 113.2', 80),
 ('113.2 114.16', 81),
 ('114.16 115.48', 82),
 ('115.48 116.84', 83),
 ('116.84 118.8', 84),
 ('118.8 122.84', 85),
 ('122.84 134.24', 86),
 ('134.24 148.4', 87),
 ('148.4 151.44', 88),
 ('151.44 156.32', 89),
 ('156.32 158.4', 90),
 ('158.4 167.84', 91),
 ('167.84 173.44', 92),
 ('173.44 177.08', 93),
 ('177.08 199.88', 94),
 ('199.88 203.84', 95),
 ('203.84 210.84', 96),
 ('210.84 213.12', 97),
 ('213.12 223.88', 98),
 ('223.88 232.32', 99),
 ('232.32 235.

In [28]:
db.add_timecodes(timecodes)