# Big Data Project: Transforming Scientific Articles into Videos with Speech using Apache Spark and Kafka


In [None]:
# Enable IPython's autoreload extension so that edited .py modules are
# re-imported automatically before each cell runs (mode 2 = all modules).
%load_ext autoreload
%autoreload 2

# TTS 


## Basic run (smoke test)

In [None]:
import torch
from datetime import datetime
from ArticleReader.Chunker import Chunker
from ArticleReader.LatexToSpeech import LatexParser
from ArticleReader.Narrator import Narrator
from Benchmarking import Bench
import pandas as pd 
import json

In [None]:
# Source article, plus an output path prefix stamped with the current
# date and hour (yy.mm.dd-HH) that later cells extend with a file suffix.
input_file = "data/arXiv-2106.04624v1/main.tex"
output_file = f"output/{datetime.now():%y.%m.%d-%H}"

# Read the LaTeX source and convert it to plain text; keep a copy of the
# converted text on disk for debugging.
parser = LatexParser()
content = parser.read_latex(input_file)
processed = parser.custom_latex_to_text(content)
parser.save_text(processed, "dbg/spec_my.txt")

# Tables are fetched from the parser after the conversion and saved separately.
tables = parser.get_tables()
parser.save_text(tables, "dbg/tables.tex")

In [None]:
# Split the converted text into chunks (max_len=200) and take a small test
# batch from the chunker for a quick run.
chunker = Chunker(max_len=200)
chunker.split_text_into_chunks(processed)
chunks = chunker.get_test_batch(10, 0)
# Alternative: chunks = chunker.chunks
chunker.save_chunks_as_text(f"{output_file}.md", chunks)
# Show the length of every chunk in the batch.
print("text chunks:", list(map(len, chunks)))

In [None]:

# Create the narrator object that later cells use (e.g. `narrator.save_audio`).
narrator = Narrator()

In [None]:
# break
# waveforms, durations = narrator.text_to_speech_batched(chunks)
# durations_sec = durations / 22050.0

# print("durations: ", durations_sec)

# waveform = torch.cat(waveforms, dim=1)

# print("saving audio")
# narrator.save_audio(output_file + ".wav", waveform)

# narrator.save_video(output_file)

# narrator.generate_srt(chunks, durations_sec, output_file + ".srt")

## Run a batch from the sorted chunks

In [None]:
# Request a batch from the chunker (batch_size=100, start=0).
# Presumably the chunks are sorted by length before batching — TODO confirm.
batch_from_sorted = chunker.get_batch_sorted(batch_size=100, start=0)

In [None]:
# Display the batch for inspection.
batch_from_sorted

In [None]:
# NOTE(review): this conversion is commented out, but the following cells all
# read `batch_converted`; re-enable it before running them, otherwise they
# fail with NameError on a fresh kernel.
#batch_converted = narrator.text_to_speech_df(batch_from_sorted)

In [None]:
# Inspect the column names of the converted batch.
batch_converted.columns

In [None]:
# Display the converted batch for inspection.
batch_converted

In [None]:
# restore order of sentences
# Put the sentences back in order: sort in place, ascending, by the "index"
# column.
batch_converted.sort_values(by="index", inplace=True)
# Join the per-sentence waveforms along dim 1 into one tensor for saving.
waveform = torch.cat(tuple(batch_converted["waveform"]), dim=1)

In [None]:
print("saving batch")
narrator.save_audio(output_file + ".wav", waveform)
chunker.save_chunks_as_text(output_file + ".md", batch_converted.sentence)

In [None]:
# Print every sentence of the batch on its own line. A plain loop is used
# because print() is called only for its side effect; a list comprehension
# would also build a list of None values that the notebook then displays.
for s in batch_converted.sentence:
    print(s)

## TTS benchmarking
Benchmarking batch sizes: how they affect memory utilization.

#### Experiment report v.1  
schema of the json file   
```
variables: 
    device(CPU/GPU) 
    tts_model (tacotron, fastspeech,...)
    vocoder_model (hifigan, ...)
    batch_size (1,2,3, 5, 10, 20, 30, 50, 70, 100, 200)
    chunk_length (50 : 50 : 500)
parameters: 
    time 
    experiment_id 
    chunk_duration 
    avg_percent_silence     
    tts_model:
        model_id (name)
        max_memory_use 
        run_time 
        memory_log []
        exceptions 
        n_threads?
    vocoder_model:
        model_id (name)
        max_memory_use 
        run_time 
        memory_log []
        exceptions 
        n_threads?
```


#### Experiment report v.2 
schema of the json file   
```

    #variables: 
device(CPU/GPU) 
tts_model (tacotron, fastspeech,...)
vocoder_model (hifigan, ...)
batch_size (1,2,3, 5, 10, 20, 30, 50, 70, 100, 200)
chunk_length (50 : 50 : 500)

    #parameters: 
time 
experiment_id
stage: (tts/vocoder)
chunk_durations 
avg_percent_silence
max_memory_use 
run_time 
memory_log []
exceptions 
n_threads?

```


### Gather benchmark data

In [None]:

# Create the benchmarking helper used to run the experiments below.
bench = Bench()

In [None]:
# One benchmark case; the trailing comments list candidate values for the
# corresponding variable.
case = {
    "chunk_length": 50,  # (50 : 50 : 500)
    "batch_size": 2,  # (1, 2, 3, 5, 10, 20, 30, 50, 70, 100, 200)
    "tts_model": "tts-tacotron2-ljspeech",
    "vocoder_model": "tts-hifigan-ljspeech",
    "device": "CPU",
}
case

In [None]:
# experiment_run = bench.run_experiment(processed, case)
# with open("benchmark/" + experiment_run[0]["experiment_id"] + ".json", "w+") as f:
#     json.dump(experiment_run,f)
# print('done')

In [None]:
#experiment_run

## Running multiple experiments

In [None]:
import torch
from datetime import datetime
from ArticleReader.Chunker import Chunker
from ArticleReader.LatexToSpeech import LatexParser
from ArticleReader.Narrator import Narrator
from Benchmarking import Bench
import pandas as pd 
import json

In [None]:
# Same preparation as in the basic run, repeated so this section can be run
# on its own: source article plus an output prefix stamped yy.mm.dd-HH.
input_file = "data/arXiv-2106.04624v1/main.tex"
output_file = f"output/{datetime.now():%y.%m.%d-%H}"

# Read the LaTeX source and convert it to plain text; keep a copy of the
# converted text on disk for debugging.
parser = LatexParser()
content = parser.read_latex(input_file)
processed = parser.custom_latex_to_text(content)
parser.save_text(processed, "dbg/spec_my.txt")

# Tables are fetched from the parser after the conversion and saved separately.
tables = parser.get_tables()
parser.save_text(tables, "dbg/tables.tex")

In [None]:
# Small parameter grid: each key maps to the candidate values for that
# benchmark variable.
smallgrid = {
    "chunk_length": [75, 100],
    "batch_size": (2, 3),
    "tts_model": ["tts-tacotron2-ljspeech"],
    "vocoder_model": ["tts-hifigan-ljspeech"],
    "device": ["CPU"],
}
smallgrid

In [None]:
# Run the benchmark over the small grid with a fresh Bench instance.
# NOTE(review): the meaning of the third positional argument (1) is not
# visible here — TODO document it at the call site.
bench = Bench()
bench.run_experiments(processed, smallgrid, 1)

In [None]:
# Grid with a single long chunk length and two batch sizes; each key maps to
# the candidate values for that benchmark variable.
largegrid = {
    "chunk_length": [1000],
    "batch_size": (100, 2),
    "tts_model": ["tts-tacotron2-ljspeech"],
    "vocoder_model": ["tts-hifigan-ljspeech"],
    "device": ["CPU"],
}
largegrid

In [None]:

# Run the benchmark over the large grid with a fresh Bench instance.
bench = Bench()
bench.run_experiments(processed, largegrid)

In [None]:
# Scratch calculation: divide a raw count by 1.074e+9, which is roughly
# 2**30 (1073741824). Presumably bytes converted to GiB — TODO confirm.
1270546432 /  1.074e+9

In [None]:
# Full parameter grid: chunk lengths from 50 up to (but excluding) 1500 in
# steps of 135, combined with a wide range of batch sizes.
fullgrid = {
    "chunk_length": list(range(50, 1500, 135)),
    "batch_size": (1, 3, 5, 10, 20, 50, 70, 100, 125, 150, 200, 300),
    "tts_model": ["tts-tacotron2-ljspeech"],
    "vocoder_model": ["tts-hifigan-ljspeech"],
    "device": ["CPU"],
}
fullgrid

In [None]:
# Report how many values each grid variable has and the product of those
# counts, printed as the total number of cases.
print("grid sizes:")
print("-"*20)
cases = 1
for axis, values in fullgrid.items():
    n_values = len(values)
    print(axis, ":", n_values)
    cases *= n_values
print("-"*20)
print("total cases: ", cases)

In [None]:
# Run the benchmark over the full grid with a fresh Bench instance.
bench = Bench()
bench.run_experiments(processed, fullgrid)

In [None]:
# Expand the full grid into individual cases (presumably one entry per
# combination of values — TODO confirm) and save them for inspection.
# The original passed `grid`, which is not defined in this notebook; the grid
# used for the full run and sized in the cells before this one is `fullgrid`.
cases = bench.permutations(fullgrid)

# Plain "w" suffices because the file is only written here, never read back;
# the encoding is fixed so the output does not depend on the platform default.
with open("cases.json", "w", encoding="utf-8") as f:
    json.dump(cases, f, indent=4)

In [None]:
# NOTE(review): `bnch_data` is not assigned anywhere in the visible notebook;
# it must be created (e.g. loaded from saved benchmark results) before this
# cell can run — TODO add the loading step.
bnch_data.columns

In [None]:
# Check the type of the first timestamp stored in the TTS memory log.
# NOTE(review): `experiment_run` is only assigned in a commented-out cell, and
# that cell indexes it as a list (`experiment_run[0]`) while this line indexes
# it as a dict — TODO confirm which shape is current.
type(experiment_run["parameters"]["tts_model"]["memory_log"]["time"][0])

### Examine individual memory logs

In [None]:
# Load the vocoder profiler's memory log into a DataFrame and add a "time"
# column that interprets column 0 as a timestamp in seconds.
log = pd.DataFrame(narrator.profilers["vocoder"].memory_log)
log = log.assign(time=pd.to_datetime(log[0], unit="s"))

In [None]:
# Plot column 1 of the memory log against the derived "time" column.
log.plot(x='time', y=1)

In [None]:
# Number of (rows, columns) in the memory log frame.
log.shape

In [None]:
# Show the installed PyTorch version through the public attribute instead of
# reaching into the internal `torch.torch_version` module.
torch.__version__

# Trash

## GPU benchmarking

In [None]:
import torch
#import torchvision.models as models
from torchaudio.pipelines import Tacotron2TTSBundle 
from torch.profiler import profile, record_function, ProfilerActivity



In [None]:
# model = models.hubert_base()
# inputs = torch.randn(5, 224)

# with profile(activities=[ProfilerActivity.CPU],
#         profile_memory=True, record_shapes=True) as prof:
#     model(inputs)

# print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

In [None]:
# Grab the stored profile for the tacotron stage.
# NOTE(review): this reads `narrator.profiles`, while the memory-log cell
# earlier reads `narrator.profilers` — TODO confirm both attributes exist.
ttcp = narrator.profiles['tacotron']


In [None]:
# Render the averaged profile as a text table limited to 20 rows, sorted by
# self CPU memory usage, and display it.
tbl = ttcp.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=20)
tbl

In [None]:
# Fetch the events recorded in the tacotron profile (presumably a
# torch.profiler result, so these are the unaggregated events — TODO confirm).
eve = ttcp.events()

In [None]:
# Pick one arbitrary event (index 1000) to explore its structure.
eve1000 = eve[1000]


In [None]:
# List the attributes available on a single event.
dir(eve1000)

In [None]:
# Start of the sample event's time range.
eve1000.time_range.start

In [None]:
# Try to view the raw event list as a DataFrame.
pd.DataFrame(eve)

In [None]:
# Dump the sample event's instance attributes and their values.
eve1000.__dict__

In [None]:
# Keep the averaged profile entries for the inspection cells below.
ka = ttcp.key_averages()

In [None]:
# Check which container type key_averages() returned.
type(ka)

In [None]:
# Number of averaged entries.
len(ka)

In [None]:
# Take the first averaged entry and display it.
ka0 = ka[0]
ka0

In [None]:
# List the attributes available on an averaged entry.
dir(ka0)

In [None]:
# Check the type of the entry's key.
type(ka0.key)

In [None]:
# List the attributes of the export_memory_timeline method object itself
# (calls __dir__ directly, so the result is not sorted as dir() would be).
ttcp.export_memory_timeline.__dir__()

In [None]:
# Type of the object the export_memory_timeline method is bound to.
type(ttcp.export_memory_timeline.__self__)

In [None]:
# Export the tacotron profile's memory timeline to a file in the working
# directory. Presumably the ".raw.json.gz" suffix selects a gzipped raw JSON
# format — TODO confirm against the torch.profiler documentation.
ttcp.export_memory_timeline("CPU_tacotron.raw.json.gz")

In [None]:
# Grab the stored profile for the vocoder stage.
vcdp = narrator.profiles['vocoder']


In [None]:
# Print the averaged vocoder profile as a table limited to 10 rows, sorted by
# self CPU memory usage.
print(vcdp.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

# End