# Big Data Project: Transforming Scientific Articles into Videos with Speech using Apache Spark and Kafka


In [None]:
# Reload imported project modules (.py files) automatically whenever they change,
# so edits are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Spark 

In [None]:
#import findspark
import os
import sys
#from pyspark.sql import SparkSession

In [None]:
# Show which Python interpreter this kernel is running on.
sys.executable

In [None]:
# Show the kernel's working directory; the relative paths used in this
# notebook (data/, output/, dbg/, benchmark/) resolve against it.
os.getcwd()

In [None]:
# Set Spark environment variables
# os.environ['SPARK_HOME'] = '/path/to/spark'
# os.environ['PYSPARK_PYTHON'] = 'python'
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'python'
# os.environ['JAVA_HOME'] = '/path/to/java'

# Initialize findspark
#findspark.init()

# Create SparkSession and print version
# spark = SparkSession.builder.appName("PySpark Test").getOrCreate()
# print(f"Spark version: {spark.version}")
# spark.stop()

**TODO:** create folders for `data`, `checkpoints`, `output`, `dbg`


# TTS 
## Basic run just for test 

In [None]:
import torch
from datetime import datetime
from ArticleReader.LatexToSpeech import LatexParser, Chunker
from ArticleReader.Narrator import Narrator
import pandas as pd 

In [None]:
# Input article (LaTeX source) and the timestamped prefix for this run's outputs.
input_file = "data/arXiv-2106.04624v1/main.tex"
output_file = "output/" + datetime.now().strftime(r"%y.%m.%d-%H")

# Make sure the folders this notebook writes to exist, so the cells also run
# on a fresh checkout (exist_ok=True makes this a no-op when they are present).
os.makedirs("output", exist_ok=True)
os.makedirs("dbg", exist_ok=True)

# Parse the LaTeX source; `processed` is the text that the chunking and
# benchmarking cells consume. A copy is saved under dbg/ for inspection.
parser = LatexParser()
content = parser.read_latex(input_file)
processed = parser.custom_latex_to_text(content)
parser.save_text(processed, "dbg/spec_my.txt")

# Save whatever the parser reports via get_tables() separately.
tables = parser.get_tables()
parser.save_text(tables, "dbg/tables.tex")

In [None]:
# Split the processed text into chunks (max_len=200) and take a small test
# batch via get_test_batch(10, 0) for the trial run.
chunker = Chunker(max_len=200)
chunker.split_text_into_chunks(processed)
chunks = chunker.get_test_batch(10, 0)
# alternative: chunks = chunker.chunks

# Write the selected chunks next to the other outputs and report their lengths.
chunker.save_chunks_as_text(f"{output_file}.md", chunks)
print("text chunks:", list(map(len, chunks)))

**Experiment report v.1**  
Schema of the JSON file:
```
variables: 
    device(CPU/GPU) 
    tts_model (tacotron, fastspeech,...)
    vocoder_model (hifigan, ...)
    batch_size (1,2,3, 5, 10, 20, 30, 50, 70, 100, 200)
    chunk_length (50 : 50 : 500)
parameters: 
    time 
    experiment_id 
    chunk_duration 
    avg_percent_silence     
    tts_model:
        model_id (name)
        max_memory_use 
        run_time 
        memory_log []
        exceptions 
        n_threads?
    vocoder_model:
        model_id (name)
        max_memory_use 
        run_time 
        memory_log []
        exceptions 
        n_threads?
```


**Experiment report v.2**  
Schema of the JSON file:
```

    #variables: 
device(CPU/GPU) 
tts_model (tacotron, fastspeech,...)
vocoder_model (hifigan, ...)
batch_size (1,2,3, 5, 10, 20, 30, 50, 70, 100, 200)
chunk_length (50 : 50 : 500)

    #parameters: 
time 
experiment_id
stage: (tts/vocoder)
chunk_durations 
avg_percent_silence
max_memory_use 
run_time 
memory_log []
exceptions 
n_threads?

```


## TTS benchmarking
Benchmarking batch sizes - how they impact memory utilization

### Gather benchmark data

In [None]:
from Benchmarking import Bench
# Benchmark driver; its run_experiment() is called with the processed text
# and a case dict in the cells that follow.
bench = Bench()

In [None]:
# One benchmark configuration. The trailing comments list the candidate
# values for the settings that are meant to be swept.
case = dict(
    device="CPU",
    tts_model="tacotron",
    vocoder_model="hifigan",
    batch_size=2,  # (1, 2, 3, 5, 10, 20, 30, 50, 70, 100, 200)
    chunk_length=50,  # (50 : 50 : 500)
)
case

In [None]:
# Run one benchmark for `case` on the processed article text and store the
# result as JSON under benchmark/, in a file named after the experiment id.
import json

experiment_run = bench.run_experiment(processed, case)

# The result is indexed as a sequence of records whose first entry carries the
# experiment id. NOTE(review): confirm against Bench.run_experiment.
os.makedirs("benchmark", exist_ok=True)  # the output folder may not exist yet
with open("benchmark/" + experiment_run[0]["experiment_id"] + ".json", "w") as f:
    json.dump(experiment_run, f)


In [None]:
# Inspect the result returned by bench.run_experiment().
experiment_run

### Load gathered data

In [None]:
from pathlib import Path
import pandas as pd


In [None]:
# Load every saved benchmark JSON file into a single DataFrame.
# sorted() gives a stable file order (glob order is arbitrary) and turns the
# one-shot generator into a list that can still be inspected afterwards.
paths = sorted(Path("benchmark").glob("*.json"))
if not paths:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        "no benchmark/*.json files found - run the 'Gather benchmark data' cells first"
    )

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row labels.
bnch_data = pd.concat(
    [pd.read_json(p, orient="records") for p in paths],
    ignore_index=True,
)

In [None]:
# Display the combined benchmark table.
bnch_data

In [None]:
# Check the Python type of the first "time" entry in the TTS model's memory log.
# NOTE(review): `experiment` is not defined in any visible cell (the benchmark
# result is stored in `experiment_run`), and this nested-dict access does not
# match the list indexing used when the result is saved - confirm which
# structure is current before relying on this cell.
type(experiment["parameters"]["tts_model"]["memory_log"]["time"][0])

### Examine individual memory logs

In [None]:
# Turn the vocoder profiler's memory log into a DataFrame and add a `time`
# column by reading column 0 as seconds.
# NOTE(review): `narrator` is not created in any visible cell - a Narrator
# instance must exist in the kernel before this runs.
log = pd.DataFrame(narrator.profilers['vocoder'].memory_log).assign(
    time=lambda frame: pd.to_datetime(frame[0], unit='s')
)

In [None]:
# Plot column 1 of the memory log against the converted timestamps.
log.plot(x='time', y=1)

In [None]:
# Number of rows and columns in the memory log frame.
log.shape

In [None]:
# Show the installed PyTorch version.
# NOTE(review): this reaches into the `torch.torch_version` module;
# `torch.__version__` is the usual public spelling - consider switching.
torch.torch_version.internal_version

# Trash

## GPU benchmarking

In [None]:
import torch
#import torchvision.models as models
from torchaudio.pipelines import Tacotron2TTSBundle 
from torch.profiler import profile, record_function, ProfilerActivity



In [None]:
# model = models.hubert_base()
# inputs = torch.randn(5, 224)

# with profile(activities=[ProfilerActivity.CPU],
#         profile_memory=True, record_shapes=True) as prof:
#     model(inputs)

# print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

In [None]:
# Grab the profile stored under the 'tacotron' key.
# NOTE(review): `narrator` is not created in any visible cell, and this reads
# `narrator.profiles` while the memory-log cell reads `narrator.profilers` -
# confirm that both attributes exist on Narrator.
ttcp = narrator.profiles['tacotron']


In [None]:
# Summarise the Tacotron profile: up to 20 rows, sorted by self CPU memory usage.
tbl = ttcp.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=20)
# Print the table instead of echoing it: echoing a multi-line string shows its
# repr with escaped newlines. This matches how the vocoder table is shown.
print(tbl)

In [None]:
# All events recorded in the Tacotron profile.
eve = ttcp.events()

In [None]:
# Pick one event to inspect.
# NOTE(review): assumes the profile holds more than 1000 events.
eve1000 = eve[1000]


In [None]:
# List the attributes available on a single profiler event.
dir(eve1000)

In [None]:
# Start of this event's time range.
eve1000.time_range.start

In [None]:
# Try loading the event list straight into a DataFrame for inspection.
pd.DataFrame(eve)

In [None]:
# Raw instance attributes of the selected event.
eve1000.__dict__

In [None]:
# Aggregated statistics of the Tacotron profile (the object the summary table is rendered from).
ka = ttcp.key_averages()

In [None]:
# Which class holds the aggregated statistics?
type(ka)

In [None]:
# Number of aggregated entries.
len(ka)

In [None]:
# Take the first aggregated entry and display it.
ka0 = ka[0]
ka0

In [None]:
# List the attributes available on an aggregated entry.
dir(ka0)

In [None]:
# Type of the entry's `key` attribute.
type(ka0.key)

In [None]:
# Attributes of the bound export_memory_timeline method (exploration only).
ttcp.export_memory_timeline.__dir__()

In [None]:
# Class of the object the export_memory_timeline method is bound to.
type(ttcp.export_memory_timeline.__self__)

In [None]:
# Export the Tacotron profile's memory timeline to a file in the working directory.
# NOTE(review): per the PyTorch profiler docs the output format appears to follow
# the file suffix, and memory profiling must have been enabled when the profile
# was recorded - confirm both.
ttcp.export_memory_timeline("CPU_tacotron.raw.json.gz")

In [None]:
# Grab the profile stored under the 'vocoder' key.
# NOTE(review): `narrator` is not created in any visible cell - a Narrator
# instance must exist in the kernel before this runs.
vcdp = narrator.profiles['vocoder']


In [None]:
# Summarise the vocoder profile: up to 10 rows, sorted by self CPU memory usage.
print(vcdp.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

# End