# My Webpage-to-Audio tool

In [1]:
# conda install -c conda-forge newspaper3k
from newspaper import Article

from textblob import TextBlob

import io

## Scrape the webpage

In [5]:
# Page to convert to audio.
url = 'https://martinfowler.com/articles/developer-effectiveness.html'
# Download and parse the page with newspaper; the extracted body text is then
# available as `article.text`.
article = Article(url)
article.download()
article.parse()

# Wrap the text in a TextBlob so later cells can iterate over `blob.sentences`.
blob = TextBlob(article.text)

In [2]:
# For dev, let's use this subset.
# NOTE: this rebinds `blob`, replacing the one built from the scraped article;
# skip this cell to process the full article instead.
blob = TextBlob('''Tim Cochran is a Technical Director for the US East Market at ThoughtWorks. Tim has over 19 years of experience leading work across start-ups and large enterprises in various domains such as retail, financial services, and government. He advises organizations on technology strategy and making the right technology investments to enable digital transformation goals. He is a vocal advocate for the developer experience and passionate about using data-driven approaches to improve it.''')

## Text-to-Speech
Each sentence is synthesized by POSTing it to a text-to-speech service running at `localhost:5002`, powered by [synesthesiam/docker-mozillatts](https://github.com/synesthesiam/docker-mozillatts).

### Via `aiohttp`

In [6]:
import asyncio
import aiohttp
# Req'd for Jupyter.
import nest_asyncio
nest_asyncio.apply()

async def fetch(client, sentence):
    """Request speech audio for one sentence from the TTS service.

    POSTs ``sentence`` as the request body via ``client`` and returns a
    ``(sentence, audio)`` pair. ``audio`` is the raw response body (the WAV
    data), or ``None`` when the service answers with a status other than 200,
    in which case the failure is also printed.
    """
    async with client.post('http://localhost:5002/api/tts', data=sentence) as response:
        if response.status == 200:
            # The response body is the WAV file; read it as raw bytes.
            audio = await response.content.read()
            return sentence, audio
        print(f'Failed at "{sentence}": {response.status}: {response.reason}')
        return sentence, None

    
# Maps each sentence's text (``str(sentence)``) to its synthesized WAV bytes.
sent_to_wave = dict()
async def main():
    """Synthesize all sentences of ``blob`` concurrently into ``sent_to_wave``.

    Every sentence is sent to the TTS service through ``fetch``. Results are
    stored under ``str(sentence)`` -- the same key the later cells use when
    they look sentences up -- rather than under the encoded request payload.
    Sentences for which ``fetch`` returned no audio are left out.
    """
    texts = [str(sentence) for sentence in blob.sentences]
    async with aiohttp.ClientSession() as client:
        # The request body is UTF-8 bytes, with newlines inside a sentence
        # replaced by '. '.
        pending = [
            fetch(client, text.replace('\n', '. ').encode('utf-8'))
            for text in texts
        ]
        # gather() returns results in the order the awaitables were passed,
        # so they line up one-to-one with `texts`.
        responses = await asyncio.gather(*pending)

    # Convert to dict, keyed by the original sentence text:
    for text, (_payload, audio) in zip(texts, responses):
        if not audio:
            continue
        sent_to_wave[text] = audio

loop = asyncio.get_event_loop()
loop.run_until_complete(main())

These requests used to fail with the vanilla Docker image [`synesthesiam/docker-mozillatts`](https://github.com/synesthesiam/docker-mozillatts). With my fix in [this commit](https://github.com/tslmy/tts/commit/78a28a2763646132c042356408adafbd63cba2ee), they now succeed.

### Via `requests`

In [3]:
import requests
from tqdm import tqdm

In [4]:
# Synthesize the sentences one at a time (with a progress bar), mapping each
# sentence's text to the WAV bytes returned by the TTS service.
sent_to_wave = dict()
with tqdm(blob.sentences, desc='Sentences') as pbar:
    for sentence in pbar:
        text = str(sentence)
        # Send the body as explicit UTF-8 bytes, as the aiohttp variant does,
        # so non-ASCII text does not depend on an implicit default encoding.
        resp = requests.post('http://localhost:5002/api/tts', data=text.encode('utf-8'))
        if resp.status_code != requests.codes.ok:
            # requests exposes the numeric status as `status_code`; Response
            # has no `status` attribute, so report via `status_code`.
            print(f'Failed at "{sentence}": {resp.status_code}: {resp.reason}')
            continue
        sent_to_wave[text] = resp.content

Sentences: 100%|██████████| 4/4 [00:11<00:00,  2.77s/it]


## Audio Manipulation

### Via `wave`

One approach is to concatenate the per-sentence clips with the standard-library `wave` module; see [this Stack Overflow answer](https://stackoverflow.com/a/2900266/1147061).

In [39]:
import wave

# Per-sentence raw audio frames and durations (in seconds), plus the WAV
# parameters of the first clip, which are reused when writing combined output.
sent_to_frames = {}
sent_to_durations = {}
wave_params = None
for sentence, wave_bytes in sent_to_wave.items():
    # Parse the WAV payload straight from memory.
    with wave.open(io.BytesIO(wave_bytes), 'rb') as wave_file:
        if wave_params is None:
            wave_params = wave_file.getparams()

        # Audio content: read every frame of the clip.
        num_frames = wave_file.getnframes()
        sent_to_frames[sentence] = wave_file.readframes(num_frames)

        # Duration: frame count divided by frames per second.
        sent_to_durations[sentence] = num_frames / wave_file.getframerate()

In [40]:
sent_to_durations

{'Tim Cochran is a Technical Director for the US East Market at ThoughtWorks.': 5.050340136054421,
 'Tim has over 19 years of experience leading work across start-ups and large enterprises in various domains such as retail, financial services, and government.': 11.563537414965987,
 'He advises organizations on technology strategy and making the right technology investments to enable digital transformation goals.': 9.195102040816327,
 'He is a vocal advocate for the developer experience and passionate about using data-driven approaches to improve it.': 7.1401360544217685}

In [42]:
# Combine sentences into one wav file, in the order they appear in the text:
with wave.open('output.wav', 'wb') as wave_file:
    wave_file.setparams(wave_params)
    with tqdm(blob.sentences, desc='Sentences') as pbar:
        for sentence in pbar:
            frames = sent_to_frames.get(str(sentence))
            if frames is None:
                # No audio exists for this sentence (its TTS request failed
                # and was skipped earlier); leave it out instead of raising
                # KeyError halfway through writing the file.
                continue
            wave_file.writeframes(frames)

Sentences: 100%|██████████| 4/4 [00:00<00:00, 1856.30it/s]


In [43]:
!play output.wav


output.wav:

 File Size: 1.45M     Bit Rate: 353k
  Encoding: Signed PCM    
  Channels: 1 @ 16-bit   
Samplerate: 22050Hz      
Replaygain: off         
  Duration: 00:00:32.95  

In:100%  00:00:32.95 [00:00:00.00] Out:1.45M [      |      ] Hd:4.8 Clip:21   
play WARN rate: rate clipped 7 samples; decrease volume?
play WARN sox: `coreaudio' output clipped 14 samples; decrease volume?
Done.


### Via `pydub`

In [5]:
from pydub import AudioSegment # conda install pydub -c conda-forge -y

In [9]:
# Decode each sentence's WAV bytes (from memory) into a pydub AudioSegment,
# keyed by the same sentence key as `sent_to_wave`.
sent_to_segments = {
    sentence: AudioSegment.from_file(io.BytesIO(wave_bytes), format="wav")
    for sentence, wave_bytes in sent_to_wave.items()
}

In [18]:
# Pause appended after each sentence when the clips are joined below.
silence = AudioSegment.silent(duration=400) # duration is given in milliseconds

In [21]:
# Combine the sentences in reading order, with a pause after each one:
playlist = AudioSegment.empty()
for sentence in blob.sentences:
    segment = sent_to_segments.get(str(sentence))
    if segment is None:
        # No audio exists for this sentence (its TTS request failed and was
        # skipped earlier); leave it out instead of raising KeyError.
        continue
    playlist += segment + silence

In [20]:
# Play the combined audio through pydub's playback helper.
from pydub.playback import play
play(playlist)

In [36]:
# Convert to an in-memory WAV file:
out_bytes = io.BytesIO()
playlist.export(out_bytes, format='wav')
# Rewind the buffer so a later read() starts at the beginning. As the cell's
# last expression, seek() also displays the new position (0).
out_bytes.seek(0)


0

In [37]:
# Persist the exported WAV to disk. getvalue() returns the buffer's entire
# contents regardless of the current read position, so re-running this cell
# never writes an empty file (read() would, once the stream has been consumed).
with open('out.wav', 'wb') as f:
    f.write(out_bytes.getvalue())

In [38]:
!ls -s out.wav

2976 out.wav


In [39]:
!play out.wav


out.wav:

 File Size: 1.52M     Bit Rate: 353k
  Encoding: Signed PCM    
  Channels: 1 @ 16-bit   
Samplerate: 22050Hz      
Replaygain: off         
  Duration: 00:00:34.55  

In:100%  00:00:34.55 [00:00:00.00] Out:1.52M [      |      ]        Clip:21   
play WARN rate: rate clipped 7 samples; decrease volume?
play WARN sox: `coreaudio' output clipped 14 samples; decrease volume?
Done.
