This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Feature/text to speech #2

Merged
merged 4 commits into from Jun 3, 2022
7 changes: 5 additions & 2 deletions .github/workflows/lint.yaml → .github/workflows/ci.yaml
@@ -1,12 +1,12 @@
name: lint
name: Rust package
on:
pull_request: {}
push:
branches:
- main
jobs:
test:
name: lint
name: test
runs-on: ubuntu-20.04
steps:
# Setup repo
@@ -27,3 +27,6 @@ jobs:

# Clippy (linter)
- run: cargo clippy --all --all-targets -- -D warnings

# Run tests
- run: cargo test --all
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

128 changes: 106 additions & 22 deletions README.md
@@ -1,6 +1,12 @@
# Rust Verbio SpeechCenter Client
# Rust integration with the Verbio Speech Center cloud.

[![Lint](https://github.com/cquintana92/rust-verbio-speech-center/actions/workflows/lint.yaml/badge.svg)](https://github.com/cquintana92/rust-verbio-speech-center/actions/workflows/lint.yaml)
This repository contains a Rust based example of how to use the Verbio Technologies Speech Center cloud.

[![Build Status](https://github.com/verbio-technologies/rust-verbio-speech-center/actions/workflows/ci.yaml/badge.svg)](https://github.com/verbio-technologies/rust-verbio-speech-center/actions/workflows/ci.yaml)

[Website](https://speechcenter.verbio.com) |
[Guides](https://github.com/verbio-technologies) |
[API Docs](https://speechcenter.verbio.com/documentation/)

## How to build

@@ -16,14 +22,43 @@ It will build two binaries: `batch-client` and `cli-client`.

### CLI client

The CLI client allows you to launch a single file to the server. It also allows you to use either a grammar or a language model.
The CLI client provides two sub-commands:
* Recognition: Speech-to-Text operation, using either an ABNF Grammar or a topic for an out-of-the-box statistical model.
* Synthesis: Text-to-Speech operation with customizable audio options.

You can use the `--help` command to find out more about the client.

```
λ ./target/release/cli-client --help
cli-client 0.1.0
Speech-Center 0.1.0
Verbio Technologies S.L.

USAGE:
cli-client <SUBCOMMAND>

FLAGS:
-h, --help Prints help information
-V, --version Prints version information

SUBCOMMANDS:
help Prints this message or the help of the given subcommand(s)
recognition Run a Speech Center gRPC recognition client
synthesis Run a Speech Center gRPC synthesis client
```


#### CLI client recognition

The CLI client recognition sub-command transcribes a single audio file. It requires either an ABNF grammar or an out-of-the-box recognition topic: `GENERIC` for general conversation, `BANKING` for financial-related speech, or `TELCO` for telecommunications and technology talks.

If you wish to know more about our [Recognition Topics](https://speechcenter.verbio.com/documentation/topics) or [ABNF Grammars](https://speechcenter.verbio.com/documentation/abnf), please check out our documentation.

```
λ ./target/release/cli-client recognition --help
cli-client-recognition 0.1.0

USAGE:
cli-client [OPTIONS] --audio <audio> --language <language> --token-file <token-file> --url <url>
cli-client recognition [OPTIONS] --audio <audio> --language <language> --token-file <token-file> --url <url>

FLAGS:
-h, --help Prints help information
@@ -32,22 +67,78 @@ FLAGS:
OPTIONS:
-a, --audio <audio> Path to a .wav audio in 8kHz and PCM16 encoding to use for the recognition
-g, --grammar <grammar> Path to the ABNF grammar file to use for the recognition
-l, --language <language> Language to use for the recognition [default: en-US]
-t, --token-file <token-file> Path to the authentication token file
-l, --language <language> IETF BCP-47 Language to use for the recognition. Supported en-US | es-ES | pt-BR [default: en-US]
-t, --token-file <token-file> Path to the JWT authentication token file
-T, --topic <topic> Topic to use for the recognition. Must be GENERIC | BANKING | TELCO
-u, --url <url> The URL of the host or server trying to reach [default: https://speechcenter.verbio.com:2424]
-u, --url <url> The URL of the gRPC host or server trying to reach [default: https://speechcenter.verbio.com:2424]
```

An example execution could be:

```
λ ./target/debug/cli-client recognition -a example.wav -l en-US -t my.token -T generic
```
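The grammar-vs-topic rule above can be sketched as plain Rust. This is a minimal, illustrative sketch (the types and function name are not the crate's actual API): a grammar, when given, takes precedence over a topic, and at least one of the two must be supplied.

```rust
// Illustrative dispatch rule for the recognition sub-command:
// grammar wins over topic; neither present is an error.

#[derive(Debug, PartialEq)]
enum Mode {
    Grammar(String),
    Topic(String),
}

fn select_mode(grammar: Option<String>, topic: Option<String>) -> Result<Mode, String> {
    match (grammar, topic) {
        (Some(g), _) => Ok(Mode::Grammar(g)),
        (None, Some(t)) => Ok(Mode::Topic(t)),
        (None, None) => Err("Either grammar or topic must be defined".to_string()),
    }
}

fn main() {
    // A grammar, when present, takes precedence over a topic.
    let m = select_mode(Some("digits.gram".into()), Some("GENERIC".into()));
    println!("{:?}", m);
}
```

This mirrors the `match (opts.grammar, opts.topic)` dispatch visible in the `cli-client` diff below.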


#### CLI client synthesis

The CLI client synthesis sub-command generates speech from a text sentence. You specify the target text, a destination output file for the resulting audio, the voice, and the language, plus optional parameters such as the audio encoding, header, and sample rate.

To learn more about parameters such as [Audio Encoding](https://en.wikipedia.org/wiki/Audio_codec), [Audio Header or Format](https://en.wikipedia.org/wiki/Audio_file_format), or [Sample Rate](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Audio_sampling), follow the respective links.

**_Note_**: Not all voice-language combinations are available. The table below lists the supported combinations:

<div align="center">

|Language<br>-<br>Voice|en-US|es-ES|pt-BR|ca-CA|
|:--------------------:|:---:|:---:|:---:|:---:|
| Tommy |**X**| | | |
| Annie |**X**| | | |
| David | |**X**| |**X**|
| Aurora | |**X**| | |
| Luma | | |**X**| |

</div>
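The support table can be encoded as a simple lookup. This is an illustrative sketch (the function is not part of the client; the pairs come straight from the table above):

```rust
// Voice/language support table from the README, as a lookup.
fn is_supported(voice: &str, language: &str) -> bool {
    matches!(
        (voice, language),
        ("Tommy", "en-US")
            | ("Annie", "en-US")
            | ("David", "es-ES")
            | ("David", "ca-CA")
            | ("Aurora", "es-ES")
            | ("Luma", "pt-BR")
    )
}

fn main() {
    assert!(is_supported("David", "ca-CA"));
    assert!(!is_supported("Tommy", "es-ES"));
    println!("table checks pass");
}
```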


```
λ ./target/release/cli-client synthesis --help
cli-client-synthesis 0.1.0
Run a Speech Center gRPC synthesis client

USAGE:
cli-client synthesis --encoding <encoding> --header <header> --language <language> --output <output> --sample-rate <sample-rate> --text <text> --token-file <token-file> --url <url> --voice <voice>

FLAGS:
--help Prints help information
-V, --version Prints version information

OPTIONS:
-e, --encoding <encoding> Output audio encoding algorithm. Supported PCM (Signed 16-bit little endian PCM)
[default: PCM]
-h, --header <header> Output audio header. Supported: WAV (Wav audio header) | RAW (No header)
[default: WAV]
-l, --language <language> IETF BCP-47 Language to use for the recognition. Supported en-US | es-ES | pt-BR
| ca-CA [default: en-US]
-o, --output <output> Path to store the synthesis resulting audio
-s, --sample-rate <sample-rate> Output audio sample rate in Hz. Available 8000 [default: 8000]
-T, --text <text> Text to synthesize to audio
-t, --token-file <token-file> Path to the JWT authentication token file
-u, --url <url> The URL of the gRPC host or server trying to reach [default:
https://speechcenter.verbio.com:2424]
-v, --voice <voice> Voice to use for the synthesis. Supported Tommy | Annie | Aurora | Luma | David
```

An example execution could be:

```
λ ./target/debug/cli-client -a example.wav -l en-US -t my.token -T generic
λ ./target/debug/cli-client synthesis --text "Hello World" --voice Tommy -l en-US -t my.token -o example_tommy_en-US.wav
```
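The `--header` option above is the difference between bare PCM samples (`RAW`) and the same samples preceded by a standard 44-byte RIFF/WAVE header (`WAV`). As a sketch of what that header contains, the following hand-rolls it with the stdlib; the actual client uses the `hound` crate, and mono output is an assumption here:

```rust
// Build a standard 44-byte PCM WAV header (RIFF/WAVE layout).
fn wav_header(num_samples: u32, sample_rate: u32) -> Vec<u8> {
    let bits_per_sample = 16u16; // PCM16, matching the CLI's PCM encoding
    let channels = 1u16;         // assumption: mono audio
    let byte_rate = sample_rate * u32::from(channels) * u32::from(bits_per_sample) / 8;
    let block_align = channels * bits_per_sample / 8;
    let data_len = num_samples * u32::from(block_align);

    let mut h = Vec::with_capacity(44);
    h.extend_from_slice(b"RIFF");
    h.extend_from_slice(&(36 + data_len).to_le_bytes()); // remaining file size
    h.extend_from_slice(b"WAVE");
    h.extend_from_slice(b"fmt ");
    h.extend_from_slice(&16u32.to_le_bytes()); // fmt chunk size
    h.extend_from_slice(&1u16.to_le_bytes());  // audio format: PCM
    h.extend_from_slice(&channels.to_le_bytes());
    h.extend_from_slice(&sample_rate.to_le_bytes());
    h.extend_from_slice(&byte_rate.to_le_bytes());
    h.extend_from_slice(&block_align.to_le_bytes());
    h.extend_from_slice(&bits_per_sample.to_le_bytes());
    h.extend_from_slice(b"data");
    h.extend_from_slice(&data_len.to_le_bytes());
    h
}

fn main() {
    let h = wav_header(8000, 8000); // one second at the 8 kHz default rate
    assert_eq!(h.len(), 44);
    println!("header ok");
}
```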


### Batch client
### Batch client (Recognition Only)

The batch client iterates over wav files inside a directory, sends them in parallel to the server and stores the transcription in another directory.
The batch client iterates over wav files inside a directory, sends them in parallel to the server and stores the transcription in the specified folder.

```
λ ./target/release/batch-client --help
@@ -62,24 +153,17 @@ FLAGS:

OPTIONS:
-D, --dest-dir <dest-dir> Destination directory for the transcriptions
-l, --language <language> Language to use for the recognition [default: en-US]
-l, --language <language> IETF BCP-47 Language to use for the recognition. Supported en-US | es-ES | pt-BR [default: en-US]
-L, --log-level <log-level> Log level. Must be TRACE | DEBUG | INFO | WARN | ERROR [default: info]
-d, --dir <source-dir> Directory containing .wav audios in 8kHz and PCM16 encoding to use for the recognition
-t, --token-file <token-file> Path to the authentication token file
-t, --token-file <token-file> Path to the JWT authentication token file
-T, --topic <topic> Topic to use for the recognition. Must be GENERIC | BANKING | TELCO
-u, --url <url> The URL of the host or server trying to reach [default: https://speechcenter.verbio.com:2424]
-u, --url <url> The URL of the gRPC host or server trying to reach [default: https://speechcenter.verbio.com:2424]
-w, --workers <workers> Number of workers to use for the recognition [default: 4]
```

An example execution could be:

```
λ ./target/release/batch-client -w 4 -d ~/tmp/commonvoice/clips -D /tmp/results -t my.token -T generic --log-level debug
```
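The `--workers` fan-out can be sketched as a small worker pool: N workers pull file paths from a shared channel and each produces a transcription. The real client uses `async_channel` with a gRPC `RecognitionClient` (see the `worker.rs` diff below); here std threads and a placeholder transcribe function stand in:

```rust
// Sketch of the batch client's worker-pool pattern using the stdlib.
use std::sync::{mpsc, Arc, Mutex};
use std::thread;

// Placeholder for the gRPC recognition call.
fn transcribe(path: &str) -> String {
    format!("transcription of {}", path)
}

fn run_pool(files: Vec<String>, workers: usize) -> Vec<(String, String)> {
    let (tx, rx) = mpsc::channel::<String>();
    let rx = Arc::new(Mutex::new(rx));
    let (out_tx, out_rx) = mpsc::channel();

    let handles: Vec<_> = (0..workers)
        .map(|_| {
            let rx = Arc::clone(&rx);
            let out = out_tx.clone();
            thread::spawn(move || loop {
                // Take the lock only long enough to receive one path.
                let msg = rx.lock().unwrap().recv();
                match msg {
                    Ok(path) => out.send((path.clone(), transcribe(&path))).unwrap(),
                    Err(_) => break, // channel closed and drained
                }
            })
        })
        .collect();
    drop(out_tx);

    for f in files {
        tx.send(f).unwrap();
    }
    drop(tx); // closing the channel lets the workers exit

    for h in handles {
        h.join().unwrap();
    }
    out_rx.into_iter().collect()
}

fn main() {
    let mut res = run_pool(vec!["a.wav".into(), "b.wav".into()], 4);
    res.sort();
    println!("{:?}", res);
}
```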

## Improvements to be done

Right now, the build process generates the `speech-center-client/src/csr_grpc_gateway.rs` file. However, `tonic-build` is able to generate it into the target directory and include it via macros. Unfortunately, my IDE was not able to detect the file, so autocomplete didn't work, and I prioritized developer ergonomics over "correctness" for this PoC (sorry guys).

In order to change it, edit the `build.rs` and see the documentation of [tonic-build](https://github.com/hyperium/tonic/tree/master/tonic-build).
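For reference, the `build.rs` change described above would look roughly like this sketch (the proto path is illustrative, not taken from this repository):

```rust
// build.rs: let tonic-build emit the generated module into OUT_DIR
// instead of committing csr_grpc_gateway.rs to the source tree.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    tonic_build::compile_protos("proto/csr_grpc_gateway.proto")?;
    Ok(())
}
```

The generated module would then be pulled in with `tonic::include_proto!` using the proto package name instead of a checked-in source file.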

6 changes: 3 additions & 3 deletions batch-client/src/main.rs
@@ -24,11 +24,11 @@ struct Args {
)]
log_level: String,

/// Path to the authentication token file
/// Path to the JWT authentication token file
#[structopt(short = "t", long = "token-file", required = true)]
token_file: String,

/// The URL of the host or server trying to reach
/// The URL of the gRPC host or server trying to reach
#[structopt(
short = "u",
long = "url",
@@ -49,7 +49,7 @@ struct Args {
#[structopt(short = "D", long = "dest-dir", required = true)]
dest_dir: String,

/// Language to use for the recognition
/// IETF BCP-47 Language to use for the recognition. Supported en-US | es-ES | pt-BR
#[structopt(
short = "l",
long = "language",
6 changes: 3 additions & 3 deletions batch-client/src/worker.rs
@@ -1,5 +1,5 @@
use async_channel::{Receiver, Sender};
use speech_center_client::{Client, Result, SpeechCenterError, Topic};
use speech_center_client::{RecognitionClient, Result, SpeechCenterError, Topic};

pub enum Payload {
File {
@@ -12,13 +12,13 @@ pub enum Payload {
}

pub struct Worker {
client: Client,
client: RecognitionClient,
rx: Receiver<Payload>,
}

impl Worker {
pub async fn new(url: &str, token: &str, rx: Receiver<Payload>) -> Result<Self> {
let client = Client::new(url, token).await?;
let client = RecognitionClient::new(url, token).await?;
Ok(Self { client, rx })
}

2 changes: 2 additions & 0 deletions cli-client/Cargo.toml
@@ -6,5 +6,7 @@ edition = "2021"
[dependencies]
speech-center-client = { path = "../speech-center-client" }

bytes = "1.1.0"
hound = "3.4"
structopt = { version = "0.3", default-features = false }
tokio = { version = "1", features = ["full"] }
84 changes: 12 additions & 72 deletions cli-client/src/main.rs
@@ -1,81 +1,21 @@
use speech_center_client::{Client, Topic};
use structopt::StructOpt;

#[derive(Clone, Debug, StructOpt)]
struct Args {
/// Path to the authentication token file
#[structopt(short = "t", long = "token-file", required = true)]
token_file: String,

/// The URL of the host or server trying to reach
#[structopt(
short = "u",
long = "url",
required = true,
default_value = "https://speechcenter.verbio.com:2424"
)]
url: String,
mod recognition;
mod synthesis;

/// Topic to use for the recognision. Must be GENERIC | BANKING | TELCO
#[structopt(short = "T", long = "topic")]
topic: Option<String>,

/// Path to the ABNF grammar file to use for the recognition
#[structopt(short = "g", long = "grammar")]
grammar: Option<String>,
use structopt::StructOpt;

/// Path to a .wav audio in 8kHz and PCM16 encoding to use for the recognition
#[structopt(short = "a", long = "audio", required = true)]
audio: String,
const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Language to use for the recognition
#[structopt(
short = "l",
long = "language",
required = true,
default_value = "en-US"
)]
language: String,
#[derive(StructOpt)]
#[structopt(name = "Speech-Center", author = "Verbio Technologies S.L.", version = VERSION)]
enum Args {
Recognition(recognition::Recognition),
Synthesis(synthesis::Synthesis),
}

#[tokio::main]
async fn main() {
let opts = Args::from_args();

let token = std::fs::read_to_string(&opts.token_file).expect("Error reading token from file");
let token = token.trim().to_string();
if token.is_empty() {
panic!("Token cannot be empty");
}

let audio = std::fs::read(&opts.audio).expect("Error reading audio file");
if audio.is_empty() {
panic!("Audio cannot be empty");
}

let mut client = Client::new(&opts.url, &token)
.await
.expect("Error creating client");

match (opts.grammar, opts.topic) {
(Some(grammar), _) => {
let grammar = std::fs::read_to_string(&grammar).expect("Error reading grammar file");
let res = client
.recognise_with_grammar(&grammar, &opts.language, audio)
.await
.expect("Error in recognision");
println!("Res: {}", res);
}
(_, Some(topic)) => {
let topic = Topic::from_name(&topic).expect("Error converting topic");
let res = client
.recognise_with_topic(&opts.language, topic, audio)
.await
.expect("Error in recognision");
println!("Res: {}", res);
}
_ => {
panic!("Either grammar or topic must be defined");
}
match Args::from_args() {
Args::Recognition(c) => recognition::process_subcommand(c).await,
Args::Synthesis(c) => synthesis::process_subcommand(c).await,
}
}