Permalink
Browse files

Documentation updates. Released as version 0.2

  • Loading branch information...
1 parent 06afdeb commit e0af9f63883124d3703dd81ef58abab9f99f00a1 @zaf committed Jan 4, 2012
Showing with 62 additions and 31 deletions.
  1. +6 −0 ChangeLog
  2. +32 −23 README
  3. +11 −2 samples/speech-recog-cli.pl
  4. +13 −6 speech-recog.agi
View
@@ -1,2 +1,8 @@
+2012-01-04 Lefteris Zafiris <zaf.000@gmail.com> - 0.2
+ Added option for setting the speech language.
+ More debugging messages added.
+ Recording timeout set to 15 seconds.
+ Documentation updates and dialplan examples.
+
2011-12-29 Lefteris Zafiris <zaf.000@gmail.com> - 0.1
Initial release 0.1
View
@@ -1,9 +1,9 @@
-=====================================================
- Google Speech recognition script for Asterisk
-=====================================================
+==============================================
+ Speech recognition script for Asterisk
+==============================================
This script makes use of Google's speech recognition engine
-in order to redner speech to text and return it back to the user
+in order to render speech to text and return it back to the dialplan
as an asterisk channel variable.
------------
@@ -25,38 +25,47 @@ To make sure check your /etc/asterisk/asterisk.conf file
Usage
-----
agi(speech-recog.agi,[lang])
-Records from the current channel. Pressing # terminates the recording and
-returns the generated text string as the value of the variable utterance.
+Records from the current channel until the pound key (#) is pressed or the
+timeout (15 seconds) is reached. The recording is sent over to Google's speech
+recognition service and the returned text string is assigned as the value
+of the channel variable 'utterance'.
+The script sets the following channel variables:
+status : Return status. 0 means success, non-zero values indicate different errors.
+id : Some id string that Google's engine returns, not very useful(?).
+utterance : The generated text string.
+confidence : A value between 0 and 1 indicating the probability of a correct recognition.
+ Values bigger than 0.95 usually mean that the resulting text is correct.
--------
Examples
--------
sample dialplan code for your extensions.conf
;Simple speech recognition
-exten => 123,1,Answer()
-exten => 123,n,agi(speech-recog.agi,en-US)
-exten => 123,n,Noop(== The text you just said was: ${utterance} ==)
-exten => 123,n,Hangup()
+exten => 1234,1,Answer()
+exten => 1234,n,agi(speech-recog.agi,en-US)
+exten => 1234,n,Noop(== The text you just said was: ${utterance} ==)
+exten => 1234,n,Noop(== The probability to be right is: ${confidence} ==)
+exten => 1234,n,Hangup()
;Speech recognition demo also using googletts.agi for text to speech synthesis:
-exten => 124,1,Answer()
-exten => 124,n,agi(googletts.agi,"Please say something in English. When done press the pound key.",en)
-exten => 124,n(record),agi(speech-recog.agi,en-US)
-exten => 124,n,Noop(== Script returned: ${status} , ${id} , ${confidence} , ${utterance} ==)
-exten => 124,n,GotoIf($["${status}" = "0"]?success:fail)
+exten => 1235,1,Answer()
+exten => 1235,n,agi(googletts.agi,"Please say something in English. When done press the pound key.",en)
+exten => 1235,n(record),agi(speech-recog.agi,en-US)
+exten => 1235,n,Noop(== Script returned: ${status} , ${id} , ${confidence} , ${utterance} ==)
+exten => 1235,n,GotoIf($["${status}" = "0"]?success:fail)
-exten => 124,n(success),GotoIf($["${confidence}" > "0.9"]?playback:retry)
+exten => 1235,n(success),GotoIf($["${confidence}" > "0.9"]?playback:retry)
-exten => 124,n(retry),agi(googletts.agi,"I'm not feeling confident enough with the result, can you please repeat more clearly?",en)
-exten => 124,n,goto(record)
+exten => 1235,n(playback),agi(googletts.agi,"The text you just said was...",en)
+exten => 1235,n,agi(googletts.agi,"${utterance}",en)
+exten => 1235,n,goto(end)
-exten => 124,n(playback),agi(googletts.agi,"The text you just said was...",en)
-exten => 124,n,agi(googletts.agi,"${utterance}",en)
-exten => 124,n,goto(end)
+exten => 1235,n(retry),agi(googletts.agi,"Can you please repeat more clearly?",en)
+exten => 1235,n,goto(record)
-exten => 124,n(fail),agi(googletts.agi,"Failed to get speech data.",en)
-exten => 124,n(end),Hangup()
+exten => 1235,n(fail),agi(googletts.agi,"Failed to get speech data.",en)
+exten => 1235,n(end),Hangup()
-------
License
@@ -3,12 +3,20 @@
#
# Render speech to text using Google's speech recognition engine.
#
-# Copyright (C) 2011, Lefteris Zafiris <zaf.000@gmail.com>
+# Copyright (C) 2011 - 2012, Lefteris Zafiris <zaf.000@gmail.com>
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
# at the top of the source tree.
#
+# The script sets the following values:
+# status : Return status. 0 means success, non-zero values indicate different errors.
+# id : Some id string that Google's engine returns, not very useful(?).
+# utterance : The generated text string.
+# confidence : A value between 0 and 1 indicating how 'confident' the recognition engine
+# feels about the result. Values bigger than 0.95 usually mean that the
+# resulting text is correct.
+#
use strict;
use warnings;
@@ -25,6 +33,7 @@
#my $filetype = "x-speex-with-header-byte";
my $filetype = "x-flac";
my $language = "en-US";
+my $results = 1;
my @file_list = @ARGV;
foreach my $file (@file_list) {
@@ -36,7 +45,7 @@
$ua->agent("Mozilla/5.0 (X11; Linux) AppleWebKit/535.2 (KHTML, like Gecko)");
$ua->timeout(20);
my $response = $ua->post(
- "$url&lang=$language",
+ "$url&lang=$language&maxresults=$results",
Content_Type => "audio/$filetype; rate=$samplerate",
Content => "$audio",
);
View
@@ -3,7 +3,7 @@
#
# AGI script that renders speech to text using Google's speech recognition engine.
#
-# Copyright (C) 2011, Lefteris Zafiris <zaf.000@gmail.com>
+# Copyright (C) 2011 - 2012, Lefteris Zafiris <zaf.000@gmail.com>
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
@@ -13,8 +13,17 @@
# Usage
# -----
# agi(speech-recog.agi,[lang])
-# Records from the current channel. Pressing # terminates the recording and
-# returns the generated text string as the value of the variable utterance.
+# Records from the current channel until the pound key (#) is pressed or the
+# timeout (15 seconds) is reached. The recording is sent over to Google's speech
+# recognition service and the returned text string is assigned as the value
+# of the channel variable 'utterance'.
+# The script sets the following channel variables:
+# status : Return status. 0 means success, non-zero values indicate different errors.
+# id : Some id string that Google's engine returns, not very useful(?).
+# utterance : The generated text string.
+# confidence : A value between 0 and 1 indicating how 'confident' the recognition engine
+# feels about the result. Values bigger than 0.95 usually mean that the
+# resulting text is correct.
#
use warnings;
@@ -132,9 +141,7 @@ die "$name Unable to get speech data.\n" if (!$uaresponse->is_success);
if ($uaresponse->content =~ /^\{"status":(\d*),"id":"(.*)","hypotheses":\[(.*)\]\}$/) {
$response{status} = "$1";
$response{id} = "$2";
- if ($response{status} == 5) {
- die "Error reading audio file\n";
- }
+ print STDERR "Error reading audio file\n" if ($response{status} == 5);
if ($3 =~ /^\{"utterance":"(.*)","confidence":(.*)\}/) {
$response{utterance} = "$1";
$response{confidence} = "$2";

0 comments on commit e0af9f6

Please sign in to comment.