-
Notifications
You must be signed in to change notification settings - Fork 41
/
speech_recognizer.rb
156 lines (127 loc) · 4.14 KB
/
speech_recognizer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
module Pocketsphinx
  # Reads audio data from a recordable interface and decodes it into utterances.
  #
  # Orchestrates the interaction between a Recordable and a Decoder, and detects new utterances.
  class SpeechRecognizer
    # The recordable interface must implement #start_recording, #stop_recording and #read_audio.
    # It may additionally implement #read_audio_delay, which is consulted after each read.
    attr_writer :recordable
    attr_writer :decoder
    attr_writer :configuration

    # Supported recognition algorithms; the first entry is the default (see #algorithm).
    ALGORITHMS = [:after_speech, :continuous]

    # @param [Configuration] configuration Optional configuration; when nil,
    #   Configuration.default is used lazily (see #configuration).
    def initialize(configuration = nil)
      @configuration = configuration
    end

    # @return [Object] the recordable interface
    # @raise [RuntimeError] if no recordable has been assigned
    def recordable
      @recordable || raise("A SpeechRecognizer must have a recordable interface")
    end

    # @return [Decoder] the assigned decoder, or a lazily built one using #configuration
    def decoder
      @decoder ||= Decoder.new(configuration)
    end

    # @return [Configuration] the assigned configuration, or Configuration.default
    def configuration
      @configuration ||= Configuration.default
    end

    # Reinitialize the decoder with updated configuration.
    #
    # See Decoder#reconfigure
    #
    # @param [Configuration] configuration An optional new configuration to use. If this is
    #   nil, the previous configuration will be reloaded, with any changes applied.
    # @return [Boolean] whether recognition was running (and therefore paused) during the call
    def reconfigure(configuration = nil)
      self.configuration = configuration if configuration

      # The decoder receives the argument as given (possibly nil), not the memoized configuration.
      pause { decoder.reconfigure(configuration) }
    end

    # Recognize speech and yield hypotheses until the recordable stops returning samples.
    #
    # Starts recognition if it is not already running, and stops it again on the way out.
    #
    # @param [Integer] max_samples Number of samples to process at a time
    # @yield [hypothesis] each hypothesis produced by the selected algorithm
    # @raise [NotImplementedError] if #algorithm is not one of ALGORITHMS
    def recognize(max_samples = 2048, &block)
      unless ALGORITHMS.include?(algorithm)
        raise NotImplementedError, "Unknown speech recognition algorithm: #{algorithm}"
      end

      start unless recognizing?

      FFI::MemoryPointer.new(:int16, max_samples) do |buffer|
        loop do
          # The algorithm is looked up on every pass, so a configuration change takes effect
          # mid-stream. A falsy result means no more audio was read.
          break unless send("recognize_#{algorithm}", max_samples, buffer, &block)
        end
      end
    ensure
      # Only tear down what was actually started. Stopping unconditionally would end an
      # utterance that was never begun (e.g. after the NotImplementedError above, or when
      # #start itself failed), and any error raised by that cleanup would replace the
      # original exception.
      stop if recognizing?
    end

    # @return [Boolean] whether the decoder currently reports speech
    def in_speech?
      # Use Pocketsphinx's implementation by default
      decoder.in_speech?
    end

    # @return [Boolean] true between #start and #stop
    def recognizing?
      @recognizing == true
    end

    # Run the given block with recognition stopped, resuming afterwards if it had been running.
    #
    # If the block raises, recognition is not resumed.
    #
    # @return [Boolean] whether recognition was running when #pause was called
    def pause
      was_recognizing = recognizing?

      stop if was_recognizing
      yield
      start if was_recognizing

      was_recognizing
    end

    # Begin recording and open a new utterance on the decoder.
    def start
      recordable.start_recording
      decoder.start_utterance
      @recognizing = true
    end

    # Close the current utterance and stop recording.
    def stop
      decoder.end_utterance
      recordable.stop_recording
      @recognizing = false
    end

    # Determine which algorithm to use for co-ordinating speech recognition
    #
    # @return [Symbol] :continuous or :after_speech
    #   :continuous yields as soon as any hypothesis is available
    #   :after_speech yields hypothesis on speech -> silence transition if one exists
    #   Default is :after_speech
    def algorithm
      return ALGORITHMS.first unless configuration.respond_to?(:recognition_algorithm)

      configuration.recognition_algorithm
    end

    private

    # Yields as soon as any hypothesis is available.
    #
    # @return the sample count from #process_audio (falsy when no audio was read)
    def recognize_continuous(max_samples, buffer)
      sample_count = process_audio(buffer, max_samples)
      hypothesis = decoder.hypothesis

      if hypothesis
        # Close the utterance before handing the hypothesis out, then open a fresh one.
        decoder.end_utterance
        yield hypothesis
        decoder.start_utterance
      end

      sample_count
    end

    # Splits speech into utterances by detecting silence between them.
    # By default this uses Pocketsphinx's internal Voice Activity Detection (VAD) which can be
    # configured by adjusting the `vad_postspeech`, `vad_prespeech`, and `vad_threshold` settings.
    #
    # @return the sample count from the final #process_audio call (falsy when no audio was read)
    def recognize_after_speech(max_samples, buffer)
      if in_speech?
        # Keep feeding audio until speech ends or the recordable runs dry.
        while in_speech?
          break unless process_audio(buffer, max_samples)
        end

        decoder.end_utterance
        hypothesis = decoder.hypothesis
        yield hypothesis if hypothesis
        decoder.start_utterance
      end

      process_audio(buffer, max_samples)
    end

    # Read one chunk from the recordable and feed it to the decoder.
    #
    # @return the value returned by recordable.read_audio; falsy means nothing was read
    def process_audio(buffer, max_samples)
      sample_count = recordable.read_audio(buffer, max_samples)
      return sample_count unless sample_count

      decoder.process_raw(buffer, sample_count)

      # Check for a delay for example in case of non-blocking live audio
      if recordable.respond_to?(:read_audio_delay)
        sleep recordable.read_audio_delay(max_samples)
      end

      sample_count
    end
  end
end