Permalink
Browse files

better handling of small files

  • Loading branch information...
1 parent 8d06f9e commit 03adbde6d6c5fb3dcce42bca81386669b89b6e70 @taf2 committed Apr 5, 2011
View
@@ -3,5 +3,10 @@
require 'speech'
+if ARGV[0].nil? || !File.exist?(ARGV[0])
+ STDERR.puts "usage: #{$0} input.wav"
+ exit(1)
+end
+
captured_json = Speech::AudioToText.new(ARGV[0]).to_text
puts captured_json.inspect
@@ -14,7 +14,7 @@ def initialize(duration_str)
def to_s
s,f = seconds.split('.')
- sprintf "%.2d:%.2d:%.2d:%.2d", self.hours.to_s.gsub(/^0/,'').to_i, self.minutes.to_s.gsub(/^0/,'').to_i, s.to_s.gsub(/^0/,'').to_i, (f||0)
+ sprintf "%.2d:%.2d:%.2d:%.2d", self.hours.to_i, self.minutes.to_i, s.to_i, (f||0).to_i
#"#{hours}:#{minutes}:#{seconds}:#{f}"
end
@@ -5,22 +5,33 @@ class AudioSplitter
attr_accessor :original_file, :size, :duration, :chunks
class AudioChunk
- attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate
+ attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate, :copied
def initialize(splitter, offset, duration)
self.offset = offset
self.chunk = File.join(File.dirname(splitter.original_file), "chunk-" + File.basename(splitter.original_file).gsub(/\.(.*)$/, "-#{offset}" + '.\1'))
self.duration = duration
self.splitter = splitter
+ self.copied = false
+ end
+
+ def self.copy(splitter)
+ chunk = AudioChunk.new(splitter, 0, splitter.duration.to_f)
+ chunk.copied = true
+ system("cp #{splitter.original_file} #{chunk.chunk}")
+ chunk
end
# given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
def build
+ return self if self.copied
# ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
offset_ts = AudioInspector::Duration.from_seconds(self.offset)
duration_ts = AudioInspector::Duration.from_seconds(self.duration)
+ # NOTE: kind of a hack, but if the original source is less than or equal to 1 second, we should skip ffmpeg
+ puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
#puts "offset: #{ offset_ts.to_s }, duration: #{duration_ts.to_s}"
- cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk} >/dev/null 2>&1"
+ cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk}"# >/dev/null 2>&1"
if system(cmd)
self
else
@@ -30,7 +41,9 @@ def build
# convert the audio file to flac format
def to_flac
- if system("flac #{chunk} >/dev/null 2>&1")
+ puts "convert: #{chunk} to flac"
+ if system("flac #{chunk}")
+ puts "success?"
self.flac_chunk = chunk.gsub(File.extname(chunk), ".flac")
# convert the audio file to 16K
self.flac_rate = `ffmpeg -i #{self.flac_chunk} 2>&1`.strip.scan(/Audio: flac, (.*) Hz/).first.first.strip
@@ -42,6 +55,8 @@ def to_flac
raise "failed to convert to lower audio rate"
end
+ else
+ raise "failed to convert chunk: #{chunk} with flac #{chunk}"
end
end
@@ -75,10 +90,11 @@ def split
end
if chunks.empty?
- chunks << AudioChunk.new(self, 0, self.duration.to_f)
+ chunks << AudioChunk.copy(self)#, 0, self.duration.to_f)
else
chunks << AudioChunk.new(self, chunks.last.offset.to_i + chunks.last.duration.to_i, self.size + last_chunk)
end
+ puts "Chunk count: #{chunks.size}"
chunks
end
@@ -2,17 +2,16 @@
module Speech
class AudioToText
- attr_accessor :file, :rate, :captured_json, :confidence, :captured_file
+ attr_accessor :file, :rate, :captured_json, :captured_file
def initialize(file)
self.file = file
self.captured_file = self.file.gsub(/\.wav$/,'.json')
- self.captured_json = []
- self.confidence = 0.0
+ self.captured_json = {}
end
def to_text
- url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US&maxresults=1"
+ url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US&maxresults=10"
splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
easy = Curl::Easy.new(url)
splitter.split.each do|chunk|
@@ -31,10 +30,12 @@ def clean
def convert_chunk(easy, chunk, options={})
puts "sending chunk of size #{chunk.duration}..."
retrying = true
- while retrying
+ retry_count = 0
+ while retrying && retry_count < 5
#easy.verbose = true
easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
easy.headers['User-Agent'] = "https://github.com/taf2/speech2text"
+ #puts chunk.inspect
easy.post_body = "Content=#{chunk.to_flac_bytes}"
easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
easy.on_complete {|easy| puts }
@@ -44,25 +45,16 @@ def convert_chunk(easy, chunk, options={})
if easy.response_code == 500
puts "500 from google retry after 0.5 seconds"
retrying = true
- sleep 0.5 # wait longer on error?
+ retry_count += 1
+ sleep 0.5 # wait longer on error?, google??
else
# {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
data = JSON.parse(easy.body_str)
- puts data.inspect
- data['hypotheses'].each {|utterance|
- puts utterance.inspect
- self.captured_json << [utterance['utterance'], utterance['confidence']]
- self.confidence += utterance['confidence']
- }
- File.open("#{self.captured_file}", "wb") {|f|
- size = self.captured_json.size
- if size > 0
- confidence_calc = self.confidence / size
- else
- confidence_calc = 0
- end
- f << {:captured_json => captured_json, :confidence => confidence_calc}.to_json
- }
+ self.captured_json['status'] = data['status']
+ self.captured_json['id'] = data['id']
+ self.captured_json['hypotheses'] = data['hypotheses'].map {|ut| [ut['utterance'], ut['confidence']] }
+ puts self.captured_json.inspect
+ File.open("#{self.captured_file}", "wb") {|f| f << captured_json.to_json }
retrying = false
end
sleep 0.1 # not too fast there tiger
@@ -1,6 +1,6 @@
# -*- encoding: binary -*-
module Speech
class Info
- VERSION='0.3.1'
+ VERSION='0.3.2'
end
end
Binary file not shown.
@@ -18,4 +18,22 @@ def test_audio_to_text
ensure
audio.clean
end
+
+ def test_short_audio_clip
+ audio = Speech::AudioToText.new("samples/i-like-pickles.chunk5.wav")
+ captured_json = audio.to_text
+ assert captured_json
+ assert captured_json.key?("hypotheses")
+ assert !captured_json['hypotheses'].empty?
+ #{"status"=>0, "id"=>"552de5ba35bb769ce3493ff113e158a8-1", "hypotheses"=>[["eagles", 0.7214844], ["pickles", nil], ["michaels", nil], ["giggles", nil], ["tickles", nil]]}
+ assert captured_json.keys.include?('status')
+ assert captured_json.keys.include?('id')
+ assert captured_json.keys.include?('hypotheses')
+ puts captured_json.inspect
+ assert_equal "eagles", captured_json['hypotheses'][0].first
+ assert_equal "pickles", captured_json['hypotheses'][1].first
+ #assert captured_json['confidence'] > 0.9
+ ensure
+ audio.clean
+ end
end

0 comments on commit 03adbde

Please sign in to comment.