In [1]:
# coding: utf-8

import tensorflow as tf
import numpy as np
import scipy.io.wavfile as wav
import os
import sys
sys.path.append("./DeepSpeech")
import DeepSpeech
from text import ctc_label_dense_to_sparse
batch_size = 1
toks = " abcdefghijklmnopqrstuvwxyz'-"
import IPython.display as ipd
sr = 16000 # sample rate 

tf.load_op_library = lambda x: x
tmp = os.path.exists
os.path.exists = lambda x: True
os.path.exists = tmp
import DeepSpeech

def compute_mfcc(audio, **kwargs):
    """
    Compute the MFCC for a given audio waveform. This is
    identical to how DeepSpeech does it, but does it all in
    TensorFlow so that we can differentiate through it.
    """

    batch_size, size = audio.get_shape().as_list()
    audio = tf.cast(audio, tf.float32)

    # 1. Pre-emphasizer, a high-pass filter
    audio = tf.concat((audio[:, :1], audio[:, 1:] - 0.97*audio[:, :-1], np.zeros((batch_size,1000),dtype=np.float32)), 1)

    # 2. windowing into frames of 320 samples, overlapping
    windowed = tf.stack([audio[:, i:i+400] for i in range(0,size-320,160)],1) # Single Frame Length has the most effect on transferability across models

    # 3. Take the FFT to convert to frequency space
    ffted = tf.spectral.rfft(windowed, [512])
    ffted = 1.0 / 512 * tf.square(tf.abs(ffted))

    
    # block stop gradient
    feat=ffted
    FEATURE=tf.stop_gradient(feat[:,:,0:224])
    print(FEATURE.get_shape())
    FEATURE1=feat[:,:,224:]
    print(FEATURE1.get_shape())
    FEATURE=tf.concat((FEATURE,FEATURE1),axis=2)
    print(FEATURE)
    ffted=FEATURE
    #
    
    
    # 4. Compute the Mel windowing of the FFT
    energy = tf.reduce_sum(ffted,axis=2)+1e-30
    filters = np.load("filterbanks.npy").T
    feat = tf.matmul(ffted, np.array([filters]*batch_size,dtype=np.float32))+1e-30

    # 5. Take the DCT again, because why not
    feat = tf.log(feat)
    feat = tf.spectral.dct(feat, type=2, norm='ortho')[:,:,:26]

    # 6. Amplify high frequencies for some reason
    _,nframes,ncoeff = feat.get_shape().as_list()
    n = np.arange(ncoeff)
    lift = 1 + (22/2.)*np.sin(np.pi*n/22)
    feat = lift*feat
    width = feat.get_shape().as_list()[1]

    # 7. And now stick the energy next to the features
    feat = tf.concat((tf.reshape(tf.log(energy),(-1,width,1)), feat[:, :, 1:]), axis=2)
    
    return feat

def get_logits(new_input, length, first=[]):
    """
    Compute the logits for a given waveform.

    First, preprocess with the TF version of MFC above,
    and then call DeepSpeech on the features.
    """

    # We need to init DeepSpeech the first time we're called
    if first == []:
        first.append(False)
        # Okay, so this is ugly again.
        # We just want it to not crash.
        tf.app.flags.FLAGS.alphabet_config_path = "./DeepSpeech/data/alphabet.txt"
        DeepSpeech.initialize_globals()

    batch_size = new_input.get_shape()[0]

    # 1. Compute the MFCCs for the input audio
    # (this is differentable with our implementation above)
    empty_context = np.zeros((batch_size, 9, 26), dtype=np.float32)
    new_input_to_mfcc = compute_mfcc(new_input)[:, ::2]
    features = tf.concat((empty_context, new_input_to_mfcc, empty_context), 1)

    # 2. We get to see 9 frames at a time to make our decision,
    # so concatenate them together.
    features = tf.reshape(features, [new_input.get_shape()[0], -1])
    features = tf.stack([features[:, i:i+19*26] for i in range(0,features.shape[1]-19*26+1,26)],1)
    features = tf.reshape(features, [batch_size, -1, 19*26])

    # 3. Whiten the data
    mean, var = tf.nn.moments(features, axes=[0,1,2])
    features = (features-mean)/(var**.5)

    # 4. Finally we process it with DeepSpeech
    logits = DeepSpeech.BiRNN(features, length, [0]*10)

    return logits

In [2]:
input = "/home/xiaoyu/MFCC/audio_adversarial_examples_mfc/commonvoice_subset/sample-000002.wav"
audios = [] 
fs, audio = wav.read(input)
audios.append(audio)
audios = np.array(audios)
length = len(audio)
print(audios)
print(length, audios.shape)

lengths = []
lengths.append(length)
lengths = np.array((np.array(lengths)-1)//320)#320
print(lengths)
phrase = "okay google"

[[   0    0    0 ... -139   19   20]]
30720 (1, 30720)
[95]


In [3]:
input = tf.placeholder(tf.float32, shape=[batch_size, length])
feature = compute_mfcc(input)

# sess = tf.Session()
# ft = sess.run(feature, feed_dict={input:audios})
# print(ft.shape)

(1, 190, 224)
(1, 190, 33)
Tensor("concat_1:0", shape=(1, 190, 257), dtype=float32)


In [4]:

def test_output(feed_dict):
    logits_results = sess.run(logits, feed_dict=feed_dict)
    out = sess.run(decoded, feed_dict=feed_dict)
    # print(logits_results)
    print(logits_results.shape)
    print(out[0].values.shape)
    
    toks = " abcdefghijklmnopqrstuvwxyz'-"
    res = np.zeros(out[0].dense_shape)+len(toks)-1
    for ii in range(len(out[0].values)):
        w,y = out[0].indices[ii]
        res[w,y] = out[0].values[ii]
    
    final_logits = np.reshape(logits_results,(lengths[0], 29))
    for i in range(lengths[0]):
        print(toks[np.argmax(final_logits[i])], end='')

    # Here we print the strings that are recognized.
    res = ["".join(toks[int(w)] for w in y) for y in res]
    print("\n".join(res))
    return str(res)[2:-2]

In [5]:
logits = get_logits(input, lengths)
decoded, _ = tf.nn.ctc_beam_search_decoder(logits, lengths, merge_repeated=False, beam_width=1000)

feed_dict={input:audios}

# with tf.Session() as sess:
#     sess.run(tf.variables_initializer(tf.global_variables()))
#     saver = tf.train.Saver([w for w in tf.global_variables() if 'qq' not in w.name])
#     saver.restore(sess, "./models/session_dump")
#     test_output(feed_dict)

(1, 190, 224)
(1, 190, 33)
Tensor("concat_4:0", shape=(1, 190, 257), dtype=float32)


In [6]:
gradients = tf.sign(tf.gradients(feature, input)[0])
print(gradients.shape)
print(audios.shape)

(1, 30720)
(1, 30720)


In [7]:
new_audios = audios

step_length = 1000
max_perturbation = 500

with tf.Session() as sess:
    sess.run(tf.variables_initializer(tf.global_variables()))
    saver = tf.train.Saver([w for w in tf.global_variables() if 'qq' not in w.name])
    saver.restore(sess, "./models/session_dump")
    for i in range(5):
        feed_dict={input:new_audios}
        perturbations = sess.run(gradients, feed_dict=feed_dict)
        new_audios = new_audios + step_length * perturbations
        new_audios = np.clip(new_audios, audios - max_perturbation, audios + max_perturbation)
        print("Attack Finished, iter %d, step_length: %4d" % (i, step_length))

        test_output(feed_dict)


INFO:tensorflow:Restoring parameters from ./models/session_dump
Attack Finished, iter 0, step_length: 1000
(95, 1, 29)
(15,)
-----------------------------------a-ndd   yoou----  kknnow----  --itt-------------------------and you know it
Attack Finished, iter 1, step_length: 1000
(95, 1, 29)
(13,)
----------------------------------aa-ndd  yyouu----   knnow---------t--------------------------and you knowt
Attack Finished, iter 2, step_length: 1000
(95, 1, 29)
(13,)
----------------------------------aa-ndd  yy-uu-----  knnow---------t--------------------------and yeu knowt
Attack Finished, iter 3, step_length: 1000
(95, 1, 29)
(14,)
-----------------------------------a-ndd  ye-uu-----  kknow--------it--------------------------and yeu knowit
Attack Finished, iter 4, step_length: 1000
(95, 1, 29)
(14,)
-----------------------------------a-ndd  he--------  kknow--------it--------------------------and heu knowit


In [8]:
print("Original Audio")
ipd.Audio(audios, rate=sr) # load a NumPy array

Original Audio


In [9]:
print("New Audio")
ipd.Audio(new_audios, rate=sr) # load a NumPy array

New Audio
