In [None]:
%env THEANO_FLAGS="device=gpu4"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from PIL import Image

from librosa import load, logamplitude
from librosa.feature import melspectrogram

In [3]:
SOUND_SHAPE = (500, 513)

In [None]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in pandas 1.0.
# read_csv with these arguments reproduces its defaults: the first column
# becomes the index and dates are parsed.
df = pd.read_csv("pronuns.csv", index_col=0, parse_dates=True)

In [4]:
def get_spectrogram(img_name):
    """Load a pre-rendered spectrogram image as a numpy array.

    :param img_name: name of the image, without the ``.png`` extension,
        inside ``../../data/images``.
    :return: ``np.ndarray`` holding the decoded image data.
    """
    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as the pixels have been copied into the array.
    with Image.open("../../data/images/{}.png".format(img_name)) as img:
        return np.array(img)

# Earlier variant, kept for reference: computes the spectrogram from the audio.
# def get_spectrogram(path):
#     """Build a spectrogram from a wav file"""
#     y, sr = load("mongodb/sounds/%s.wav" % path)
#     S = melspectrogram(y, sr=sr, n_mels=100)
#     log_S = logamplitude(S, ref_power=np.max)
#     return log_S

In [None]:
plt.imshow(get_spectrogram("585fda10698f828c848d862d"))
get_spectrogram("585fda10698f828c848d862d").shape

In [None]:
np.min(get_spectrogram("585fda10698f828c848d862d"))

In [None]:
df.head()

In [None]:
import os  # not imported anywhere else in the notebook

# Smoke-check that a spectrogram can be loaded for every recording id.
# get_spectrogram() builds the image path itself, so it receives the bare id
# rather than a path to the wav file.
sound_ids = {os.path.splitext(file_name)[0] for file_name in os.listdir("mongodb/sounds")}
for _id in tqdm(sound_ids):
    get_spectrogram(_id)

In [None]:
# Map every user to the array of their pronunciation ids.  One groupby pass
# replaces a full-frame boolean filter per user; sort=False keeps the users in
# order of first appearance, as df.user.unique() did.
final_train = {
    user: pronun_ids.values
    for user, pronun_ids in tqdm(df.groupby("user", sort=False).pronun_id)
}

In [None]:
from scipy.sparse import bsr_matrix as sp_matrix

In [5]:
def as_matrix(ar, shape, scale=200.):
    """Stack 2-D arrays into one zero-padded, rescaled ``float16`` batch.

    Every array is cut or zero-padded along its first axis to ``shape[0]``
    rows and divided by ``scale``.  The second axis is neither cut nor padded.

    :param ar: sequence of 2-D numpy arrays.
    :param shape: ``(rows, columns)`` of a single slice of the result.
    :param scale: divisor applied to every value; the default is the constant
        200 that used to be hard-coded.
    :return: array of shape ``(len(ar), shape[0], shape[1])``, dtype float16.
    :raises IndexError: if an element cannot be indexed as a 2-D array; the
        message names the offending element and its shape.
    """
    ret_mat = np.zeros((len(ar), shape[0], shape[1]), dtype="float16")
    for i, vec in enumerate(ar):
        try:
            if vec.shape[0] > shape[0]:
                # Too long: keep only the leading rows.
                ret_mat[i, :, :] = vec[:shape[0], :] / scale
            else:
                # Short enough: rows past the end of the input stay zero.
                ret_mat[i, :vec.shape[0], :] = vec / scale
        except IndexError as err:
            # Same exception type as before, but the shape now travels with
            # the exception instead of being printed before an empty one.
            raise IndexError(
                "element {} has shape {} and cannot be placed into {}".format(
                    i, vec.shape, tuple(shape))
            ) from err

    return ret_mat

In [6]:
import dill
from numpy.random import choice

class VoicesData:
    """Dill-backed store that maps a speaker to that speaker's recordings.

    ``self.base`` is the object read from the dill file.  The methods below
    use it as a mapping ``speaker -> collection of recording names`` in which
    every name is something ``get_spectrogram`` accepts.
    """

    def __init__(self, path='users.dl'):
        """Load the mapping from the dill file at ``path``.

        :param path: location of the dill dump; ``save`` writes back to it.
        """
        self.path = path
        # NOTE: dill, like pickle, can run arbitrary code while loading, so
        # only point this at dumps from a trusted source.
        with open(self.path, 'rb') as dump_file:
            self.base = dill.load(dump_file)

    def __getitem__(self, item):
        """Return the recordings stored for speaker ``item``."""
        return self.base[item]

    def save(self):
        """Write the mapping back to ``self.path``.

        This is not a real database, so changes persist only when this
        method is called explicitly.
        """
        with open(self.path, 'wb') as dump_file:
            dill.dump(self.base, dump_file)

    @staticmethod
    def _pick_pair(records):
        """Draw two recordings of one speaker, distinct whenever possible."""
        records = list(records)
        # A speaker with a single recording can only be paired with itself.
        return choice(records, 2, replace=len(records) < 2)

    def get_train_vec(self, shape=SOUND_SHAPE):
        """Yield, speaker by speaker, a list of training triplets.

        Speakers with fewer than two recordings are skipped.  For the others
        one triplet is produced per two stored recordings.  Each triplet is
        the ``as_matrix`` stack of three spectrograms in the order
        ``(recording, recording of another speaker, second recording of the
        same speaker)``.

        :param shape: ``(rows, columns)`` every spectrogram is fitted to.
        :raises ValueError: if fewer than two speakers are stored, because no
            other-speaker recording can be drawn then.
        """
        people = list(self.base.keys())
        if len(people) < 2:
            raise ValueError("triplets need recordings of at least two speakers")

        for position, key in enumerate(tqdm(people)):
            records = self.base[key]
            if len(records) < 2:
                # No same-speaker pair can be formed.
                continue

            train = []
            for _ in range(0, len(records), 2):
                first, second = self._pick_pair(records)

                # The contrasting recording must not belong to the current
                # speaker: draw among the other len(people) - 1 positions and
                # step over the current one.
                other_position = np.random.randint(len(people) - 1)
                if other_position >= position:
                    other_position += 1
                other = people[other_position]
                foreign = choice(list(self.base[other]), 1)[0]

                triplet = (
                    np.asarray(get_spectrogram(first)),
                    np.asarray(get_spectrogram(foreign)),
                    np.asarray(get_spectrogram(second)),
                )
                train.append(as_matrix(triplet, shape))

            yield train

    def get_train_people(self, shape=SOUND_SHAPE, count=10000):
        """Build a labelled set of recording pairs.

        The first ``count // 2`` pairs hold two recordings of one randomly
        chosen speaker (label 1); the remaining pairs hold one recording of
        each of two different speakers (label 0).

        :param shape: ``(rows, columns)`` every spectrogram is fitted to.
        :param count: total number of pairs.
        :return: ``(X, y)`` where ``X`` is a float16 array of shape
            ``(count, 2, shape[0], shape[1])`` and ``y`` the array of labels.
        :raises ValueError: raised by ``choice`` when fewer than two speakers
            are stored and a different-speaker pair is requested.
        """
        people = list(self.base.keys())
        same_count = count // 2
        diff_count = count - same_count

        # Fill one preallocated array instead of collecting the pairs in a
        # list and copying them with np.array, which holds the data twice.
        X = np.zeros((count, 2) + tuple(shape), dtype="float16")

        for row in tqdm(range(0, same_count)):
            man = choice(people, 1)[0]
            first, second = self._pick_pair(self.base[man])
            X[row] = as_matrix([get_spectrogram(first), get_spectrogram(second)], shape)

        for row in tqdm(range(same_count, count)):
            # Two different speakers, drawn without replacement.
            man1, man2 = choice(people, 2, replace=False)
            a = choice(list(self.base[man1]), 1)[0]
            b = choice(list(self.base[man2]), 1)[0]
            X[row] = as_matrix([get_spectrogram(a), get_spectrogram(b)], shape)

        y = np.array([1] * same_count + [0] * diff_count)
        return X, y

In [None]:
choice([0], 2)

In [18]:
users = VoicesData("../../data/users.dl")

In [None]:
users.save()

In [19]:
X, y = users.get_train_people(count=40000)





In [None]:
data = users.get_train_vec()

In [None]:
del users

In [None]:
np.save("X", X)
np.save("y", y)

In [8]:
import theano
import theano.tensor as T

import lasagne

from lasagne.layers import InputLayer, DenseLayer, ReshapeLayer, Conv1DLayer, MaxPool1DLayer, GlobalPoolLayer, \
    get_output, get_all_params, get_all_param_values, set_all_param_values

from lasagne.nonlinearities import very_leaky_rectify, tanh

from lasagne.updates import adagrad

def make_speechtovec(incoming, sound_shape, num_units, group_size=3, **kwargs):
    """
    Build the network that turns every recording into a fixed-size vector.

    :param incoming: the layer feeding into this layer, or the expected input shape.
    :param sound_shape: shape of freq x time
    :param num_units: output vector dimension
    :param group_size: number of recordings per sample; the first returned
        layer regroups the vectors by it (3 for triplets, 2 for pairs)
    :param kwargs: accepted and ignored
    :return: tuple ``(all_vectors_output, output_dense)``: the vectors
        regrouped to ``(-1, group_size, num_units)`` and the layer that
        produces one vector per recording
    """
    # Flatten the groups so that all recordings follow one another.
    input_reshape = ReshapeLayer(incoming, (-1,) + sound_shape)
    convolution = Conv1DLayer(input_reshape, num_filters=100, filter_size=5,
                              nonlinearity=very_leaky_rectify, name="Convolutional")
    pooling = MaxPool1DLayer(convolution, 2)
    global_pooling = GlobalPoolLayer(pooling)
    dense = DenseLayer(global_pooling, num_units=300, name="Dense")
    output_dense = DenseLayer(dense, num_units=num_units, nonlinearity=lasagne.nonlinearities.linear, name='output')
    # Put the vectors of one sample back together.
    all_vectors_output = ReshapeLayer(output_dense, (-1, group_size, num_units))

    return all_vectors_output, output_dense

Using gpu device 4: GeForce GTX 1080 (CNMeM is enabled with initial size: 45.0% of memory, cuDNN 5105)


In [9]:
input_triplets = T.tensor4("Triplets input", dtype="float32")
target = T.ivector("Target")

In [None]:
triplets_input = InputLayer((None, 3) + SOUND_SHAPE, input_var=input_triplets)
# people_inputs = InputLayer((None, 2, 500, 513))
# from lasagne.layers import dimshuffle
# dimshuffle(triplets_input,[0,1,3,2])
vectorizer, _ = make_speechtovec(lasagne.layers.dimshuffle(triplets_input,[0,1,3,2]), SOUND_SHAPE[::-1], 300)

In [None]:
lasagne.layers.set_all_param_values(vectorizer, param)

In [None]:
all_pred = get_output(vectorizer)
params = get_all_params(vectorizer, trainable=True)

In [None]:
def loss_func(all_predicted):
    """Triplet hinge loss over a batch of embedded triplets.

    ``all_predicted`` is indexed as ``[triplet, position, unit]``.  The
    positions follow the order in which ``VoicesData.get_train_vec`` stacks a
    triplet: 0 is the reference recording, 1 the recording drawn as the
    contrasting one and 2 the second recording of the reference speaker.

    The loss of one triplet is ``max(d_same - d_other + alpha, 0)``, so it is
    zero once the same-speaker vector is closer than the contrasting one by
    at least ``alpha``.

    :param all_predicted: tensor of shape ``(batch, 3, num_units)``.
    :return: scalar expression, the sum of the per-triplet losses.
    """
    def distance_sq(x1, x2):
        # Squared Euclidean distance, computed for every triplet separately.
        return T.sum(T.sqr(x1 - x2), axis=-1)

    anchor = all_predicted[:, 0]
    d_other = distance_sq(anchor, all_predicted[:, 1])
    d_same = distance_sq(anchor, all_predicted[:, 2])
    alpha = 1e-2

    # The previous form, max(d1 + alpha, 0) - max(d2 + alpha, 0), reduces to
    # d_other - d_same over the whole batch: it pulled the contrasting
    # recording closer and had no lower bound.
    return T.sum(T.maximum(d_same - d_other + alpha, 0))

In [None]:
def cos_sim(vec1, vec2):
    """Cosine similarity of two tensors, each treated as one flat vector.

    Every sum runs over all elements of its argument, so the result is a
    single scalar.
    """
    dot_product = T.sum(vec1*vec2)
    squared_norms = T.sum(vec1**2)*T.sum(vec2**2)
    return dot_product/T.sqrt(squared_norms)

def denum_fun(v1, v2, similar_v, epsilon=0.0001):
    """Denominator by which ``loss_func_new`` divides a distance.

    :return: ``(cos_sim(v1, v2) + 1)/2 + epsilon`` when ``similar_v`` is
        truthy, ``(cos_sim(v1, v2) - 1)/2 + epsilon`` otherwise.
    """
    similarity = cos_sim(v1, v2)
    if not similar_v:
        return (similarity - 1)/2 + epsilon
    return (similarity + 1)/2 + epsilon

def loss_func_new(all_predicted, epsilon=0.0001):
    """Experimental loss: two squared distances, each divided by a
    cosine-based factor from ``denum_fun``.

    :param all_predicted: tensor indexed as ``[sample, position, unit]``;
        positions 0, 1 and 2 are read.
    :param epsilon: offset handed to ``denum_fun`` for both denominators.
    :return: scalar expression.
    """
    def distance_sq(x1, x2):
        # Sum over every element, i.e. over the whole batch at once.
        return T.sum(T.sqr(x1 - x2))

    d1 = distance_sq(all_predicted[:, 0], all_predicted[:, 1])
    d2 = distance_sq(all_predicted[:, 0], all_predicted[:, 2])

    # epsilon used to be accepted by this function but never reached
    # denum_fun, which silently fell back to its own default.
    d1 /= denum_fun(all_predicted[:, 0], all_predicted[:, 1], True, epsilon)
    d2 /= denum_fun(all_predicted[:, 0], all_predicted[:, 2], False, epsilon)
    alpha = 1e-2

    # NOTE(review): for the dissimilar case denum_fun returns
    # (cos - 1)/2 + epsilon, which is at most epsilon, so d2 is divided by a
    # negative or tiny number - confirm that this is intended.
    # NOTE(review): position 1 is treated as the "similar" one here, while
    # VoicesData.get_train_vec puts the other-speaker recording there - confirm.
    return T.maximum(d1 + alpha, 0) - T.maximum(d2 + alpha, 0)

In [None]:
loss = loss_func(all_pred)

updates = adagrad(loss, params)

In [None]:
train = theano.function([triplets_input.input_var], updates=updates)

In [None]:
# Largest number of triplets handed to one train() call.
BATCH_LIMIT = 500

shape = None  # shape of the last batch that was trained on, for diagnostics
users = VoicesData("../../data/users.dl")
for i in range(5):
    data = users.get_train_vec()
    for triplets in data:
        try:
            batch = np.array(triplets)

            # Feed the speaker's triplets in chunks of at most BATCH_LIMIT.
            # The earlier if/elif chain covered three chunks at most, which
            # left the last one unbounded for speakers with many triplets.
            for start in range(0, batch.shape[0], BATCH_LIMIT):
                train(batch[start:start + BATCH_LIMIT])

            shape = batch.shape
            del batch
        except MemoryError:
            # np.array itself may be what failed, so only values that are
            # certain to exist are reported.
            print(len(triplets), shape)
            break

In [None]:
np.save("../../data/weights_vec_new_new.npy", lasagne.layers.get_all_param_values(vectorizer))

In [None]:
# The file holds a list of differently shaped weight arrays, stored by np.save
# as an object array; numpy >= 1.16.3 refuses to load those unless pickling is
# allowed explicitly.  Only load weight files from a trusted source.
param = np.load("../../data/weights_vec_new.npy", allow_pickle=True)

In [12]:
# The file holds a list of differently shaped weight arrays, stored by np.save
# as an object array; numpy >= 1.16.3 refuses to load those unless pickling is
# allowed explicitly.  Only load weight files from a trusted source.
param = np.load("../../data/tvorog_vectorizer.npy", allow_pickle=True)

In [None]:
pr = theano.function([triplets_input.input_var], all_pred)

In [None]:
shape = None
for t in data:
    print(pr(t))
    break

In [None]:
t

In [None]:
np.array(t).dtype

In [None]:
# get_all_layers is not among the names imported from lasagne.layers, so it is
# reached through the package instead of raising a NameError.
for layer in lasagne.layers.get_all_layers(vectorizer):
    print(layer, layer.output_shape)

In [10]:
siminput = T.tensor3("Similar voice")

In [34]:
# Input of recording pairs; the name `triplets_input` and the `input_triplets`
# variable are reused from the triplet model.
triplets_input = InputLayer((None, 2) + SOUND_SHAPE, input_var=input_triplets)
# nn = lasagne.layers.batch_norm(triplets_input)
# people_inputs = InputLayer((None, 2, 500, 513))
# from lasagne.layers import dimshuffle
# Swap the last two axes to match the reversed SOUND_SHAPE, and keep the second
# return value of make_speechtovec (its `output_dense` layer) as the vectorizer.
_ ,vectorizer= make_speechtovec(lasagne.layers.dimshuffle(triplets_input,[0,1,3,2]), SOUND_SHAPE[::-1], 300)

# The similarity net has its own input of shape (None, 2, 300) and is not
# connected to `vectorizer` in the layer graph.
similar_inp = lasagne.layers.InputLayer((None, 2, 300), input_var=siminput)
# vector_output = ReshapeLayer(vectorizer, (-1, 2, 300))
nn = lasagne.layers.batch_norm(similar_inp)
conv_layer = Conv1DLayer(nn, 300, 2)
nn = lasagne.layers.batch_norm(conv_layer)
dense0 = DenseLayer(nn, 150)
nn = lasagne.layers.batch_norm(dense0)
dense0 = DenseLayer(nn, 50)
nn = lasagne.layers.batch_norm(dense0)
# Single sigmoid unit.
output = DenseLayer(nn, 1, nonlinearity=lasagne.nonlinearities.sigmoid)
# dense0 = DenseLayer(vector_output, 100)
# nn = lasagne.layers.batch_norm(dense0)
# output = DenseLayer(nn, 1, nonlinearity=lasagne.nonlinearities.sigmoid)

In [35]:
lasagne.layers.set_all_param_values(vectorizer, param)

In [36]:
predict = lasagne.layers.get_output(output)
vec_pr = lasagne.layers.get_output(vectorizer)

In [None]:
predict

In [37]:
parametrs = lasagne.layers.get_all_params(output, trainable=True)

In [None]:
parametrs

In [38]:
# Binary cross-entropy between `predict` and `target`, summed (not averaged)
# over all elements.
# NOTE(review): `predict` comes from a DenseLayer with one unit while `target`
# is an ivector - confirm that the two shapes combine as intended.
loss = lasagne.objectives.binary_crossentropy(predict, target).sum()
# Mean binary accuracy; defined here but not used by the updates below.
acc = lasagne.objectives.binary_accuracy(predict, target).mean()
# Adamax step on the parameters collected in `parametrs`.
updates = lasagne.updates.adamax(loss, parametrs,learning_rate = 0.0001)
# updates = lasagne.updates.apply_nesterov_momentum(updates, parametrs)

In [41]:
train = theano.function([similar_inp.input_var, target] ,updates=updates, allow_input_downcast=True)
vectoriz = theano.function([triplets_input.input_var], vec_pr, allow_input_downcast=True)
presd = theano.function([similar_inp.input_var],predict ,allow_input_downcast=True)

In [26]:
ind = np.arange(len(X))
np.random.shuffle(ind)


In [27]:
in_train, in_test = ind[1000:], ind[:1000]
# y_train, y_test = Y[1000:], Y[:1000]

In [22]:
def iterate_minibatches(inputs, targets, ind, batchsize, shuffle=True):
    """Yield ``(inputs, targets)`` mini-batches restricted to the rows in ``ind``.

    Only full batches are produced: trailing rows that do not fill a batch
    are dropped.

    :param inputs: array of samples.
    :param targets: array of labels, as long as ``inputs``.
    :param ind: array of row indices to draw from.
    :param batchsize: number of rows per batch.
    :param shuffle: visit ``ind`` in a freshly shuffled order when true,
        otherwise in its stored order.
    """
    assert len(inputs) == len(targets)
    total = len(ind)
    if shuffle:
        order = np.arange(total)
        np.random.shuffle(order)
    batch_starts = range(0, total - batchsize + 1, batchsize)
    for lo in tqdm(batch_starts):
        hi = lo + batchsize
        rows = ind[order[lo:hi]] if shuffle else ind[slice(lo, hi)]
        yield inputs[rows], targets[rows]

In [23]:
from time import time

In [None]:
# The stored weights are an object array (a list of differently shaped
# arrays); numpy >= 1.16.3 only loads those with allow_pickle=True.  Only load
# weight files from a trusted source.
# NOTE(review): the weights are saved further down as
# "../../data/simvoice_weights.npy" - confirm that this path is the intended one.
lasagne.layers.set_all_param_values(
    output, np.load("../../data/weights/symvoice_weights.npy", allow_pickle=True))

In [47]:
# Imported here because the only other import of roc_auc_score sits in a later
# cell, which breaks a top-to-bottom run.
from sklearn.metrics import roc_auc_score

EPOCH = 100

for epoch in range(EPOCH):
    st = time()
    for i, batch in enumerate(iterate_minibatches(X, y, in_train, 1000)):
        x_tr, y_tr = batch
        # Embed both recordings of every pair, then fit the similarity net on
        # the pairs of embeddings.
        train(vectoriz(x_tr).reshape((-1, 2, 300)), y_tr)
        # Cap the number of batches per epoch.
        if i > 100:
            break
    print('\r', "Time: ", (time()-st)/60.)
    # The value is the ROC AUC on the held-out pairs; it used to be printed
    # under the label "Accuracy".
    print("\tROC AUC: ", roc_auc_score(y[in_test], presd(vectoriz(X[in_test]).reshape((-1, 2, 300)))))




Exception in thread Thread-32:
Traceback (most recent call last):
  File "/anaconda3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/anaconda3/lib/python3.5/site-packages/tqdm/_tqdm.py", line 102, in run
    for instance in self.tqdm_cls._instances:
  File "/anaconda3/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




 Time:  2.569850039482117
	Accuracy:  0.848934218747

 Time:  2.6202631910641987
	Accuracy:  0.851259231965

 Time:  2.6109565774599712
	Accuracy:  0.849961550169

 Time:  2.6408819874127705
	Accuracy:  0.850097726654

 Time:  2.610487627983093
	Accuracy:  0.847021740175


KeyboardInterrupt: 

In [40]:
del train, vectoriz, presd

In [None]:
lasagne.layers.get_all_param_values(vectorizer)

In [None]:
acc_fun(X[in_test], y[in_test])

In [28]:
y[in_test].sum()/len(y[in_test])

0.51800000000000002

In [24]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y[in_test], presd(vectoriz(X[in_test]).reshape((-1, 2, 300))))

In [43]:
np.save("../../data/simvoice_weights.npy", lasagne.layers.get_all_param_values(output))

In [46]:
np.save("../../data/vectorizer_weights.npy", lasagne.layers.get_all_param_values(vectorizer))

In [None]:
t = None

In [48]:
presd(vectoriz(X[in_test]).reshape((-1, 2, 300)))

array([[ 0.12914398],
       [ 0.74728978],
       [ 0.25182235],
       [ 0.51052099],
       [ 0.10784314],
       [ 0.28822351],
       [ 0.03875925],
       [ 0.35590574],
       [ 0.14068604],
       [ 0.27607948],
       [ 0.60729241],
       [ 0.4615072 ],
       [ 0.99822563],
       [ 0.9522745 ],
       [ 0.64928299],
       [ 0.55753684],
       [ 0.98516035],
       [ 0.31106356],
       [ 0.45526385],
       [ 0.99878091],
       [ 0.2649135 ],
       [ 0.94076777],
       [ 0.12188268],
       [ 0.17551516],
       [ 0.18895182],
       [ 0.20877738],
       [ 0.55359083],
       [ 0.36299482],
       [ 0.23655155],
       [ 0.96475905],
       [ 0.36400989],
       [ 0.20507111],
       [ 0.36992726],
       [ 0.27127019],
       [ 0.97369474],
       [ 0.99461949],
       [ 0.21880427],
       [ 0.01912975],
       [ 0.98882854],
       [ 0.04067783],
       [ 0.12097206],
       [ 0.99018025],
       [ 0.45173803],
       [ 0.83675075],
       [ 0.56882501],
       [ 0