# Датасет

In [1]:
# Для того, чтобы получить чиселки из изображения
import numpy as np

# Для того, чтобы получить изображение
from PIL import Image

In [2]:
def get_spectrogram(img_name):
    '''Return the spectrogram image stored under the given id as a numpy array.

    The image is read from ``../../data/images/<img_name>.png``. It is opened
    in a ``with`` block so the underlying file is closed as soon as the pixel
    data has been copied into the array.
    '''
    with Image.open("../../data/images/{}.png".format(img_name)) as img:
        return np.array(img)

In [3]:
# Для загрузки и обработки csv с данными
import pandas as pd

In [4]:
# Load the pronunciation metadata.
# `DataFrame.from_csv` was deprecated and later removed from pandas;
# `read_csv(index_col=0, parse_dates=True)` is its documented equivalent
# (first column becomes the index, index values are parsed as dates if possible).
data = pd.read_csv("../../data/pronuns.csv", index_col=0, parse_dates=True)

In [5]:
# Sanity check: preview the first rows of the loaded table
data.head()

Unnamed: 0,pronun_rank,_id,visits,gender,word_rank,user,accent,votes,pronun_id,pronuns,best_pronuns,word,when_word_added,global_listenings
0,662,585fc620698f824ee334a626,61503,False,1564,mariad,Spain,0,585fda10698f828c848d862d,573,199,0_zero,2010-04-25,192
1,21737,585fc7f4698f824ee334afc3,303426,True,14,Wojtula,United States,0,585fda11698f828c848d862e,7,0,11_jedenaście,2013-05-18,638
2,9,585fc5e7698f824ee334a4ea,162426,False,998,usako_usagiclub,Japan,1,585fda11698f828c848d862f,25504,0,１１９番,2015-06-06,743
3,228,585fc8e7698f824ee334b4d5,60338,True,13400,SeanMauch,United States,0,585fda11698f828c848d8630,1765,0,12,2008-07-10,57K
4,153,585fc854698f824ee334b1ca,342195,False,641,anakat,United States,0,585fda11698f828c848d8631,2851,969,12,2008-07-10,57K


In [6]:
# Загрузим данные в 'классном' виде
from collections import defaultdict

# Mapping whose missing keys start out as an empty list
users_pronun_dict = defaultdict(list)

In [7]:
# Статус бар
from tqdm import tqdm_notebook

In [8]:
# For every row, file the value at position 8 under the key at position 5.
# NOTE(review): positions 5 and 8 presumably are the `user` and `pronun_id`
# columns — confirm against `data.columns`.
# Re-running this cell appends the same values again.
for record in tqdm_notebook(data.values):
    group_key, grouped_value = record[5], record[8]
    users_pronun_dict[group_key].append(grouped_value)




In [9]:
# Build the two pools that are later zipped into triplets:
#   parity -- (user, [item_a, item_b]): two items that share a user
#   ones   -- (user, [item]): a single item
ones = []
parity = []

for user, all_items in tqdm_notebook(users_pronun_dict.items()):
    remaining = all_items
    if len(remaining) % 2 == 1:
        # Odd count: the first item becomes a single so the rest pairs up evenly.
        ones.append((user, [remaining[0]]))
        remaining = remaining[1:]

    # Pair consecutive items. (The previous `item_index-1` indexing started at
    # -1 and therefore paired the last item with the first one.)
    for start in range(0, len(remaining), 2):
        parity.append((user, [remaining[start], remaining[start + 1]]))

# Rebalance the pools: while pairs outnumber singles, break up pairs from the
# front of `parity` and add both halves to `ones`.
split_count = 0
while len(ones) < len(parity) - split_count:
    pair_user, pair_items = parity[split_count]
    ones.extend((pair_user, [item]) for item in pair_items)
    split_count += 1

# Remove exactly the pairs that were broken up, so no item is used both inside
# a pair and as a single. Nothing is sliced off when singles already outnumber
# pairs (the old `parity[len(parity) - len(ones):]` went negative in that case
# and silently discarded pairs).
parity = parity[split_count:]




In [10]:
# Each dataset row is [pair_item_a, pair_item_b, single_item].
# Both loss functions in this notebook compare element 0 with element 1 and
# element 0 with element 2, so the single must not come from the same user as
# the pair; such rows are skipped.
dataset = [
    pair_items + single_items
    for (pair_user, pair_items), (single_user, single_items) in zip(parity, ones)
    if pair_user != single_user
]

In [11]:
from random import shuffle

In [12]:
# Size of one spectrogram as fed to the network.
# NOTE(review): presumably (time frames, frequency bins) — the network builder
# documents its `sound_shape` as freq x time and is given SOUND_SHAPE[::-1]; confirm.
SOUND_SHAPE = (500, 513)

In [13]:
def as_matrix(ar, shape = (500, 513)):
    """Stack 2-D arrays into one zero-padded, rescaled float16 batch.

    Every element is divided by 200 and written into its own slice of the
    result. Elements with more than ``shape[0]`` rows are cut to the first
    ``shape[0]`` rows; shorter elements leave the remaining rows at zero.
    The second axis is not adjusted and has to be compatible with ``shape[1]``.

    :param ar: sequence of arrays (anything exposing ``.shape`` and slicing)
    :param shape: (rows, columns) of every slice in the result
    :return: float16 array of shape ``(len(ar), shape[0], shape[1])``
    :raises IndexError: an element has too few dimensions
    :raises ValueError: an element cannot be broadcast into its slice
    """
    ret_mat = np.zeros((len(ar), shape[0], shape[1]), dtype="float16")
    for i, vec in enumerate(ar):
        try:
            if vec.shape[0] > shape[0]:
                ret_mat[i, :, :] = vec[:shape[0], :]/200.
            else:
                ret_mat[i, :vec.shape[0], :] = vec/200.
        except (IndexError, ValueError) as err:
            # Re-raise the same exception type, but say which element failed
            # and why, instead of a bare IndexError without any message.
            raise type(err)(
                "as_matrix: element {} with shape {} does not fit target shape {}: {}".format(
                    i, vec.shape, shape, err))

    return ret_mat

In [27]:
def get_dataset(batch_size=100):
    """Shuffle the global ``dataset`` in place and yield it in batches.

    Each row of ``dataset`` holds three spectrogram ids. The three spectrograms
    are loaded with ``get_spectrogram`` and combined with ``as_matrix``; the
    results are collected into lists of ``batch_size`` entries.

    :param batch_size: number of triplets per yielded batch
    :return: generator of lists; the last list may be shorter than ``batch_size``
    """
    shuffle(dataset)

    train = []
    for i in dataset:
        m1, m2, m3 = get_spectrogram(i[0]), get_spectrogram(i[1]), get_spectrogram(i[2])
        # Append first, then check: previously the triplet that arrived when the
        # batch was full had been loaded and then thrown away.
        train.append(as_matrix((m1, m2, m3)))
        if len(train) == batch_size:
            yield train
            train = []

    # Hand out the remaining triplets instead of silently dropping them.
    if train:
        yield train

# Нейронка

In [15]:
# Set THEANO_FLAGS for this kernel (GPU selection).
# NOTE(review): presumably this has to run before `import theano` to take effect — confirm.
%env THEANO_FLAGS="device=gpu1"

env: THEANO_FLAGS="device=gpu1"


In [16]:
import theano
import theano.tensor as T

import lasagne

from lasagne.layers import InputLayer, DenseLayer, ReshapeLayer, Conv1DLayer, MaxPool1DLayer, GlobalPoolLayer, \
    get_output, get_all_params, get_all_param_values, set_all_param_values

from lasagne.nonlinearities import very_leaky_rectify, tanh

from lasagne.updates import adagrad

def make_speechtovec(incoming, sound_shape, num_units, **kwargs):
    """
    Build the layer stack that turns every recording of a triplet into a vector.

    :param incoming: the layer feeding into this layer, or the expected input shape.
    :param sound_shape: shape of freq x time
    :param num_units: output vector dimension
    :param kwargs: accepted but not used by this function
    :return: tuple ``(triplet_vectors, flat_vectors)``: the output layer reshaped
        to ``(-1, 3, num_units)`` and the same output layer before that reshape
    """
    # Lay all recordings out one after another, one row per recording
    flat_sounds = ReshapeLayer(incoming, (-1,) + sound_shape)

    # Convolution -> max pooling -> global pooling -> hidden dense layer
    features = Conv1DLayer(flat_sounds, num_filters=100, filter_size=5,
                           nonlinearity=very_leaky_rectify, name="Convolutional")
    features = MaxPool1DLayer(features, 2)
    features = GlobalPoolLayer(features)
    features = DenseLayer(features, num_units=300, name="Dense")

    # Linear projection to the requested vector size
    flat_vectors = DenseLayer(features, num_units=num_units,
                              nonlinearity=lasagne.nonlinearities.linear, name='output')

    # Regroup the rows into triplets
    triplet_vectors = ReshapeLayer(flat_vectors, (-1, 3, num_units))

    return triplet_vectors, flat_vectors

Using gpu device 1: GeForce GTX 1080 (CNMeM is enabled with initial size: 45.0% of memory, cuDNN 5105)


In [17]:
# Symbolic 4-D float32 tensor that receives the batches of triplets
input_triplets = T.tensor4("Triplets input", dtype="float32")

In [18]:
# Input layer: batches of 3 spectrograms of SOUND_SHAPE each
triplets_input = InputLayer((None, 3) + SOUND_SHAPE, input_var=input_triplets)

# Swap the last two axes so the data matches the reversed SOUND_SHAPE that is
# handed to make_speechtovec; every recording is mapped to a 300-d vector.
swapped_axes_input = lasagne.layers.dimshuffle(triplets_input, [0, 1, 3, 2])
vectorizer, _ = make_speechtovec(swapped_axes_input, SOUND_SHAPE[::-1], 300)

In [19]:
# Symbolic output of the vectorizer network
all_pred = get_output(vectorizer)
# Parameters of the network that are tagged as trainable
params = get_all_params(vectorizer, trainable=True)

In [20]:
def loss_func(all_predicted, alpha=1e-2):
    """Triplet hinge loss averaged over the batch.

    For every triplet the squared distance between elements 0 and 1 (``d1``)
    should be smaller than the squared distance between elements 0 and 2
    (``d2``) by at least ``alpha``: the per-triplet loss is
    ``max(d1 - d2 + alpha, 0)``.

    The previous form ``max(d1 + alpha, 0) - max(d2 + alpha, 0)`` reduced to
    ``d1 - d2`` because both distances are non-negative, so the margin had no
    effect and the loss was unbounded below.

    :param all_predicted: tensor indexed as (triplet, position in triplet, feature)
    :param alpha: margin between the two distances
    :return: scalar loss
    """
    def distance_sq(x1, x2):
        # Squared Euclidean distance, one value per triplet
        return T.sum(T.sqr(x1 - x2), axis=1)

    d1 = distance_sq(all_predicted[:, 0], all_predicted[:, 1])
    d2 = distance_sq(all_predicted[:, 0], all_predicted[:, 2])

    return T.mean(T.maximum(d1 - d2 + alpha, 0))

In [21]:
def cos_sim(vec1, vec2):
    """Cosine similarity of the two inputs, with all elements summed together.

    No guard is applied: all-zero input makes the divisor zero.
    """
    dot_product = T.sum(vec1*vec2)
    norm_product = T.sqrt(T.sum(vec1**2)*T.sum(vec2**2))
    return dot_product/norm_product

def denum_fun(v1, v2, similar_v, epsilon=0.0001):
    """Divisor applied to a distance in ``loss_func_new``.

    Returns ``(cos_sim + 1)/2 + epsilon`` when ``similar_v`` is truthy and
    ``(cos_sim - 1)/2 + epsilon`` otherwise.
    """
    similarity = cos_sim(v1, v2)
    shifted = similarity + 1 if similar_v else similarity - 1
    return shifted/2 + epsilon

def loss_func_new(all_predicted, epsilon=0.0001):
    """Distance-based triplet loss with cosine-similarity-dependent divisors.

    ``d1`` is the summed squared difference between elements 0 and 1 of the
    triplets, ``d2`` the one between elements 0 and 2. Each is divided by the
    value of ``denum_fun`` for the same two inputs before the final expression
    is formed.

    :param all_predicted: tensor indexed as (triplet, position in triplet, feature)
    :param epsilon: offset forwarded to ``denum_fun`` (it used to be accepted
        here but never passed on, so changing it had no effect)
    :return: scalar loss
    """
    def distance_sq(x1, x2):
        # Summed over every element, i.e. over the whole batch at once
        return T.sum(T.sqr(x1 - x2))

    d1 = distance_sq(all_predicted[:, 0], all_predicted[:, 1])
    d2 = distance_sq(all_predicted[:, 0], all_predicted[:, 2])

    d1 /= denum_fun(all_predicted[:, 0], all_predicted[:, 1], True, epsilon)
    # NOTE(review): for the non-similar case denum_fun returns
    # (cos_sim - 1)/2 + epsilon, which is at most epsilon and negative for most
    # inputs, so d2 usually changes sign here — confirm this is intended.
    d2 /= denum_fun(all_predicted[:, 0], all_predicted[:, 2], False, epsilon)
    alpha = 1e-2

    # NOTE(review): the margin is added to both terms separately rather than to
    # their difference (the usual max(d1 - d2 + alpha, 0)) — confirm.
    return T.maximum(d1 + alpha, 0) - T.maximum(d2 + alpha, 0)

In [22]:
# Step size handed to the adamax optimizer
LEARNING_RATE = 0.0001

# Training objective built from the network output
loss = loss_func_new(all_pred)

# Parameter updates produced by adamax for that objective
updates = lasagne.updates.adamax(loss, params, learning_rate=LEARNING_RATE)

In [23]:
# Compiled training step: applies `updates` for one batch of triplets and
# returns the loss of that batch so training progress can be monitored
# (without an output the function returned nothing useful).
train = theano.function([triplets_input.input_var], outputs=loss, updates=updates)

In [None]:
# Training loop: every epoch walks once through the freshly shuffled batches.
N_EPOCHS = 100000
MAX_CHUNK = 700  # largest number of triplets handed to a single `train` call

last_shape = None       # shape of the last batch that was trained successfully
out_of_memory = False

for epoch in range(N_EPOCHS):
    for batch in tqdm_notebook(get_dataset()):
        try:
            batch = np.array(batch)

            # Feed the batch in chunks of at most MAX_CHUNK triplets
            # (the former two-way split only covered up to 2 * 700).
            for start in range(0, batch.shape[0], MAX_CHUNK):
                train(batch[start:start + MAX_CHUNK])

            last_shape = batch.shape
            del batch
        except MemoryError:
            # `batch` is still a plain list if the conversion itself failed,
            # so do not rely on it having a `.shape`.
            print("MemoryError in epoch {}: batch shape {}, last good shape {}".format(
                epoch, getattr(batch, "shape", None), last_shape))
            out_of_memory = True
            break

    # Stop training altogether; leaving only the inner loop would run into the
    # same error again in every following epoch.
    if out_of_memory:
        break

In [29]:
# Persist the learned weights of the vectorizer.
param_values = lasagne.layers.get_all_param_values(layer=vectorizer)

# The weight arrays have different shapes, so store them in an explicit 1-D
# object array; relying on implicit conversion of a ragged list fails on
# newer NumPy versions. Load with np.load(..., allow_pickle=True).
packed_values = np.empty(len(param_values), dtype=object)
for index, value in enumerate(param_values):
    packed_values[index] = value

# `with` guarantees the file is flushed and closed after writing.
with open('../../data/tvorog_vectorizer.npy', 'wb') as weights_file:
    np.save(weights_file, packed_values)

In [None]:
# NOTE(review): leftover debug cell — it only prints a fixed string; candidate for removal.
print("lol")