# Real-world V2V

This notebook shows the diversity of V2V usage.

In [1]:
# Set the THEANO_FLAGS environment variable so that Theano is asked for device "gpu4".
# NOTE(review): the recorded output of the Theano import cell reports that device 4
# failed to initialise and that gpu device 0 was used instead — confirm the intended device.
%env THEANO_FLAGS="device=gpu4"

env: THEANO_FLAGS="device=gpu4"


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from PIL import Image

from librosa import load, logamplitude
from librosa.feature import melspectrogram

import os
import shutil

from IPython.display import clear_output

In [3]:
import sys
sys.path.append("../clear")
from CoolSoundNetwork import Network

## Dataset preparation

In [4]:
# Load the pronunciation metadata table.
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv(..., index_col=0, parse_dates=True)` reproduces its defaults
# (first CSV column becomes the index, date-like index values are parsed).
df = pd.read_csv("../../data/pronuns.csv", index_col=0, parse_dates=True)

### Vectorizing voices

In [25]:
# Instantiate the project's `Network` (imported from CoolSoundNetwork above);
# its `vectorizer` is what the next cell calls on every image.
network = Network(
    load_weights=False,
    vec_weights_file_name='../../data/weights.npy',
)

In [54]:
# Run every image in `data_path` through the vectorizer and store each
# resulting vector as its own file named "<image stem>_<index>.npy".
data_path = "../../data/images/"
output_path = "../../data/vectors/"
for im in tqdm(os.listdir(data_path)):
    # `im.strip(".png")` removes any leading/trailing '.', 'p', 'n' or 'g'
    # characters instead of the suffix (e.g. "sing.png" -> "si"), which
    # corrupts the id encoded in the output name. splitext drops only the
    # extension.
    stem = os.path.splitext(im)[0]
    # The context manager releases the file handle as soon as the pixels
    # have been copied into an array.
    with Image.open(data_path + im) as img:
        voice_array = np.array(img)
    vectors = network.vectorizer(voice_array=voice_array)
    for index, vector in enumerate(vectors):
        np.save(output_path + stem + "_" + str(index), vector)




### Filter out small classes

In [None]:
# Collect the labels of `category` that occur at least `min_num` times.
category = "accent"
min_num = 100

# `value_counts()` is indexed by label. The previous `enumerate(...)` loop
# collected positional indices (0, 1, 2, ...) instead of the labels, so the
# later `isin(final_categorys)` filter compared labels against positions.
category_counts = df[category].value_counts()
final_categorys = list(category_counts[category_counts >= min_num].index)

In [None]:
# Keep only the rows whose `category` value is listed in `final_categorys`.
df = df.loc[df[category].isin(final_categorys)]

In [13]:
# Number of distinct accent values in `df`; used below to size
# `accent_counter` and the accent output layer.
accent_count = len(df["accent"].unique())

### Filtering dataset

In [74]:
# Delete every vector file whose recording id (the part of the file name
# before the first "_") does not occur in `df.pronun_id`.
vectors_dir = "../../data/vectors/"

# Ids parsed from file names are always strings, so compare as strings;
# looking a str up in a non-string index would miss every id and delete
# the whole directory.
known_ids = set(map(str, df.pronun_id.unique()))

for file in tqdm(os.listdir(vectors_dir)):
    path = os.path.join(vectors_dir, file)
    if not os.path.isfile(path):
        # Sub-directories (e.g. the test split folder) cannot be passed to os.remove.
        continue
    name = file.split("_")[0]
    # Explicit membership test instead of a bare `except:`, which also caught
    # KeyboardInterrupt and then removed the file being inspected.
    if name not in known_ids:
        os.remove(path)

### Making the test set

In [None]:
# Move up to `test_size` vector files per gender from the vectors folder into
# its "test" sub-folder, then build `test_df` from the ids that ended up there.
test_size = 1000

vectors_dir = "../../data/vectors/"
test_dir = vectors_dir + "test/"
if not os.path.isdir(test_dir):
    os.makedirs(test_dir)

# Gender looked up by recording id. The previous code created a frame indexed
# by `pronun_id` but then queried `df`, which is indexed by the first CSV column.
# Ids are compared as strings because they are parsed from file names.
gender_by_id = df.set_index("pronun_id").gender
gender_by_id.index = gender_by_id.index.astype(str)
# A repeated id would make the lookup return a Series; keep the first entry.
gender_by_id = gender_by_id[~gender_by_id.index.duplicated()]

# Only plain files: the "test" folder itself lives inside `vectors_dir`.
directory = [f for f in os.listdir(vectors_dir) if os.path.isfile(vectors_dir + f)]

men_counter = 0
women_counter = 0

accent_counter = np.zeros(accent_count)  # not used by this cell

# NOTE(review): files are moved one by one, so vectors cut from the same
# recording can land on both sides of the split — confirm this is acceptable.
for file in directory:
    if men_counter >= test_size and women_counter >= test_size:
        break
    # Assumes a truthy `gender` means "man", as the counter names suggest — TODO confirm.
    is_man = bool(gender_by_id[file.split("_")[0]])
    if is_man:
        # Once the male quota is full, male files are skipped; previously they
        # fell through to the `elif` and were counted as women.
        if men_counter < test_size:
            shutil.move(vectors_dir + file, test_dir + file)
            men_counter += 1
    elif women_counter < test_size:
        shutil.move(vectors_dir + file, test_dir + file)
        women_counter += 1

del gender_by_id

test_ids = {name.split("_")[0] for name in os.listdir(test_dir)}
test_df = df[df.pronun_id.astype(str).isin(test_ids)]
test_df = test_df.set_index("pronun_id")

In [None]:
# Build the evaluation inputs/targets: one vector per row of `test_df`
# (picked at random when a recording has several) plus the matching labels.
test_dir = "../../data/vectors/test/"

# Group the files that are actually in the test folder by recording id.
# The previous version searched the listing taken before files were moved,
# so it could pick a file that does not exist under `test_dir`, and it used
# `strip(".npy")`, which strips characters rather than the suffix.
test_files_by_id = {}
for file_name in sorted(os.listdir(test_dir)):
    test_files_by_id.setdefault(file_name.split("_")[0], []).append(file_name)

test_X = []
# `test_df` is indexed by pronun_id, so iterate its index
# (`test_df.keys` was an uncalled method and would refer to the columns).
for rec_id in test_df.index:
    candidates = test_files_by_id[str(rec_id)]
    rec_name = np.random.choice(candidates)
    test_X.append(np.load(test_dir + rec_name))

# Same row order as `test_X` (the name `test` used previously was undefined).
test_Y = [test_df.gender, test_df.accent]

## Network architecture

In [5]:
import theano
import theano.tensor as T

import lasagne

ERROR (theano.sandbox.cuda): ERROR: Not using GPU. Initialisation of device 4 failed:
initCnmem: cnmemInit call failed! Reason=CNMEM_STATUS_OUT_OF_MEMORY. numdev=1

Using gpu device 0: GeForce GTX 1080 (CNMeM is enabled with initial size: 45.0% of memory, cuDNN 5105)


In [6]:
# Symbolic variables the two classifier heads are built on.
# Matrix of input vectors; presumably one voice vector per row — confirm.
input_tensor = T.matrix("Vector input")
# Integer vector of gender targets, one entry per sample.
target_gender = T.ivector("Target gender")
# Integer vector of accent targets, one entry per sample.
target_accent = T.ivector("Target acent")

In [7]:
# Shared front end for both heads: an input layer of declared shape (100, 100)
# bound to `input_tensor`, wrapped with Lasagne's batch normalization.
# NOTE(review): the first shape entry is presumably the batch size, yet the
# training cell feeds batches of 150 — consider `None` for that dimension; confirm.
input_ = lasagne.layers.InputLayer((100, 100), input_var=input_tensor, name="Network input")
batch_norm0 = lasagne.layers.batch_norm(input_, name="Batch normalization")

### GenderNet

In [8]:
# Gender head: two dense layers (50 and 20 units) stacked on the shared
# batch-normalized input, followed by a single sigmoid output unit.
gend_dense0 = lasagne.layers.DenseLayer(batch_norm0, 50, name="Dense 0")
gend_dense1 = lasagne.layers.DenseLayer(gend_dense0, 20, name="Dense 1")
gender_out = lasagne.layers.DenseLayer(gend_dense1, 1, nonlinearity=lasagne.nonlinearities.sigmoid, name="Output")

In [9]:
# Display the gender head's output shape (the recorded output is (100, 1)).
gender_out.output_shape

(100, 1)

In [10]:
# Symbolic output of the gender head and the list of its trainable parameters
# (collected through every layer below `gender_out`).
gender_predicted = lasagne.layers.get_output(gender_out)
gender_param = lasagne.layers.get_all_params(gender_out, trainable=True)

In [11]:
# Mean binary cross-entropy between the sigmoid output and the gender targets.
# `gender_out` emits shape (batch, 1) while `target_gender` is a vector of
# shape (batch,). Theano does not broadcast a non-broadcastable size-1 axis,
# so the element-wise loss fails with a dimension mismatch unless the
# prediction is flattened to a vector first.
gender_loss = lasagne.objectives.binary_crossentropy(gender_predicted.flatten(), target_gender).mean()

# Adagrad update rules for the gender head's trainable parameters.
gender_updates = lasagne.updates.adagrad(gender_loss, gender_param)

### AccentNet

In [14]:
# Accent head: two dense layers (200 and 100 units) on the shared
# batch-normalized input, followed by a softmax over `accent_count` classes.
accent_dense0 = lasagne.layers.DenseLayer(batch_norm0, 200, name="Dense 0")
accent_dense1 = lasagne.layers.DenseLayer(accent_dense0, 100, name="Dense 1")
accent_out = lasagne.layers.DenseLayer(accent_dense1, accent_count,
                                       nonlinearity=lasagne.nonlinearities.softmax, name="Accent output")

In [15]:
# Display the accent head's output shape (the recorded output is (100, 111)).
accent_out.output_shape

(100, 111)

In [16]:
# Symbolic output of the accent head and the list of its trainable parameters
# (collected through every layer below `accent_out`).
accent_predicted = lasagne.layers.get_output(accent_out)
accent_param = lasagne.layers.get_all_params(accent_out, trainable=True)

In [17]:
# Mean categorical cross-entropy between the softmax output and the accent
# class indices. The head is a multi-class softmax and `target_accent` is an
# integer vector of class indices, so the categorical objective is the
# matching one; the binary objective compares element-wise and cannot pair a
# (batch, classes) prediction with a (batch,) index vector.
accent_loss = lasagne.objectives.categorical_crossentropy(accent_predicted, target_accent).mean()

# Adagrad update rules for the accent head's trainable parameters.
accent_updates = lasagne.updates.adagrad(accent_loss, accent_param)

### Compilation

In [18]:
# Training steps: each applies one round of Adagrad updates for its head.
gender_train = theano.function([input_.input_var, target_gender], updates=gender_updates)
accent_train = theano.function([input_.input_var, target_accent], updates=accent_updates)

# Inference graphs are built with deterministic=True so that batch
# normalization uses its stored running statistics. The training-mode outputs
# normalise with the statistics of whatever batch is passed in and also update
# the running averages, which should not happen while predicting.
gender_inference, accent_inference = lasagne.layers.get_output(
    [gender_out, accent_out], deterministic=True)

gender_predict = theano.function([input_.input_var], gender_inference)
accent_predict = theano.function([input_.input_var], accent_inference)

# Both heads in one call; returns [gender output, accent output].
predict = theano.function([input_.input_var], [gender_inference, accent_inference])

## Training

In [146]:
def iterate_minibatches(input_path, output_df, batchsize, index_col="pronun_id",
                            target_names=["gender"],shuffle=True):
    """Yield (inputs, targets) mini-batches built from the .npy files in a folder.

    Each file name is expected to start with "<id>_"; the id is looked up in
    `output_df[index_col]` to fetch the target values for that file.

    Parameters
    ----------
    input_path : str
        Folder holding the vector files. Sub-directories are ignored.
    output_df : pandas.DataFrame
        Table containing `index_col` and every column in `target_names`.
    batchsize : int
        Number of files per batch. A trailing incomplete batch is dropped.
    index_col : str
        Column of `output_df` holding the recording ids.
    target_names : list of str
        Target columns to return, in this order. The default list is never mutated.
    shuffle : bool
        Visit the files in random order when True.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        Stacked inputs and an array of shape (batchsize, len(target_names)).

    Raises
    ------
    KeyError
        If a file's id is not present in `output_df[index_col]`.
    """
    # `os.isfile` does not exist; the check lives in `os.path`.
    input_files = np.array([
        os.path.join(input_path, entry)
        for entry in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, entry))
    ])

    targets = output_df.set_index(index_col)[list(target_names)]
    # Ids parsed from file names are strings, so match them against string keys.
    targets.index = targets.index.astype(str)
    # A repeated id would make `.loc` return several rows; keep the first.
    targets = targets[~targets.index.duplicated()]

    if shuffle:
        indices = np.arange(input_files.size)
        np.random.shuffle(indices)
    for start_idx in range(0, len(input_files) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        inputs = []
        rec_targ = []
        for inp in input_files[excerpt]:
            # Parse the id from the file name only, so an underscore in a
            # directory name cannot truncate it.
            name = os.path.basename(inp).split("_")[0]
            inputs.append(np.load(inp))
            # Row lookup by id; `targets[name]` would have been a column lookup.
            rec_targ.append(targets.loc[name].values)
        yield np.array(inputs), np.array(rec_targ)

In [147]:
# Train both heads for EPOCH passes over the vector files and periodically
# report their accuracy on the held-out test vectors.
EPOCH = 5
BATCH_SIZE = 150
# The earlier `curr % 100 == 0` check, with `curr` growing by 150, fired on
# every second batch; the same cadence is spelled out here.
EVAL_EVERY_BATCHES = 2


def gender_accuracy(predicted, target):
    """Fraction of samples whose sigmoid output, thresholded at 0.5, equals the label."""
    # Assumes gender labels are stored as 0/1 (or booleans) — TODO confirm.
    hard = (np.asarray(predicted).ravel() > 0.5).astype(int)
    return np.mean(hard == np.asarray(target).astype(int))


def accent_accuracy(predicted, target):
    """Fraction of samples whose highest-scoring class equals the label."""
    # Assumes accent labels are integer class indices — TODO confirm.
    return np.mean(np.argmax(predicted, axis=1) == np.asarray(target))


def evaluate_on_test():
    """Return (gender accuracy, accent accuracy) on the held-out vectors."""
    gender_pred, accent_pred = predict(test_inputs)
    return gender_accuracy(gender_pred, test_Y[0]), accent_accuracy(accent_pred, test_Y[1])


test_inputs = np.asarray(test_X, dtype=theano.config.floatX)

# `gender_loss` / `accent_loss` are symbolic expressions and cannot be called on
# arrays (and `acccent_loss` was a misspelt name); the reported metric is
# computed with NumPy instead, matching the "accuracy" labels printed below.
g_l, a_l = evaluate_on_test()

for epoch in range(EPOCH):
    curr = 0
    batches = iterate_minibatches("../../data/vectors/", df, BATCH_SIZE, target_names=["gender", "accent"])
    for batch_no, (x, y) in enumerate(batches, start=1):
        x = np.asarray(x, dtype=theano.config.floatX)
        # `y` has one row per sample and one column per target, so select
        # columns; `y[0]` / `y[1]` picked the first two samples instead.
        # Assumes both columns already hold integer codes — TODO confirm.
        gender_train(x, y[:, 0].astype("int32"))
        accent_train(x, y[:, 1].astype("int32"))
        curr += BATCH_SIZE
        if batch_no % EVAL_EVERY_BATCHES == 0:
            g_l, a_l = evaluate_on_test()
        clear_output()
        print("Epoch: ", epoch)
        print("Gender accuracy: ", g_l)
        print("Accent accuracy: ", a_l)
        print(curr)

Epoch:  0
13350


KeyboardInterrupt: 

In [None]:
# Persist the learned parameters of both heads.
# `accent_param` / `gender_param` are lists of shared variables, not callables.
# `get_all_param_values` returns the current values as arrays, and `np.savez`
# stores arrays of differing shapes in one archive.
# The files are written as "accent_weights.npz" and "gender_weights.npz".
np.savez("accent_weights", *lasagne.layers.get_all_param_values(accent_out))
np.savez("gender_weights", *lasagne.layers.get_all_param_values(gender_out))

## Result

In [None]:
# Final accuracy of both heads on the held-out test vectors.
# `input_X` was never defined; the test inputs are `test_X`. `predict` returns
# [gender output, accent output], and the symbolic loss expressions cannot be
# called on arrays, so the metrics are computed with NumPy.
predicted_Y = predict(np.asarray(test_X, dtype=theano.config.floatX))
gender_scores, accent_scores = predicted_Y

# Assumes gender labels are 0/1 and accent labels are integer class indices — TODO confirm.
gender_hits = (np.asarray(gender_scores).ravel() > 0.5).astype(int) == np.asarray(test_Y[0]).astype(int)
accent_hits = np.argmax(accent_scores, axis=1) == np.asarray(test_Y[1])

print("Gender accuracy: ", np.mean(gender_hits))
print("Accent accuracy: ", np.mean(accent_hits))