# Real word V2V

This notebook shows diversity of V2V usage

In [1]:
%env THEANO_FLAGS="device=gpu4"

env: THEANO_FLAGS="device=gpu4"


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from PIL import Image

from librosa import load, logamplitude
from librosa.feature import melspectrogram

import os
import shutil

from IPython.display import clear_output

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [2]:
import sys
sys.path.append("../clear")
from CoolSoundNetwork import Network

## Dataset preparation

In [5]:
df = pd.DataFrame.from_csv("../../data/pronuns.csv")

In [6]:
enc = LabelEncoder()
encoded = enc.fit_transform(df.accent)
df["coded"] = encoded

### Vectorizing voices

In [7]:
network = Network(vectorizer_weights_file_name='../../data/vectorizer_weights.npy', load_similar_weights=False)

In [168]:
women_test = list(df[df.gender == False].pronun_id)[:500]
men_test = list(df[df.gender == True].pronun_id)[:500]

data_path = "../../data/images/"
output_path = "../../data/vectors/"

test = women_test + men_test
np.random.shuffle(np.array(test))
train = df[~df.pronun_id.isin(test)]

for t in test:
    img = Image.open(data_path+t+".png")
    vectors = network.vectorizer(voice_array=np.array(img))
    for index, vector in enumerate(vectors):
        np.save(output_path+"test/"+t+"_"+str(index), vector)

for tr in train.pronun_id:
    img = Image.open(data_path+t+".png")
    vectors = network.vectorizer(voice_array=np.array(img))
    for index, vector in enumerate(vectors):
        np.save(output_path+tr+"_"+str(index), vector)

In [4]:
train = []
test = []
for f in os.listdir("../../data/vectors"):
    train.append(f.split("_")[0])

for f in os.listdir("../../data/vectors/test"):
    test.append(f.split("_")[0])
train = np.unique(train)
test = np.unique(test)

In [7]:
train_df = df[df.pronun_id.isin(train)]
test_df = df[df.pronun_id.isin(test)]

In [39]:
train_df = train_df.set_index("pronun_id")
test_df = test_df.set_index("pronun_id")

### Filter out small classes

In [None]:
category = "accent"
final_categorys = []

min_num = 100

for cl, num_of_inst in enumerate(df[category].value_counts()):
    if num_of_inst >= min_num:
        final_categorys.append(cl)

In [None]:
df = df[df[category].isin(final_categorys)]

In [23]:
accent_count = len(df.accent.unique())

### Filtering dataset

In [175]:
count = 0
recordings = pd.Series(index=df.pronun_id.unique())
for file in tqdm(os.listdir("../../data/vectors/test")):
    name = file.split("_")[0]
    if name == "test":
        continue
    try:
        recordings[name]
    except:
        os.remove("../../data/vectors/test/"+file)




### Making test

In [14]:
test_size = 1000

temp_df = df.set_index("pronun_id")

directory = os.listdir("../../data/vectors/")

men_counter = 0
women_counter = 0

for file in directory:
    if men_counter >= test_size and women_counter >= test_size:
        break
    if temp_df.loc[file.split("_")[0]].gender and men_counter < test_size:
        shutil.move("../../data/vectors/"+file, "../../data/vectors/test/"+file)
        men_counter += 1
    elif women_counter < test_size:
        shutil.move("../../data/vectors/"+file, "../../data/vectors/test/"+file)
        women_counter += 1

del temp_df

In [177]:
test_df = df[df.pronun_id.isin(test)]

In [122]:
test_df = df[df.pronun_id.isin(list(map(lambda x: x.split("_")[0], os.listdir("../../data/vectors/test/"))))]
test_df = test_df.set_index("pronun_id")

In [123]:
t = df[~df.pronun_id.isin(test_df)]

In [23]:
women = t[t.gender == False].pronun_id
import shutil
counter = 0
dire = os.listdir("../../data/vectors/")
for f in dire:
    if f.split("_")[0] in women.values:
        shutil.move("../../data/vectors/"+f, "../../data/vectors/test/"+f)
        counter += 1
        if counter == 200:
            break

In [131]:
import glob

test_X = []
test_Y = np.array(test_df.gender)

for index in test_df.index:
    files = glob.glob("../../data/vectors/test/"+index+"*")
    test_X.append(np.load(files[0]))

In [42]:
############
test_X = []
test_Y = []

for f in os.listdir("../../data/vectors/test"):
    name = f.split("_")[0]
    test_X.append(np.load("../../data/vectors/test/"+f))
    test_Y.append(test_df.loc[name].gender)

## Networks architecture

In [15]:
import theano
import theano.tensor as T

import lasagne

In [16]:
input_tensor = T.matrix("Vector input")
target_gender = T.ivector("Target gender")
target_accent = T.ivector("Target acent")

In [17]:
input_ = lasagne.layers.InputLayer((None, 300), input_var=input_tensor, name="Network input")
batch_norm0 = lasagne.layers.batch_norm(input_, name="Batch normalization")

### GenderNet

In [18]:
gend_dense0 = lasagne.layers.DenseLayer(batch_norm0, 50, name="Dense 0")
gend_dense1 = lasagne.layers.DenseLayer(gend_dense0, 20, name="Dense 1")
gender_out = lasagne.layers.DenseLayer(gend_dense1, 1, nonlinearity=lasagne.nonlinearities.sigmoid, name="Output")

In [19]:
gender_out.output_shape

(None, 1)

In [20]:
gender_predicted = lasagne.layers.get_output(gender_out)
gender_param = lasagne.layers.get_all_params(gender_out, trainable=True)

In [21]:
gender_loss = lasagne.objectives.binary_crossentropy(gender_predicted, target_gender).mean()

gender_updates = lasagne.updates.adagrad(gender_loss, gender_param)


### AccentNet

In [24]:
accent_dense0 = lasagne.layers.DenseLayer(batch_norm0, 200, name="Dense 0")
accent_dense1 = lasagne.layers.DenseLayer(accent_dense0, 100, name="Dense 1")
accent_out = lasagne.layers.DenseLayer(accent_dense1, accent_count,
                                       nonlinearity=lasagne.nonlinearities.softmax, name="Accent output")

In [25]:
accent_out.output_shape

(None, 111)

In [26]:
accent_predicted = lasagne.layers.get_output(accent_out)
accent_param = lasagne.layers.get_all_params(accent_out, trainable=True)

In [27]:
accent_loss = lasagne.objectives.binary_crossentropy(accent_predicted, target_accent).mean()

accent_updates = lasagne.updates.adagrad(accent_loss, accent_param)

### Compilation

In [49]:
gender_train = theano.function([input_.input_var, target_gender], updates=gender_updates)
accent_train = theano.function([input_.input_var, target_accent], updates=accent_updates)

gender_predict = theano.function([input_.input_var], gender_predicted)
accent_predict = theano.function([input_.input_var], accent_predicted)

predict = theano.function([input_.input_var], [gender_predicted, accent_predicted])

print_gender_param = theano.function([], gender_param)

## Training

In [47]:
def iterate_minibatches(input_path, output_df, batchsize, index_col="pronun_id",
                            target_names=["gender"],shuffle=True):
    input_files = np.array(list(map(lambda x: input_path+x, os.listdir(input_path))))
    input_files = np.array(list(filter(lambda x: os.path.isfile(x), input_files)))
    #output_df = output_df.set_index(index_col)
    targets = output_df[target_names]
    del output_df
    if shuffle:
        indices = np.arange(input_files.size)
        np.random.shuffle(indices)
    for start_idx in range(0, len(input_files) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        inputs = []
        rec_targ = []
        for inp in input_files[excerpt]:
            naming = inp.split("_")[0].split("/")
            name = naming[len(naming)-1]
            inputs.append(np.load(inp))
            rec_targ.append(targets.loc[name])
        yield np.array(inputs), np.array(rec_targ)

In [63]:
EPOCH = 5
size = df.pronun_id.size

gender_pred, accent_pred = predict(test_X)

for epoch in range(EPOCH):
    curr = 0
    for x, y in iterate_minibatches("../../data/vectors/", train_df, 150, target_names=["gender"]):
        gender_train(x, list(map(lambda x:x[0], y)))
        curr+= 150
        clear_output()
        print("Epoch: ", epoch)
        if curr % 100 == 0:
            gender_pred, accent_pred = predict(test_X)
        print("Gender accuracy: ", roc_auc_score(test_Y, gender_pred))
        print(curr)    

Epoch:  0
Gender accuracy:  0.499554909297
142800


KeyboardInterrupt: 

In [64]:
print_gender_param()[6]

array([[ 0.56398916],
       [ 0.17789808],
       [ 0.66043985],
       [ 0.0262416 ],
       [ 0.26344365],
       [ 0.61810315],
       [ 0.62714779],
       [ 0.66823637],
       [ 0.41940391],
       [ 0.27778909],
       [ 0.19782983],
       [ 0.69163442],
       [ 0.27159095],
       [ 0.41585603],
       [ 0.68751299],
       [ 0.32484204],
       [ 0.09851819],
       [ 0.33192056],
       [ 0.90256232],
       [ 0.6444571 ]], dtype=float32)

In [None]:
np.save("accent_weights", accent_param())
np.save("gender_weights", gender_param())

## Result

In [None]:
predicted_Y = predict(input_X)
print("Gender accuracy: ", gender_loss(predicted_Y, test_Y[0]))
print("Accent accuracy: ", accent_loss(predicted_Y, test_Y[1]))