In [1]:
%env THEANO_FLAGS="device=gpu4"

env: THEANO_FLAGS="device=gpu4"


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from PIL import Image

from librosa import load, logamplitude
from librosa.feature import melspectrogram

import os

from IPython.display import clear_output

In [3]:
import sys
sys.path.append("../clear")

In [3]:
from CoolSoundNetwork import Network

In [4]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column becomes the index, date-like index values are parsed).
df = pd.read_csv("../../data/pronuns.csv", index_col=0, parse_dates=True)

In [5]:
df.head()

Unnamed: 0,pronun_rank,_id,visits,gender,word_rank,user,accent,votes,pronun_id,pronuns,best_pronuns,word,when_word_added,global_listenings
0,662,585fc620698f824ee334a626,61503,False,1564,mariad,Spain,0,585fda10698f828c848d862d,573,199,0_zero,2010-04-25,192
1,21737,585fc7f4698f824ee334afc3,303426,True,14,Wojtula,United States,0,585fda11698f828c848d862e,7,0,11_jedenaście,2013-05-18,638
2,9,585fc5e7698f824ee334a4ea,162426,False,998,usako_usagiclub,Japan,1,585fda11698f828c848d862f,25504,0,１１９番,2015-06-06,743
3,228,585fc8e7698f824ee334b4d5,60338,True,13400,SeanMauch,United States,0,585fda11698f828c848d8630,1765,0,12,2008-07-10,57K
4,153,585fc854698f824ee334b1ca,342195,False,641,anakat,United States,0,585fda11698f828c848d8631,2851,969,12,2008-07-10,57K


In [5]:
# Number of distinct values in the accent column; used as the width of the
# accent softmax layer.
accent_count = len(pd.unique(df["accent"]))

## Deep Learning

In [6]:
import theano
import theano.tensor as T

import lasagne

ERROR (theano.sandbox.cuda): ERROR: Not using GPU. Initialisation of device 4 failed:
initCnmem: cnmemInit call failed! Reason=CNMEM_STATUS_OUT_OF_MEMORY. numdev=1

Using gpu device 0: GeForce GTX 1080 (CNMeM is enabled with initial size: 45.0% of memory, cuDNN 5105)


In [7]:
# Symbolic Theano variables shared by the whole graph. The string argument is
# only the variable's display name.
input_tensor = T.matrix("Vector input")  # 2-D float input fed to the InputLayer
target_gender = T.ivector("Target gender")  # 1-D int32 targets for the gender head
target_accent = T.ivector("Target acent")  # 1-D int32 targets for the accent head

In [8]:
# Input layer bound to input_tensor, declared with shape (100, 100).
# NOTE(review): the training loop further down requests batches of 150 while
# the first dimension here is fixed at 100 — confirm this is intended.
input_ = lasagne.layers.InputLayer((100, 100), input_var=input_tensor, name="Network input")
# Batch normalization applied directly to the input; both the gender and the
# accent heads are built on top of this layer.
batch_norm0 = lasagne.layers.batch_norm(input_, name="Batch normalization")

### Gender

In [9]:
# Gender head: two dense layers (50 and 20 units) on top of batch_norm0,
# followed by a single sigmoid output unit.
dense0 = lasagne.layers.DenseLayer(batch_norm0, 50, name="Dense 0")
dense1 = lasagne.layers.DenseLayer(dense0, 20, name="Dense 1")
gender_out = lasagne.layers.DenseLayer(dense1, 1, nonlinearity=lasagne.nonlinearities.sigmoid, name="Output")

In [10]:
gender_out.output_shape

(100, 1)

In [11]:
# Symbolic output expression of the gender head.
gender_predicted = lasagne.layers.get_output(gender_out)
# Trainable parameters of every layer feeding gender_out (used by the optimizer).
gender_param = lasagne.layers.get_all_params(gender_out, trainable=True)

In [12]:
# gender_out produces a (batch, 1) matrix while target_gender is a 1-D vector.
# Flatten the prediction to (batch,) so the element-wise cross-entropy pairs
# each prediction with its own target instead of relying on broadcasting
# between a column and a vector.
gender_loss = lasagne.objectives.binary_crossentropy(
    gender_predicted.flatten(), target_gender
).mean()

# Adagrad update rules for all trainable parameters of the gender head.
gender_updates = lasagne.updates.adagrad(gender_loss, gender_param)

### Accent

In [13]:
# Accent head: one 200-unit dense layer on top of batch_norm0, followed by a
# softmax over accent_count classes.
# NOTE: this rebinds `dense0`, which the gender cell above also assigns; after
# this cell the name refers to the accent head's layer.
dense0 = lasagne.layers.DenseLayer(batch_norm0, 200, name="Dense 0")
accent_out = lasagne.layers.DenseLayer(dense0, accent_count,
                                       nonlinearity=lasagne.nonlinearities.softmax, name="Accent output")

In [14]:
accent_out.output_shape

(100, 111)

In [15]:
# Symbolic output expression of the accent head (softmax output).
accent_predicted = lasagne.layers.get_output(accent_out)
# Trainable parameters of every layer feeding accent_out (used by the optimizer).
accent_param = lasagne.layers.get_all_params(accent_out, trainable=True)

In [16]:
# accent_out is a softmax over accent_count classes and target_accent holds one
# integer class index per row, so the matching objective is categorical
# cross-entropy (binary cross-entropy would compare the (batch, classes)
# matrix element-wise against the index vector).
accent_loss = lasagne.objectives.categorical_crossentropy(accent_predicted, target_accent).mean()

# Adagrad update rules for all trainable parameters of the accent head.
accent_updates = lasagne.updates.adagrad(accent_loss, accent_param)

### Compile

In [17]:
# Training steps: apply one Adagrad update and return the loss of the batch so
# progress can be monitored (previously the functions returned nothing).
gender_train = theano.function([input_.input_var, target_gender], gender_loss, updates=gender_updates)
accent_train = theano.function([input_.input_var, target_accent], accent_loss, updates=accent_updates)

# Inference expressions: deterministic=True makes batch normalization use its
# stored running statistics instead of the statistics of the batch being
# predicted, so a prediction does not depend on the other rows in the batch.
gender_test_out, accent_test_out = lasagne.layers.get_output(
    [gender_out, accent_out], deterministic=True
)

gender_predict = theano.function([input_.input_var], gender_test_out)
accent_predict = theano.function([input_.input_var], accent_test_out)

# Both heads in a single call: returns [gender_probabilities, accent_probabilities].
predict = theano.function([input_.input_var], [gender_test_out, accent_test_out])

### Vectorizing voices

In [25]:
# Vectorizer network from the project-local CoolSoundNetwork module.
# NOTE(review): load_weights=False is passed together with a weights file
# path — confirm whether the stored weights are actually used.
network = Network(load_weights=False, vec_weights_file_name='../../data/weights.npy')

In [54]:
# Turn every image in data_path into one or more vectors and store each vector
# as <image stem>_<index>.npy in output_path.
data_path = "../../data/images/"
output_path = "../../data/vectors/"
for im in tqdm(os.listdir(data_path)):
    # splitext removes only the extension. str.strip(".png") instead removes
    # any leading/trailing '.', 'p', 'n' or 'g' characters, which corrupts
    # stems that start or end with one of those letters.
    stem = os.path.splitext(im)[0]
    # Context manager closes the underlying file handle once the pixels are copied.
    with Image.open(os.path.join(data_path, im)) as img:
        voice_array = np.array(img)
    vectors = network.vectorizer(voice_array=voice_array)
    for index, vector in enumerate(vectors):
        np.save(os.path.join(output_path, stem + "_" + str(index)), vector)




### Training

In [74]:
# Filter dataset: delete vector files whose recording id (the file-name part
# before the first "_") does not appear in df.pronun_id.
vectors_dir = "../../data/vectors/"
known_ids = set(df.pronun_id.unique())
for file in tqdm(os.listdir(vectors_dir)):
    path = os.path.join(vectors_dir, file)
    if not os.path.isfile(path):
        # Sub-directories (e.g. the test/ folder) are not vector files.
        continue
    name = file.split("_")[0]
    # Explicit membership test instead of a bare `except:` around a lookup:
    # the bare except also caught KeyboardInterrupt and unrelated errors and
    # would then delete the file being inspected.
    if name not in known_ids:
        os.remove(path)

In [36]:
# Make test: rows of df whose recording id has at least one file in the
# test vectors directory.
test_dir_entries = os.listdir("../../data/vectors/test/")
test_ids = [entry.split("_")[0] for entry in test_dir_entries]
test = df[df.pronun_id.isin(test_ids)]

In [19]:
# Rows of df whose recording id is not already part of the test selection.
t = df[~df.pronun_id.isin(test.pronun_id)]

In [20]:
# Recording ids of the non-test rows where gender is False.
# NOTE(review): the variable name implies False encodes female speakers —
# confirm against the dataset description.
women = t[t.gender == False].pronun_id

In [37]:
test.gender.value_counts()

False    673
True     626
Name: gender, dtype: int64

In [35]:
import shutil

# Move vector files of recordings listed in `women` into the test directory,
# stopping after N_FILES_TO_MOVE files.
N_FILES_TO_MOVE = 200
# Set lookup instead of scanning the whole id array once per file.
women_ids = set(women.values)
counter = 0
dire = os.listdir("../../data/vectors/")
for f in dire:
    if f.split("_")[0] in women_ids:
        shutil.move("../../data/vectors/"+f, "../../data/vectors/test/"+f)
        counter += 1
        if counter == N_FILES_TO_MOVE:
            break

### Now I swear we are training

In [146]:
def iterate_minibatches(input_path, df, batchsize, index_col="pronun_id", target_names=["gender"],shuffle=True):
    """Yield ``(inputs, targets)`` minibatches built from saved ``.npy`` vectors.

    Every regular ``*.npy`` file in ``input_path`` is one sample. The part of
    its file name before the first ``"_"`` is the recording id, which is looked
    up in ``df[index_col]`` to fetch the target columns.

    Parameters
    ----------
    input_path : str
        Directory holding the vector files. Sub-directories and files without
        the ``.npy`` extension are ignored.
    df : pandas.DataFrame
        Table containing ``index_col`` and every column in ``target_names``.
    batchsize : int
        Number of samples per batch. A trailing incomplete batch is dropped.
    index_col : str
        Column of ``df`` that holds the recording ids.
    target_names : list of str
        Target columns to return for each sample.
    shuffle : bool
        If true, samples are visited in a random order (``np.random``).
        Otherwise they are visited in sorted file-name order.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Stacked input vectors with ``batchsize`` rows, and the matching targets
        with shape ``(batchsize, len(target_names))``.

    Raises
    ------
    KeyError
        If a file's recording id is not present in ``df[index_col]``.
    """
    target_columns = list(target_names)  # copy so the shared default list is never touched

    # Keep only real vector files: listdir also returns sub-directories
    # (e.g. a test/ folder), which np.load cannot read.
    file_names = np.array(sorted(
        entry for entry in os.listdir(input_path)
        if entry.endswith(".npy") and os.path.isfile(os.path.join(input_path, entry))
    ))

    targets = df.set_index(index_col)[target_columns]
    # One row per id so that every lookup returns exactly one target row.
    targets = targets[~targets.index.duplicated(keep="first")]

    if shuffle:
        order = np.arange(file_names.size)
        np.random.shuffle(order)

    for start_idx in range(0, len(file_names) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = order[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        batch_files = file_names[excerpt]
        # The id is parsed from the bare file name, never from the full path,
        # so an underscore inside input_path cannot corrupt it.
        record_ids = [file_name.split("_")[0] for file_name in batch_files]
        inputs = [np.load(os.path.join(input_path, file_name)) for file_name in batch_files]
        # Row lookup by label; plain targets[name] would select a column.
        yield np.array(inputs), targets.loc[record_ids].values

In [147]:
EPOCH = 5
BATCH_SIZE = 150

# gender_loss is a symbolic expression and cannot be called like a function;
# compile it once so the loss of a batch can be evaluated and printed.
gender_loss_fn = theano.function([input_tensor, target_gender], gender_loss)

curr = 0  # number of samples processed so far
for epoch in range(EPOCH):
    for x, y in iterate_minibatches("../../data/vectors/", df, BATCH_SIZE):
        x = np.asarray(x, dtype=theano.config.floatX)
        # target_gender is a 1-D int32 vector: take the single target column
        # (works for both 1-D and (batch, 1) target arrays) and cast it.
        y = np.asarray(y).reshape(len(y), -1)[:, 0].astype("int32")
        gender_train(x, y)
        curr += BATCH_SIZE
        clear_output()
        print("Epoch: ", epoch)
        # The printed value is the cross-entropy loss, not an accuracy.
        print("Loss: ", gender_loss_fn(x, y))
        print(curr)

Epoch:  0
13350


KeyboardInterrupt: 

In [None]:
# TODO: unfinished cell. The original content was the incomplete statement
# `from sklearn`, which is a SyntaxError; fill in the intended import,
# e.g. `from sklearn.<module> import <name>`.