In [1]:
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
def get_X_Y(df,target_sex,isTrain=True):
    if isTrain:
        Y = df.Country - 2 
        X = np.array([mfcc[i] for i in df.Id])
    
        if target_sex: 
            Y = df.Sex
            X_train, X_val, y_train, y_val = train_test_split(X,Y,test_size=0.1, random_state=45)
            return X_train, X_val, y_train, y_val
    
        X_train, X_val, y_train, y_val = train_test_split(X,Y,test_size=0.1, random_state=45)
        return X_train, X_val, y_train, y_val

    return np.array([mfcc[i] for i in df.Id])
    
def load_data(data_path, mfcc_path):    
    df = pd.read_csv(data_path)    
    with open(mfcc_path, 'rb') as fp:
        mfcc = pickle.load(fp)
    return df,mfcc


def train_model(model, X_train, X_test, y_train, y_test,target_sex=True):
    optimiser = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    if target_sex:
        model.compile(optimizer=optimiser,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=30, verbose=0)
        

    model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=30, verbose=0)

In [3]:
def fnn_model_with_dropout(input_shape,output=1,function='relu',dropout=0.2):
    model = keras.Sequential() 
    model.add(keras.layers.Flatten(input_shape=input_shape))
    model.add(keras.layers.Dense(256, activation=function))
    model.add(keras.layers.Dense(64, activation=function))
    model.add(keras.layers.Dropout(dropout))
    model.add(keras.layers.Dense(output, activation='softmax'))
    return model

In [4]:
def cnn_model(input_shape,output=1):
    model = keras.Sequential()

    # 1st conv layer
    model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # 2nd conv layer
    model.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
    model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # 3rd conv layer
    model.add(keras.layers.Conv2D(32, (2, 2), activation='relu'))
    model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # flatten output and feed it into dense layer
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dropout(0.3))

    # output layer
    model.add(keras.layers.Dense(output, activation='softmax'))

    return model

In [5]:
path_train_csv = '/content/drive/MyDrive/tarea/dataset/dataset_aumentado/Train.csv'
path_train_mfcc = '/content/drive/MyDrive/tarea/dataset/dataset_aumentado/mfcc_train.json'
path_test_csv = '/content/drive/MyDrive/tarea/dataset/dataset_aumentado/Test.csv'
path_test_mfcc = '/content/drive/MyDrive/tarea/dataset/dataset_aumentado/mfcc_test.json'

## Train for sex target

In [6]:
## Training model
df,mfcc = load_data(path_train_csv,path_train_mfcc) 
cnn = cnn_model((61,13,1),1)
X_train, X_val, y_train, y_val = get_X_Y(df,target_sex=True,isTrain=True)
train_model(cnn, X_train, X_val, y_train, y_val,target_sex=True)

## Prediction model
df,mfcc = load_data(path_test_csv,path_test_mfcc)
X_test = get_X_Y(df,target_sex=True,isTrain=False)
y_predict_sex = cnn.predict(X_test)


## Train for country target

In [7]:
## Training model
df,mfcc = load_data(path_train_csv,path_train_mfcc) 
cnn = cnn_model((61,13,1),5)
X_train, X_val, y_train, y_val = get_X_Y(df,target_sex=False,isTrain=True)
train_model(cnn, X_train, X_val, y_train, y_val,target_sex=False)

## Prediction model
df,mfcc = load_data(path_test_csv,path_test_mfcc)
X_test = get_X_Y(df,target_sex=False,isTrain=False)
y_predict_country = cnn.predict(X_test)
y_predict_country = [np.argmax(x)+2 for x in y_predict_country]

## Dataframe for submit

In [8]:
expected = [f'{sex[0]} {country}' for sex,country in zip(y_predict_sex.astype(int),y_predict_country)]
df_submit = pd.DataFrame(df.Id,columns=['Id'])
df_submit = df_submit.assign(Expected=expected)
df_submit

Unnamed: 0,Id,Expected
0,00005132946.wav,1 2
1,00010648027.wav,1 3
2,00012996552.wav,1 2
3,00017540976.wav,1 4
4,00020418681.wav,1 3
...,...,...
595,02125896593.wav,1 2
596,02127267817.wav,1 5
597,02132921902.wav,1 3
598,02143867783.wav,1 2


In [11]:
df_submit.to_csv('/content/drive/MyDrive/tarea/submission.csv',index=False) 