In [79]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# librerias personalizadas
from libraries.Corte_audio import Corte_audio
from libraries.ProcessAudio import ProcessAudio

# preprocesando los datos
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# para guardar el preprocesador de datos
import pickle

# Modelado
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Evaluacion
import sklearn.metrics as metrics

# quitar alertas innecesarias
import warnings

warnings.filterwarnings('ignore')

In [80]:
# Base output directory. Every artifact path in the config cell is built by plain string
# concatenation (FOLDER_OUT + "<file name>"), so the trailing slash must be kept.
FOLDER_OUT = "/kaggle/output/working/"

In [81]:
# CONFIG

# Fraction of the variance that the PCA step must retain.
# TODO: justify why this particular value was chosen.
MINIMA_VARIANA_EXPLICADA = 0.93
# TODO: justify why the split is 2 (not referenced by the cells shown in this notebook).
TIME_SPLIT = 2

# Artifact paths (FOLDER_OUT already ends with a path separator)
FOLDER_SAVE_NORMALIZADOR_PCA = f"{FOLDER_OUT}normalizador_pca.pkl"
FOLDER_MODEL = f"{FOLDER_OUT}randomforest.pkl"

FOLDER_TRAIN_DATA = f"{FOLDER_OUT}train_data.csv"
FOLDER_TRAIN_LABEL = f"{FOLDER_OUT}train_label.csv"
FOLDER_TEST_DATA = f"{FOLDER_OUT}test_data.csv"
FOLDER_TEST_LABEL = f"{FOLDER_OUT}test_label.csv"

# Model
# NOTE(review): the train/validation split cell hard-codes its own test_size instead of reading this.
SPLIT_DATA_TRAIN = 0.2

In [82]:
# Instantiate the shared helper objects

# Audio splitter; files_to_data calls its split_data() method.
cortador = Corte_audio()

# Standardize the features, then apply PCA. A float n_components in (0, 1) asks PCA to keep
# as many components as needed to explain at least that fraction of the variance.
pca_pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=MINIMA_VARIANA_EXPLICADA),
)

In [83]:
# Index the input files into train and test sets (a 'data' wav and a 'label' csv per recording)

def indexar_archivos(raiz):
    """Walk ``raiz`` and pair every recording's wav file with its csv label file.

    A file is assigned to the train set when the directory that contains it has
    "train" anywhere in its path, and to the test set otherwise. Only files whose
    extension is exactly ``.wav`` or ``.csv`` are indexed; everything else
    (including names without an extension) is skipped.

    Parameters
    ----------
    raiz : str
        Root directory to scan recursively.

    Returns
    -------
    tuple[dict, dict]
        ``(train, test)``. Each maps the file name without its extension to a dict
        holding the full path under ``'data'`` (wav) and/or ``'label'`` (csv).
    """
    rol_por_extension = {".csv": "label", ".wav": "data"}
    train, test = {}, {}

    for dirname, _, filenames in os.walk(raiz):
        # `in` also matches a path that *starts* with "train", which `find(...) > 0` missed.
        destino = train if "train" in dirname else test
        for filename in filenames:
            # splitext tolerates names with no dot or with several dots ("rec.v2.wav"),
            # where unpacking filename.split(".") into two names raised ValueError.
            name_file, extension = os.path.splitext(filename)
            rol = rol_por_extension.get(extension)
            if rol is None:
                continue
            destino.setdefault(name_file, {})[rol] = os.path.join(dirname, filename)

    return train, test


TRAIN, TEST = indexar_archivos('/kaggle/input')

# print(TRAIN)
# print()
# print(TEST)

In [84]:
# Open every audio file, split it into segments and extract features from each segment

# Sorted vocabulary of every label seen so far; position i is column i of the label vectors.
# Kept as a tuple at all times so files_to_data can be called (or its cell re-run) repeatedly.
TODOS_LABEL = ()


def files_to_data(diccionario_datos, buscar_todos_label: bool = False, save_new_data: bool = False):
    """Turn an index of audio/label files into a feature matrix and a multi-hot label matrix.

    Each recording is split with ``cortador.split_data``; every resulting segment is passed
    through ``ProcessAudio.get_all`` and contributes one row to the output.

    Parameters
    ----------
    diccionario_datos : dict
        Maps a recording name to a dict with the keys ``'data'`` and ``'label'``, whose
        values are forwarded to ``cortador.split_data``. A missing key raises ``KeyError``.
    buscar_todos_label : bool, default False
        When True, every label found in this call is merged into the module-level
        ``TODOS_LABEL`` vocabulary (kept sorted). When False the existing vocabulary is
        used unchanged, and labels that are not part of it are silently ignored.
    save_new_data : bool, default False
        Accepted for backward compatibility; it currently has no effect.

    Returns
    -------
    tuple[numpy.ndarray, numpy.ndarray]
        ``(DATA, LABEL)`` as float arrays with one row per segment. ``LABEL`` has one
        column per entry of ``TODOS_LABEL``, set to 1 where the segment carries that label.
    """
    global TODOS_LABEL
    DATA = []
    LABEL = []

    for name_file, value in diccionario_datos.items():
        print(name_file)
        muestras_wav, instrumentos, rate = cortador.split_data(value['data'], value['label'])

        processAudio = ProcessAudio(rate)
        for id_audio, dat in enumerate(muestras_wav):
            processAudio.set_data(dat)
            caracteristicas = processAudio.get_all(id_audio)
            # The first element is dropped; the rest is the feature vector
            # (26 values per segment according to the notebook's recorded output).
            DATA.append(caracteristicas[1:])
            LABEL.append(instrumentos[id_audio])

    # Extend the label vocabulary. Building it from a set (instead of mutating a dict that
    # was later rebound to a tuple) keeps repeated calls from raising TypeError.
    if buscar_todos_label:
        vistos = set(TODOS_LABEL)
        for lab in LABEL:
            vistos.update(lab)
        TODOS_LABEL = tuple(sorted(vistos))

    # Expand each segment's label list into its multi-hot vector.
    indice = {etiqueta: posicion for posicion, etiqueta in enumerate(TODOS_LABEL)}
    for j, lab in enumerate(LABEL):
        new_label = [0] * len(TODOS_LABEL)
        for la in lab:
            if la in indice:
                new_label[indice[la]] = 1
        LABEL[j] = new_label

    # Convert to numpy
    DATA = np.array(DATA, dtype=float)
    LABEL = np.array(LABEL, dtype=float)

    return DATA, LABEL

In [85]:
def save_data(data_save, file):
    """Write ``data_save`` to ``file`` as CSV by way of a pandas DataFrame.

    The DataFrame's default index and column labels are written too, so the
    resulting file starts with a header row and an index column.
    """
    pd.DataFrame(data=data_save).to_csv(file)

In [86]:
# Build the training feature/label matrices. buscar_todos_label=True also fills the
# TODOS_LABEL vocabulary that later calls to files_to_data encode against.
DATA, LABEL = files_to_data(TRAIN, buscar_todos_label=True)

# Sanity check: features and labels must have the same number of rows (one per segment).
print(len(DATA), len(LABEL))

1727
1728
1729
1730
1733
414 414


In [87]:
# Fit the scaler + PCA pipeline on the training features and persist it
pca_pipe.fit(DATA)
# Context manager: the handle is flushed and closed even if pickling fails.
with open(FOLDER_SAVE_NORMALIZADOR_PCA, 'wb') as archivo:
    pickle.dump(pca_pipe, archivo)

In [88]:
# Reload the persisted pipeline and project the training features with it.
# pickle.load can execute code embedded in the file: only load artifacts written by this notebook.
with open(FOLDER_SAVE_NORMALIZADOR_PCA, 'rb') as archivo:
    normalizador_pca = pickle.load(archivo)
x_for_model = normalizador_pca.transform(X=DATA)
# NOTE(review): x_for_model only feeds the size report in the next cell; the classifier cells
# train and predict on the untransformed DATA. Confirm whether PCA was meant to be used.

In [89]:
# Report how many components the PCA kept for the configured explained-variance threshold.
print(f"Original vector size: {len(DATA[0])} -> New vector size {len(x_for_model[0])} ({int(MINIMA_VARIANA_EXPLICADA*100)}% información mantenida)")

Original vector size: 26 -> New vector size 12 (93% información mantenida)


In [90]:
# Guardando datos
save_data(DATA, FOLDER_TRAIN_DATA)
save_data(LABEL, FOLDER_TRAIN_LABEL)

In [91]:
def mean(lst):
    """Arithmetic mean of ``lst`` truncated (not rounded) to two decimal places."""
    return int((sum(lst) / len(lst)) * 100) / 100


def calcular_porcentajes_aciertos(y_f, y_t):
    """Per-column agreement between two label matrices, as truncated integer percentages.

    Parameters
    ----------
    y_f : array with a 2-D ``shape``
        First matrix; its shape decides which rows and columns are compared.
    y_t : indexable as ``y_t[row][col]``
        Second matrix, compared element by element against ``y_f``.

    Returns
    -------
    tuple[dict, dict, str]
        ``(verdaderos, falsos, promedio)``: for each column index, the percentage of rows
        where both matrices agree and where they differ (each truncated to an int), plus
        the mean of the agreement percentages formatted with a trailing ``%``.
    """
    n_filas, n_columnas = y_f.shape[0], y_f.shape[1]

    aciertos = [0] * n_columnas
    fallos = [0] * n_columnas
    for fila in range(n_filas):
        for columna in range(n_columnas):
            if y_f[fila][columna] == y_t[fila][columna]:
                aciertos[columna] += 1
            else:
                fallos[columna] += 1

    # Counts -> percentage of the number of rows, truncated towards zero.
    verdaderos = {columna: int(aciertos[columna] * 100 / n_filas) for columna in range(n_columnas)}
    falsos = {columna: int(fallos[columna] * 100 / n_filas) for columna in range(n_columnas)}

    promedio = str(mean(list(verdaderos.values()))) + "%"
    return verdaderos, falsos, promedio

In [92]:
# Seed shared by the cross-validation folds and the forest so a re-run gives the same model.
seed = 1
# With an empty param_grid the search has a single candidate: it cross-validates the
# default-parameter forest and exposes it as best_estimator_.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed),
    param_grid={},
    cv=KFold(n_splits=10, shuffle=True, random_state=seed),
)

In [93]:
# Hold out 10% of the training segments for validation; seeded so the split is reproducible.
# NOTE(review): rows are segments cut from the same recordings, so a random row-level split
# puts pieces of one recording on both sides. Validation scores are likely optimistic compared
# with the test files; consider splitting by recording instead.
X_train, X_valid, y_train, y_valid = train_test_split(DATA, LABEL, test_size=0.1, random_state=seed)  # 0.2

In [94]:
# Train: run the cross-validated search, keep the refitted best estimator and persist it
grid.fit(X_train, y_train)
model = grid.best_estimator_
# Context manager: the handle is flushed and closed even if pickling fails.
with open(FOLDER_MODEL, 'wb') as archivo:
    pickle.dump(model, archivo)

In [95]:
# Validation metrics. The targets are multi-hot vectors, so accuracy_score is the exact-match
# (subset) accuracy per segment, while precision is micro-averaged over all label columns.
y_final = model.predict(X_valid)
print("ACC", metrics.accuracy_score(y_valid, y_final))
print("PREC", metrics.precision_score(y_valid, y_final, average='micro'))

ACC 0.8333333333333334
PREC 0.9341317365269461


In [96]:
# TEST THE MODEL
# pickle.load can execute code embedded in the file: only load models written by this notebook.
with open(FOLDER_MODEL, 'rb') as archivo:
    model = pickle.load(archivo)

# buscar_todos_label is left False so the test labels are encoded with the vocabulary
# collected from the training files.
# NOTE(review): this rebinds DATA/LABEL, so the training matrices are no longer reachable
# under those names after this cell runs.
DATA, LABEL = files_to_data(TEST)

1759
1819
2106


In [97]:
# Guardando datos
save_data(DATA, FOLDER_TEST_DATA)
save_data(LABEL, FOLDER_TEST_LABEL)

In [98]:
# Test-set metrics, computed the same way as on the validation split: exact-match accuracy
# per segment and micro-averaged precision. DATA/LABEL hold the TEST set here.
y_final = model.predict(DATA)
print("ACC", metrics.accuracy_score(LABEL, y_final))
print("PREC", metrics.precision_score(LABEL, y_final, average='micro'))

ACC 0.33884297520661155
PREC 0.4053156146179402
