In [65]:
import numpy as np
import pandas as pd
import librosa
import warnings
warnings.filterwarnings("ignore")
RND=42

In [2]:
# загружаем файл с названиями файлов и метками
meta = pd.read_csv('meta.txt', sep='\t', header=None) 
meta.columns = ['f_name','scene','start','end','label']

# столбцы scene и start у всех объектов одинаковые, поэтому удалим их
meta.drop(['scene','start'], axis=1, inplace=True)

# следующее необходимо, так как из тестовой выборки метки парсятся из названий файлов
mask = (meta.label == 'knocking_door')
meta.loc[mask, 'label'] = 'knocking'

In [159]:
meta.head()

Unnamed: 0,f_name,end,label
0,background_0001.wav,9.4585,background
1,background_0001_time_stretch_0.wav,9.98907,background
2,background_0001_time_stretch_1.wav,9.87941,background
3,background_0001_time_stretch_10.wav,11.156553,background
4,background_0001_time_stretch_11.wav,13.261587,background


Выборка несбалансированная:

In [160]:
meta.label.value_counts()

door          3416
tool          1659
knocking      1656
bags          1236
keyboard      1225
background    1126
ring           713
speech         276
Name: label, dtype: int64

In [3]:
def extract_feature(file_name, path):
    X, sample_rate = librosa.load(path + file_name)
    
    # short-time Fourier transform
    stft = np.abs(librosa.stft(X)) 
    # mel-frequency cepstral coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0) 
    # chromagram from a waveform
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0) 
    # mel-scaled spectrogram
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0) 
    # spectral contrast 
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0) 
    # tonal centroid features
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0) 
    
    return mfccs, chroma, mel, contrast, tonnetz

Loading dataset and feature extraction

In [5]:
from sklearn.preprocessing import LabelEncoder

features = []

for i in range(meta.shape[0]):
    fn = meta.iloc[i].f_name
    mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn, 'audio/') #загружаем из папки ./audio
    features.append(np.hstack([mfccs, chroma, mel, contrast, tonnetz]))

X_train = pd.DataFrame(features)

encode = LabelEncoder()
y_train = encode.fit_transform(meta.label)



In [6]:
import os, re
test = os.listdir('test/')
X_test = []

for filename in test:
    mfccs, chroma, mel, contrast, tonnetz = extract_feature(filename, directory='test/')
    X_test.append(np.hstack([mfccs, chroma, mel, contrast, tonnetz]))

X_test = pd.DataFrame(X_test)

In [7]:
y_test = []

for filename in test:
    y_test.append(re.split('_', filename)[0])

unk_id = y_test.index('unknown') # accuracy по закрытой задаче можно оценить
y_test = encode.transform(y_test[:unk_id])

In [152]:
# будем записывать результаты в таблицу

from sklearn.metrics import accuracy_score
score_table = []

def score(model, name):
    score_table.append((name, accuracy_score(y_test, model.predict(X_test)[:unk_id])))

RandomForest

In [155]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=RND)
rfc.fit(X_train, y_train)
score(rfc, 'rfc')

K-nearest neighbours

In [156]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
score(knn, 'knn')

Log-regression

In [157]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=0.05, solver='lbfgs', 
                            class_weight= 'balanced', max_iter=1000, 
                            random_state=RND, multi_class='multinomial')
logreg.fit(X_train, y_train)
score(logreg, 'logreg')

Simple perceptron

In [158]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=RND, activation='relu', max_iter=1000, solver='lbfgs')
mlp.fit(X_train, y_train)
score(mlp, 'mlp')

Выберем лучшую модель и получим результат:

In [173]:
score_table = pd.DataFrame(score_table, columns=['model', 'accuracy'])
score_table

Unnamed: 0,model,accuracy
0,rfc,0.866808
1,rfc,0.866808
2,knn,0.807611
3,logreg,0.830867
4,mlp,0.856237


In [147]:
best = score_table.loc[score_table['accuracy'].idxmax()].model
answer = pd.DataFrame(columns=['fn', 'proba', 'label'])
answer.fn = test
proba = eval(best + '.predict_proba(X_test)')

In [150]:
i = 0
for prob in proba:
    answer.loc[i,'proba'] = max(prob)
    answer.loc[i,'label'] = encode.inverse_transform(np.argmax(prob))
    if answer.loc[i,'label'] == 'knocking':
        answer.loc[i,'label'] = 'knocking_door'
    i += 1

Запишем результат в result.txt

In [62]:
with open("result.txt", "w") as text_file:
    text_file.write(answer.to_csv(sep='\t', na_rep='', header=False, index=False))