In [1]:
import numpy as np
import pandas as pd
import librosa
import warnings
warnings.filterwarnings("ignore")
RND=42

In [2]:
# загружаем файл с названиями файлов и метками
meta = pd.read_csv('meta.txt', sep='\t', header=None) 
meta.columns = ['f_name','scene','start','end','label']

# столбцы scene и start у всех объектов одинаковые, поэтому удалим их
meta.drop(['scene','start'], axis=1, inplace=True)

# следующее необходимо, так как из тестовой выборки метки парсятся из названий файлов
mask = (meta.label == 'knocking_door')
meta.loc[mask, 'label'] = 'knocking'

In [3]:
meta.head()

Unnamed: 0,f_name,end,label
0,background_0001.wav,9.4585,background
1,background_0001_time_stretch_0.wav,9.98907,background
2,background_0001_time_stretch_1.wav,9.87941,background
3,background_0001_time_stretch_10.wav,11.156553,background
4,background_0001_time_stretch_11.wav,13.261587,background


Выборка несбалансированная:

In [4]:
meta.label.value_counts()

door          3416
tool          1659
knocking      1656
bags          1236
keyboard      1225
background    1126
ring           713
speech         276
Name: label, dtype: int64

In [5]:
def extract_feature(file_name, path):
    X, sample_rate = librosa.load(path + file_name)
    
    # short-time Fourier transform
    stft = np.abs(librosa.stft(X)) 
    # mel-frequency cepstral coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0) 
    # chromagram from a waveform
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0) 
    # mel-scaled spectrogram
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0) 
    # spectral contrast 
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0) 
    # tonal centroid features
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0) 
    
    return mfccs, chroma, mel, contrast, tonnetz

Loading dataset and feature extraction

In [6]:
from sklearn.preprocessing import LabelEncoder

features = []

for i in range(meta.shape[0]):
    fn = meta.iloc[i].f_name
    mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn, 'audio/') # загружаем из папки ./audio
    features.append(np.hstack([mfccs, chroma, mel, contrast, tonnetz]))

X_train = pd.DataFrame(features)

encode = LabelEncoder()
y_train = encode.fit_transform(meta.label)

In [8]:
import os, re
test = os.listdir('test/')
X_test = []

for filename in test:
    mfccs, chroma, mel, contrast, tonnetz = extract_feature(filename, 'test/') # загружаем из папки ./test
    X_test.append(np.hstack([mfccs, chroma, mel, contrast, tonnetz]))

X_test = pd.DataFrame(X_test)

In [9]:
y_test = []

for filename in test:
    y_test.append(re.split('_', filename)[0])

unk_id = y_test.index('unknown') # accuracy по закрытой задаче можно оценить
y_test = encode.transform(y_test[:unk_id])

In [10]:
# будем записывать результаты в таблицу

from sklearn.metrics import accuracy_score
score_table = []

def score(model, name):
    score_table.append((name, accuracy_score(y_test, model.predict(X_test)[:unk_id])))

RandomForest

In [11]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=RND)
rfc.fit(X_train, y_train)
score(rfc, 'rfc')

K-nearest neighbours

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
score(knn, 'knn')

Log-regression

In [13]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=0.05, solver='lbfgs', 
                            class_weight= 'balanced', max_iter=1000, 
                            random_state=RND, multi_class='multinomial')
logreg.fit(X_train, y_train)
score(logreg, 'logreg')

Simple perceptron

In [14]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=RND, activation='relu', max_iter=1000, solver='lbfgs')
mlp.fit(X_train, y_train)
score(mlp, 'mlp')

Catboost

In [15]:
import catboost as cb

cat = cb.CatBoostClassifier(iterations=1000, loss_function='MultiClass', custom_loss=['Accuracy'], random_seed=RND)
cat.fit(X_train, y_train)
score(cat, 'cat')

0:	learn: -1.9700575	total: 956ms	remaining: 15m 55s
1:	learn: -1.8781755	total: 1.86s	remaining: 15m 27s
2:	learn: -1.7921704	total: 2.72s	remaining: 15m 4s
3:	learn: -1.7211640	total: 3.7s	remaining: 15m 20s
4:	learn: -1.6512794	total: 4.58s	remaining: 15m 11s
5:	learn: -1.5956088	total: 5.52s	remaining: 15m 14s
6:	learn: -1.5387844	total: 6.52s	remaining: 15m 24s
7:	learn: -1.4850605	total: 7.39s	remaining: 15m 16s
8:	learn: -1.4371701	total: 8.31s	remaining: 15m 14s
9:	learn: -1.3915399	total: 9.23s	remaining: 15m 13s
10:	learn: -1.3481005	total: 10.1s	remaining: 15m 10s
11:	learn: -1.3077227	total: 11s	remaining: 15m 6s
12:	learn: -1.2712432	total: 12s	remaining: 15m 10s
13:	learn: -1.2370694	total: 12.8s	remaining: 15m 4s
14:	learn: -1.2048398	total: 13.9s	remaining: 15m 10s
15:	learn: -1.1751292	total: 14.7s	remaining: 15m 4s
16:	learn: -1.1452338	total: 15.7s	remaining: 15m 7s
17:	learn: -1.1196450	total: 16.5s	remaining: 15m
18:	learn: -1.0936549	total: 17.4s	remaining: 14m 57

152:	learn: -0.2515093	total: 2m 20s	remaining: 12m 58s
153:	learn: -0.2496730	total: 2m 21s	remaining: 12m 57s
154:	learn: -0.2476266	total: 2m 22s	remaining: 12m 57s
155:	learn: -0.2460995	total: 2m 23s	remaining: 12m 56s
156:	learn: -0.2445745	total: 2m 24s	remaining: 12m 56s
157:	learn: -0.2430084	total: 2m 25s	remaining: 12m 55s
158:	learn: -0.2417835	total: 2m 26s	remaining: 12m 54s
159:	learn: -0.2403764	total: 2m 27s	remaining: 12m 54s
160:	learn: -0.2387689	total: 2m 28s	remaining: 12m 53s
161:	learn: -0.2369656	total: 2m 29s	remaining: 12m 52s
162:	learn: -0.2355874	total: 2m 30s	remaining: 12m 53s
163:	learn: -0.2342413	total: 2m 31s	remaining: 12m 52s
164:	learn: -0.2327485	total: 2m 32s	remaining: 12m 52s
165:	learn: -0.2314759	total: 2m 33s	remaining: 12m 50s
166:	learn: -0.2296709	total: 2m 34s	remaining: 12m 49s
167:	learn: -0.2280616	total: 2m 35s	remaining: 12m 48s
168:	learn: -0.2262636	total: 2m 36s	remaining: 12m 47s
169:	learn: -0.2249608	total: 2m 36s	remaining: 

300:	learn: -0.1189298	total: 4m 52s	remaining: 11m 20s
301:	learn: -0.1184974	total: 4m 53s	remaining: 11m 19s
302:	learn: -0.1179621	total: 4m 55s	remaining: 11m 18s
303:	learn: -0.1175142	total: 4m 56s	remaining: 11m 17s
304:	learn: -0.1169044	total: 4m 57s	remaining: 11m 17s
305:	learn: -0.1164043	total: 4m 58s	remaining: 11m 17s
306:	learn: -0.1159271	total: 4m 59s	remaining: 11m 16s
307:	learn: -0.1154445	total: 5m	remaining: 11m 15s
308:	learn: -0.1149525	total: 5m 1s	remaining: 11m 14s
309:	learn: -0.1146168	total: 5m 2s	remaining: 11m 13s
310:	learn: -0.1141018	total: 5m 3s	remaining: 11m 12s
311:	learn: -0.1138679	total: 5m 4s	remaining: 11m 11s
312:	learn: -0.1130502	total: 5m 5s	remaining: 11m 10s
313:	learn: -0.1124695	total: 5m 6s	remaining: 11m 9s
314:	learn: -0.1120925	total: 5m 7s	remaining: 11m 8s
315:	learn: -0.1116023	total: 5m 8s	remaining: 11m 7s
316:	learn: -0.1110600	total: 5m 9s	remaining: 11m 6s
317:	learn: -0.1106295	total: 5m 10s	remaining: 11m 5s
318:	learn

449:	learn: -0.0682805	total: 7m 9s	remaining: 8m 45s
450:	learn: -0.0681211	total: 7m 10s	remaining: 8m 44s
451:	learn: -0.0679043	total: 7m 11s	remaining: 8m 43s
452:	learn: -0.0675754	total: 7m 12s	remaining: 8m 42s
453:	learn: -0.0674216	total: 7m 13s	remaining: 8m 41s
454:	learn: -0.0672431	total: 7m 14s	remaining: 8m 40s
455:	learn: -0.0670279	total: 7m 15s	remaining: 8m 39s
456:	learn: -0.0668342	total: 7m 16s	remaining: 8m 38s
457:	learn: -0.0665650	total: 7m 17s	remaining: 8m 38s
458:	learn: -0.0664472	total: 7m 19s	remaining: 8m 37s
459:	learn: -0.0661972	total: 7m 19s	remaining: 8m 36s
460:	learn: -0.0658182	total: 7m 20s	remaining: 8m 35s
461:	learn: -0.0656171	total: 7m 21s	remaining: 8m 34s
462:	learn: -0.0654682	total: 7m 22s	remaining: 8m 33s
463:	learn: -0.0652075	total: 7m 23s	remaining: 8m 32s
464:	learn: -0.0650734	total: 7m 24s	remaining: 8m 31s
465:	learn: -0.0648322	total: 7m 25s	remaining: 8m 30s
466:	learn: -0.0646520	total: 7m 26s	remaining: 8m 29s
467:	learn:

600:	learn: -0.0436287	total: 9m 32s	remaining: 6m 19s
601:	learn: -0.0435160	total: 9m 33s	remaining: 6m 18s
602:	learn: -0.0434335	total: 9m 34s	remaining: 6m 18s
603:	learn: -0.0433559	total: 9m 35s	remaining: 6m 17s
604:	learn: -0.0432916	total: 9m 36s	remaining: 6m 16s
605:	learn: -0.0431280	total: 9m 37s	remaining: 6m 15s
606:	learn: -0.0430577	total: 9m 38s	remaining: 6m 14s
607:	learn: -0.0429066	total: 9m 39s	remaining: 6m 13s
608:	learn: -0.0427258	total: 9m 40s	remaining: 6m 12s
609:	learn: -0.0426036	total: 9m 41s	remaining: 6m 11s
610:	learn: -0.0425017	total: 9m 42s	remaining: 6m 10s
611:	learn: -0.0423509	total: 9m 42s	remaining: 6m 9s
612:	learn: -0.0421828	total: 9m 43s	remaining: 6m 8s
613:	learn: -0.0420473	total: 9m 44s	remaining: 6m 7s
614:	learn: -0.0419749	total: 9m 45s	remaining: 6m 6s
615:	learn: -0.0419167	total: 9m 46s	remaining: 6m 5s
616:	learn: -0.0417243	total: 9m 47s	remaining: 6m 4s
617:	learn: -0.0416649	total: 9m 48s	remaining: 6m 3s
618:	learn: -0.04

748:	learn: -0.0308556	total: 11m 59s	remaining: 4m 1s
749:	learn: -0.0307740	total: 12m	remaining: 4m
750:	learn: -0.0307466	total: 12m 1s	remaining: 3m 59s
751:	learn: -0.0306636	total: 12m 2s	remaining: 3m 58s
752:	learn: -0.0305448	total: 12m 3s	remaining: 3m 57s
753:	learn: -0.0304766	total: 12m 4s	remaining: 3m 56s
754:	learn: -0.0304315	total: 12m 5s	remaining: 3m 55s
755:	learn: -0.0303635	total: 12m 6s	remaining: 3m 54s
756:	learn: -0.0302796	total: 12m 6s	remaining: 3m 53s
757:	learn: -0.0302004	total: 12m 7s	remaining: 3m 52s
758:	learn: -0.0301297	total: 12m 8s	remaining: 3m 51s
759:	learn: -0.0300907	total: 12m 9s	remaining: 3m 50s
760:	learn: -0.0300064	total: 12m 10s	remaining: 3m 49s
761:	learn: -0.0299080	total: 12m 11s	remaining: 3m 48s
762:	learn: -0.0298097	total: 12m 12s	remaining: 3m 47s
763:	learn: -0.0297630	total: 12m 13s	remaining: 3m 46s
764:	learn: -0.0296843	total: 12m 13s	remaining: 3m 45s
765:	learn: -0.0296241	total: 12m 14s	remaining: 3m 44s
766:	learn:

896:	learn: -0.0224089	total: 14m 16s	remaining: 1m 38s
897:	learn: -0.0223576	total: 14m 18s	remaining: 1m 37s
898:	learn: -0.0223103	total: 14m 19s	remaining: 1m 36s
899:	learn: -0.0222727	total: 14m 20s	remaining: 1m 35s
900:	learn: -0.0222332	total: 14m 21s	remaining: 1m 34s
901:	learn: -0.0221865	total: 14m 22s	remaining: 1m 33s
902:	learn: -0.0221440	total: 14m 22s	remaining: 1m 32s
903:	learn: -0.0220947	total: 14m 23s	remaining: 1m 31s
904:	learn: -0.0220522	total: 14m 25s	remaining: 1m 30s
905:	learn: -0.0220183	total: 14m 26s	remaining: 1m 29s
906:	learn: -0.0219918	total: 14m 27s	remaining: 1m 28s
907:	learn: -0.0219503	total: 14m 27s	remaining: 1m 27s
908:	learn: -0.0219167	total: 14m 28s	remaining: 1m 26s
909:	learn: -0.0218877	total: 14m 30s	remaining: 1m 26s
910:	learn: -0.0218616	total: 14m 30s	remaining: 1m 25s
911:	learn: -0.0218107	total: 14m 31s	remaining: 1m 24s
912:	learn: -0.0217813	total: 14m 32s	remaining: 1m 23s
913:	learn: -0.0217549	total: 14m 33s	remaining:

Выберем лучшую модель и получим результат:

In [16]:
score_table = pd.DataFrame(score_table, columns=['model', 'accuracy'])
score_table

Unnamed: 0,model,accuracy
0,rfc,0.866808
1,knn,0.807611
2,logreg,0.830867
3,mlp,0.856237
4,cat,0.932347


In [17]:
best = score_table.loc[score_table['accuracy'].idxmax()].model
answer = pd.DataFrame(columns=['fn', 'proba', 'label'])
answer.fn = test
proba = eval(best + '.predict_proba(X_test)')

In [18]:
i = 0
for prob in proba:
    answer.loc[i,'proba'] = max(prob)
    answer.loc[i,'label'] = encode.inverse_transform(np.argmax(prob))
    if answer.loc[i,'label'] == 'knocking':
        answer.loc[i,'label'] = 'knocking_door'
    i += 1

Запишем результат в result.txt

In [19]:
with open("result.txt", "w") as text_file:
    text_file.write(answer.to_csv(sep='\t', na_rep='', header=False, index=False))