In [1]:
# Load necessary libraries

import pandas as pd
import numpy as np
from os import listdir
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load previously prepared data
# unzip archive from repository
# or use feature extraction notebook for initial *.wav data set
full = pd.read_csv('full_set_prepared.csv')
ans = pd.read_table('data/meta/meta.txt', header = None)

# Build the file-name -> label lookup once instead of re-filtering the whole
# meta table for every row of `full`.  Column 0 of the meta file is matched
# against full['0'] and column 4 supplies the label; when a name occurs more
# than once the first occurrence wins, as the per-row `[0]` lookup did before.
label_by_file = ans.drop_duplicates(subset=[0]).set_index(0)[4]

# Fail with a readable message if a feature row has no entry in the meta file
# (previously this surfaced as an opaque "list index out of range").
unlabelled = full.loc[~full['0'].isin(label_by_file.index), '0']
if len(unlabelled):
    raise KeyError('no label in data/meta/meta.txt for: %s'
                   % unlabelled.tolist()[:5])

full['answers'] = full['0'].map(label_by_file)

In [3]:
# Split the prepared test table into a labelled part ("test") and an
# unlabelled part ("hide"); rows whose name in column '0' starts with
# 'unknown' carry no label and go to the hide set.
def _label_from_name(file_name):
    """Return the class label encoded in a file name.

    The label is the text before the first underscore; a prefix starting
    with 'knock' gets '_door' appended.
    """
    prefix = file_name.split('_')[0]
    return prefix + '_door' if prefix.startswith('knock') else prefix


test = pd.read_csv('test_set_prepared.csv')
file_names = test['0']
unknown_list = [n for n in file_names if n.startswith('unknown')]
is_hidden = file_names.isin(unknown_list)

# labelled rows: features plus the target parsed from the file names
test_feature = test[~is_hidden].drop('0', axis=1)
test_answers = [_label_from_name(n) for n in file_names if not n.startswith('unknown')]
test_target = pd.Series(test_answers)

# unlabelled rows: features only
hide_feature = test[is_hidden].drop('0', axis=1)

In [4]:
# Split the training table into the target column and the feature matrix.
# Column '0' (matched against the meta file when the labels were attached)
# is dropped together with the label so that only features remain.
train_target = full['answers']
train_feature = full.drop(labels=['answers', '0'], axis='columns')

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize all three feature sets with statistics learned from the
# training features only; test and hide rows are transformed with the
# already-fitted scaler and never refit it.
scaler = StandardScaler().fit(train_feature)

# train features
train_feature = scaler.transform(train_feature)

# test features
test_feature = scaler.transform(test_feature)

# hide features
hide_feature = scaler.transform(hide_feature)

In [7]:
# Hold out 10% of the training rows, keeping the class proportions of
# train_target in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature,
    train_target,
    test_size=0.1,
    random_state=17,
    stratify=train_target,
)

In [24]:
# 100-tree random forest with a fixed seed, fitted on the 90% training split.
clf = RandomForestClassifier(random_state=17, n_estimators=100)
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [25]:
# Predict labels for the hold-out rows split off from the training data
prediction = clf.predict(X_test)

In [26]:
# Share of hold-out rows whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=prediction)
print('accuracy on the train set part %.3f' % (acc,))

accuracy on the train set part 0.996


In [27]:
# Refit the same classifier on the complete training set (hold-out rows
# included); the test and hide predictions below use this refitted model.
clf.fit(train_feature, train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [28]:
# Predict the test and hide parts: for each one keep the per-class
# probabilities as well as the hard class labels.

# labelled test rows
test_prediction_proba = clf.predict_proba(test_feature)
test_prediction = clf.predict(test_feature)

# unlabelled ("hide") rows
hide_prediction_proba = clf.predict_proba(hide_feature)
hide_prediction = clf.predict(hide_feature)

In [29]:
# Accuracy on the labelled rows of test_set_prepared.csv; the reference
# labels in test_target were parsed from the file-name prefixes.
print('accuracy on the test part %.3f'%accuracy_score(test_target, test_prediction))

accuracy on the test part 0.871


In [30]:
# save test part results
# One row per labelled test file: file name, highest class probability and
# the predicted class, written tab-separated without header or index.
#
# The file names are selected with the same 'unknown' prefix rule that was
# used to build test_feature, so names and predictions line up row for row
# (a shorter 'unk' prefix could drop extra files and misalign the columns).
test_files = [i for i in test['0'] if not i.startswith('unknown')]

# Building the frame column by column keeps the probabilities numeric until
# they are written and raises if the three columns differ in length.
ans_df = pd.DataFrame({'File': test_files,
                       'Proba': test_prediction_proba.max(axis=1),
                       'Class': test_prediction},
                      columns = ['File', 'Proba', 'Class'])
ans_df.to_csv('result.txt', sep = '\t', index = False, header = False)

In [31]:
# save hide part results
# One row per unlabelled ("hide") file: file name, highest class probability
# and the predicted class, written tab-separated without header or index.
#
# The file names are selected with the same 'unknown' prefix rule that was
# used to build hide_feature, so names and predictions line up row for row
# (a shorter 'unk' prefix could pull in extra files and misalign the columns).
hide_files = [i for i in test['0'] if i.startswith('unknown')]

# Building the frame column by column keeps the probabilities numeric until
# they are written and raises if the three columns differ in length.
ans_df = pd.DataFrame({'File': hide_files,
                       'Proba': hide_prediction_proba.max(axis=1),
                       'Class': hide_prediction},
                      columns = ['File', 'Proba', 'Class'])
ans_df.to_csv('hide_results.txt', sep = '\t', index = False, header = False)