In [15]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import json
import codecs

In [3]:
# util funcs
def GetFileNamesInDir(dir_name, filter=".wav"):
    names = []
    for (path, dirs, files) in os.walk(dir_name):
        for f in files:
            if filter.split('.')[-1].lower() == f.split('.')[-1].lower():
                names.append(path + "/" + f)
                
    return names

# Creating dataset info

In [17]:
root_dir = 'CSV/'
seed_Ext = '.csv'
filenames = GetFileNamesInDir(root_dir, seed_Ext)
db_info_filename = 'rosamerica_info.json'

In [10]:
db_info = {}
for filename in filenames:
    genre = filename.split('/')[-2].strip()
    if genre not in db_info:
        db_info[genre] = []
    db_info[genre].append(filename)
    

In [18]:
json.dump(db_info, codecs.open(db_info_filename, 'w', encoding='utf-8'))

In [20]:
# general stats
for genre, array in db_info.items():
    print(genre, len(db_info[genre]))

spe 54
pop 55
dan 55
jaz 55
cla 54
roc 54
hip 54
rhy 54


In [57]:
class_map = dict(zip(db_info.keys(), range(len(db_info.keys()))))

# Convering jsons to csvs

In [47]:
for genre, filearray in db_info.items():
    for filename in filearray:
        filename_json = filename.replace('CSV', 'Lowleveldata').replace('.csv', '')
        cmd = "python json_to_csv.py -i %s -o %s --ignore *metadata* *beats_position*"
#         print(filename_json, filename)
#         break
        os.system(cmd%(filename_json, filename))

# Validating consistency of features across all the files
* every feature was found to be of the same length

In [48]:
for genre, filearray in db_info.items():
    for filename in filearray:
        data = pd.read_csv(filename)
        print(genre, os.path.basename(filename), len(data.columns))

spe monologo_hombre_01.csv 2651
spe monologo_mujer_08.csv 2651
spe tlfno_hombre.csv 2651
spe monologo_mujer_09.csv 2651
spe monologo_hombre_02.csv 2651
spe tlfno_mujer_04.csv 2651
spe monologo_hombre_03.csv 2651
spe monologo_hombre_07.csv 2651
spe monologo_hombre_13.csv 2651
spe monologo_hombre_12.csv 2651
spe monologo_hombre_06.csv 2651
spe monologo_hombre_10.csv 2651
spe monologo_hombre_04.csv 2651
spe tlfno_mujer_03.csv 2651
spe tlfno_mujer_02.csv 2651
spe monologo_hombre_11.csv 2651
spe speech_7_catinfo.csv 2651
spe tlfno_hombre_08.csv 2651
spe tlfno_hombre_09.csv 2651
spe tlfno_mujeres_01.csv 2651
spe speech_26_69-g.csv 2651
spe tertulia.csv 2651
spe speech_12_ondacero.csv 2651
spe tertulia_02.csv 2651
spe tlfno_hombre_02.csv 2651
spe tlfno_hombres_10.csv 2651
spe tlfno_hombres_04.csv 2651
spe tlfno_hombre_03.csv 2651
spe tertulia_tlfno_01.csv 2651
spe tertulia_03.csv 2651
spe tlfno_hombre_01.csv 2651
spe tlfno_hombres_06.csv 2651
spe tlfno_hombres_07.csv 2651
spe tertulia_tlfno_0

KeyboardInterrupt: 

# Consolidating features in a common file

In [63]:
cnt = 0
for genre, filearray in db_info.items():
    for ii, filename in enumerate(filearray):
        
        if cnt ==0:
            features = pd.read_csv(filename)
            features['class'] = class_map[genre]
        else:
            temp = pd.read_csv(filename)
            temp['class'] = class_map[genre]
            features = features.append(temp)
        cnt += 1

In [74]:
features['class'].value_counts()

3    55
2    55
1    55
7    54
6    54
5    54
4    54
0    54
Name: class, dtype: int64

In [81]:
pd.get_dummies(features['tonal.chords_key'])

Unnamed: 0,A,A#,B,C,C#,D,D#,E,F,F#,G,G#
0,1,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,1,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,1,0
0,0,0,0,0,0,0,0,0,0,0,1,0
0,0,0,0,0,0,0,0,0,0,1,0,0
0,0,1,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,1,0
0,0,0,0,0,0,0,0,0,0,0,1,0
0,0,1,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,1,0,0


In [None]:
  "chords_key": "D",
    "chords_scale": "major",
    "key_key": "G",
    "key_scale": "minor"

In [86]:
features = features.drop(columns=['tonal.chords_key', 'tonal.chords_scale', 'tonal.key_key', 'tonal.key_scale'])

In [94]:
feature_mtx = features.as_matrix()

# Perfoming cross fold validation

In [156]:
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [139]:
features_all = feature_mtx[:, :-1]
class_all = feature_mtx[:, -1]

In [173]:
pca = PCA(n_components=50)
clf = RandomForestClassifier(n_estimators=30)

In [174]:
features_shrink = pca.fit_transform(features_all)

In [175]:
stf = StratifiedKFold(n_splits=10, shuffle=True)

In [176]:
for train_ind, test_ind in stf.split(features_shrink, class_all):
    clf.fit(features_shrink[train_ind], class_all[train_ind])
    predictions = clf.predict(features_shrink[test_ind])
    print(accuracy_score(predictions, class_all[test_ind]))


0.7291666666666666
0.6041666666666666
0.625
0.6875
0.5813953488372093
0.675
0.65
0.65
0.625
0.675
