In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
from pprint import pprint
from sklearn.decomposition import PCA
import random

# Data

In [54]:
data = pd.read_csv('../features/features.csv')

In [99]:
data = data.sample(frac=1)
print(data)

X = data[data.columns[0:9]]
y = data[data.columns[10]]

X_train = X[0:int(0.8*len(X))]
y_train = y[0:int(0.8*len(y))]

X_test = X[int(0.8*len(X)):]
y_test = y[int(0.8*len(X)):]

     melodic_variance  repeteadness_mean  repeteadness_variance  \
81           0.000000           1.666667               0.555556   
108          0.000000           1.246855               0.189102   
117          0.000000           1.666667               0.555556   
252          0.249433           2.798676               2.721041   
191          0.530235           8.601132              20.873843   
19           2.000000           1.500000               0.416667   
238          0.000000           2.399272               2.044601   
90           0.835429           4.501299               8.390258   
128          0.000000           1.333333               0.222222   
8            0.266529           3.203401               3.911145   
212          0.945398           3.979833               8.318938   
12           0.249865           3.935484               5.673257   
37           0.187500           1.000000               0.000000   
259          0.513465           3.234969               4.18471

# Random Forest

## Training

In [88]:
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Feature Importance

In [57]:
feature_name = data.columns[0:9]
feature_importance = clf.feature_importances_

pprint({feature_name[i]:feature_importance[i] for i in range(0, len(feature_name))})

{'branchiness_mean': 0.1364483862145653,
 'branchiness_variance': 0.12791111128580537,
 'melodic_mean': 0.12737709229786176,
 'melodic_variance': 0.07023300112223092,
 'pitch_in_piece': 0.10572076134191173,
 'pitch_in_rules': 0.1111492425764091,
 'repeteadness_mean': 0.0793339198066466,
 'repeteadness_variance': 0.10737887922547988,
 'weighted_abruptness': 0.13444760612908935}


## Training Error

In [70]:
print(clf.score(X, y))

0.9541984732824428


# PCA

In [58]:
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

## Number of Components vs Variation

In [68]:
pprint({i: sum(pca.explained_variance_ratio_[0:i]) for i in range(1, len(pca.explained_variance_ratio_)+1)})

{1: 0.7396039323222565,
 2: 0.8597398883505759,
 3: 0.9518594612629343,
 4: 0.9881727138700985,
 5: 0.9957705739096677,
 6: 0.9981700017408182,
 7: 0.9993127749101668,
 8: 0.9999115667680704,
 9: 1.0}
