In [139]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
from pprint import pprint
from sklearn.decomposition import PCA
import random

# Data

In [140]:
data = pd.read_csv('../features/features_sample.csv')

In [141]:
data = data.sample(frac=1)

X = data[data.columns[0:9]]
y = data[data.columns[10]]

X_train = X[0:int(0.8*len(X))]
y_train = y[0:int(0.8*len(y))]

X_test = X[int(0.8*len(X)):]
y_test = y[int(0.8*len(X)):]

# Random Forest

## Training

In [142]:
clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Feature Importance

In [143]:
feature_name = data.columns[0:9]
feature_importance = clf.feature_importances_

pprint({feature_name[i]:feature_importance[i] for i in range(0, len(feature_name))})

{'branchiness_mean': 0.18551347414939662,
 'branchiness_variance': 0.27273768650555397,
 'melodic_mean': 0.04214402215983171,
 'melodic_variance': 0.09772018427358498,
 'pitch_in_piece': 0.09903895718326929,
 'pitch_in_rules': 0.08164435390313077,
 'repeteadness_mean': 0.13231962776017217,
 'repeteadness_variance': 0.04925604033025336,
 'weighted_abruptness': 0.03962565373480723}


## Training Error

In [144]:
print(clf.score(X_train, y_train))

0.7126436781609196


## Testing Error

In [145]:
print(clf.score(X_test, y_test))

0.4090909090909091


# PCA

In [146]:
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

## Number of Components vs Variation

In [147]:
pprint({i: sum(pca.explained_variance_ratio_[0:i]) for i in range(1, len(pca.explained_variance_ratio_)+1)})

{1: 0.7734463767207131,
 2: 0.9525685792226765,
 3: 0.9806854991914611,
 4: 0.9924096300739014,
 5: 0.9964705012942381,
 6: 0.9986326726126084,
 7: 0.9993437914567733,
 8: 0.9999308118233797,
 9: 1.0000000000000002}
