In [None]:
%matplotlib inline
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn import cluster
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from pathlib import Path
import librosa, librosa.display
import IPython.lib.display as ipd


### Setup Seaborn
# Global plotting configuration: every figure drawn later in the notebook uses
# the "whitegrid" style and the "poster" context set here.
sns.set_style("whitegrid")
sns.set_context("poster")

In [None]:
# Load the survey answers. The 'happiness' rating is read as text because the
# class mapping applied later is keyed by the strings '1'..'5'.
results = pd.read_csv(
    './surveyresult/results.csv',
    encoding='latin-1',
    dtype={'happiness': str},
)

In [None]:
# Summary statistics of the raw survey table, as a quick sanity check after loading.
results.describe()

In [None]:
# Distinct raw rating values present in the survey (before mapping to class names).
results['happiness'].unique()

In [None]:
# Three-class target: ratings 1-2 -> 'sad', 3 -> 'neutral', 4-5 -> 'happy'.
# Keys are strings because the 'happiness' column is loaded with dtype str.
di = {
    '1': 'sad',
    '2': 'sad',
    '3': 'neutral',
    '4': 'happy',
    '5': 'happy',
}

In [None]:
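# Alternative: two class labels (sad and happy), folding the neutral rating '3' into 'happy'.
# Uncomment the line below to use it instead of the three-class mapping.
#di = {'1': 'sad', '2': 'sad', '3': 'happy', '4':'happy', '5':'happy'}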

In [None]:
# Translate the raw rating strings in the 'happiness' column into class names
# using the `di` mapping; all other columns are left untouched.
results = results.replace(to_replace={"happiness": di})

In [None]:
# Preview the first rows to confirm that 'happiness' now holds class names.
results.head()

In [None]:
# Load the audio feature table; the first CSV column is used as the row index.
musicdf = pd.read_csv(
    './musicdf30.csv',
    encoding='latin-1',
    index_col=0,
)

In [None]:
# Preview the first rows of the audio feature table.
musicdf.head()

In [None]:
# (rows, columns) of the survey table before it is merged with the audio features.
results.shape

In [None]:
# (rows, columns) of the audio feature table.
musicdf.shape

In [None]:
# Attach the audio features to each survey answer by joining on the shared
# 'file' column. With pandas' default inner join, rows whose 'file' value is
# missing from either table are dropped.
# NOTE(review): this overwrites `results`, so re-running only this cell merges
# the features a second time; re-run from the CSV load cells instead.
results = results.merge(musicdf, on='file')

In [None]:
# List every column available after the merge (survey fields plus audio features).
results.columns

In [None]:
#mfcc_columns = (['MFCC_0', 'MFCC_1', 'MFCC_10',
#       'MFCC_11', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7',
#       'MFCC_8', 'MFCC_9', 'MFCC_SD_0', 'MFCC_SD_1', 'MFCC_SD_10',
#       'MFCC_SD_11', 'MFCC_SD_2', 'MFCC_SD_3', 'MFCC_SD_4', 'MFCC_SD_5',
#       'MFCC_SD_6', 'MFCC_SD_7', 'MFCC_SD_8', 'MFCC_SD_9']);

In [None]:
# Keep every column except the ones excluded from modelling below; what is
# left is the 'happiness' label plus the model input features.
feature_cols = results.columns.drop([
    'composer',
    'description',
    'symphony',
    'tempo',
    'file',
    'part',
    'duration',
])

In [None]:
# Modelling frame: all rows, restricted to the label and feature columns.
df = results.loc[:, feature_cols]

In [None]:
# Columns that remain in the modelling frame (label + features).
df.columns

In [None]:
# Split the modelling frame into the feature matrix (everything except the
# label) and the target vector (the 'happiness' class names), as NumPy arrays.
X = df.drop('happiness', axis=1).values
y = df['happiness'].values

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn (0.20);
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# (samples, features) of the training split.
X_train.shape

In [None]:
# (samples, features) of the held-out test split.
X_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 10000 trees, a fixed seed for reproducibility, and
# n_jobs=-1 to use all available CPU cores.
forest = RandomForestClassifier(
    n_estimators=10000,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Fit the random forest on the training split.
forest.fit(X_train, y_train)

In [None]:
# Predictions on the data the forest was trained on.
# NOTE(review): y_train_pred is not referenced by any later cell visible in this notebook.
y_train_pred = forest.predict(X_train)

In [None]:
# Predictions on the held-out test split; these feed the crosstab, accuracy
# score and confusion matrix below.
y_test_pred = forest.predict(X_test)

In [None]:
# Actual vs. predicted class counts on the test split; margins=True adds the
# row and column totals.
pd.crosstab(
    index=y_test,
    columns=y_test_pred,
    rownames=['actual'],
    colnames=['preds'],
    margins=True,
)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class matches the survey label,
# printed with three significant digits.
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Mean accuracy score: {accuracy:.3}')

In [None]:
# Echo the true labels of the test split (the whole array is displayed).
y_test

In [None]:
from sklearn.metrics import confusion_matrix

# Derive the class labels from the fitted model and the test labels rather than
# hard-coding ['happy', 'neutral', 'sad']. The hard-coded list fails with a
# shape mismatch when a class is absent from the test split or when the
# two-class mapping is used. np.union1d returns the labels sorted, so the
# three-class case keeps the original ordering.
class_labels = np.union1d(forest.classes_, y_test)

# Pass `labels` explicitly so the matrix rows/columns are guaranteed to line up
# with the DataFrame index/columns.
cm = (
    pd.DataFrame(
        confusion_matrix(y_test, y_test_pred, labels=class_labels),
        index=class_labels,
        columns=class_labels,
    )
    .rename_axis('Actual')
    .rename_axis("Predicted", axis="columns")
)

# fmt='d' annotates the integer counts as plain integers (the default format
# switches to scientific notation for larger counts).
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Importance of each feature as reported by the fitted forest.
importances = forest.feature_importances_

# Spread of each feature's importance across the individual trees.
std = np.array(
    [estimator.feature_importances_ for estimator in forest.estimators_]
).std(axis=0)

# Feature positions ordered from most to least important.
ascending_order = np.argsort(importances)
indices = ascending_order[::-1]

In [None]:
# Number of feature columns in the test matrix.
# NOTE(review): total_features is not referenced by any later cell visible in this notebook.
total_features = X_test.shape[1]

In [None]:
# Number of features the forest was fitted on. `n_features_` was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of `n_features_in_`; fall back
# to the old attribute only on releases that predate the new one.
forest.n_features_in_ if hasattr(forest, 'n_features_in_') else forest.n_features_

In [None]:
# Feature names in the same order as the columns of X (every column of the
# modelling frame except the 'happiness' label).
column_names = df.columns[df.columns != 'happiness'].tolist()

In [None]:
print("Feature ranking:")

# Walk `indices` directly instead of range(forest.n_features_): `indices`
# already holds every feature position ordered by decreasing importance, and
# `n_features_` no longer exists in scikit-learn >= 1.2.
# `column_names` must still be in original column order when this cell runs.
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %s (%f)" % (rank, column_names[feature_idx], importances[feature_idx]))


In [None]:
# Reorder the feature names by decreasing importance for the plot labels.
# The names are rebuilt from `df` rather than from `column_names` itself, so
# re-running this cell gives the same result instead of permuting an
# already-permuted list and mislabelling the bars.
column_names = np.array(df.columns[df.columns != 'happiness'])[indices].tolist()

In [None]:
# Bar chart of feature importances, most important first. The bar count comes
# from `importances` instead of `forest.n_features_`, which was removed in
# scikit-learn 1.2. `column_names` is expected to be ordered by importance.
n_bars = len(importances)
bar_positions = range(n_bars)

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
       color="r",  align="center")
plt.xticks(bar_positions, column_names, rotation='vertical')
# Leave half a bar of padding on each side of the first and last bars.
plt.xlim([-1, n_bars])
plt.show()