In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

In [2]:
# Load the trimmed EDM track-feature table that the genre classifier below is trained on.
data = pd.read_csv('outputs/edm_trimmed.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,danceability,energy,loudness,speechiness,instrumentalness,liveness,valence,tempo,genre
0,0.527,0.862,-3.684,0.0457,2.4e-05,0.13,0.348,130.081,progressivehouse
1,0.49,0.936,-4.575,0.0772,0.00446,0.316,0.0891,132.013,progressivehouse
2,0.605,0.873,-5.938,0.0344,0.837,0.314,0.476,126.026,progressivehouse
3,0.525,0.925,-7.213,0.0526,0.0285,0.0707,0.203,128.015,progressivehouse
4,0.75,0.729,-6.06,0.0622,0.873,0.0647,0.166,126.997,progressivehouse


In [4]:
# Audio-feature columns used as model inputs.
features = [
    'danceability', 'energy', 'loudness', 'tempo',
    'instrumentalness', 'liveness', 'speechiness', 'valence',
]
# Genre names (presumably the values expected in the `genre` column).
# NOTE: this list does not define the label encoding. LabelEncoder assigns
# integer codes in sorted (alphabetical) class order, not in this order.
genres = [
    'progressivehouse', 'basshouse', 'techhouse', 'bigroom', 'trance',
    'futurebass', 'hardstyle', 'bounce', 'tropicalhouse', 'futurehouse',
]
X = data[features]

# Keep the fitted encoder instead of discarding it, so the integer labels
# (e.g. confusion-matrix rows/columns) can be mapped back to genre names via
# `label_encoder.classes_` or `label_encoder.inverse_transform(...)`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['genre'])
data['genre']

0       progressivehouse
1       progressivehouse
2       progressivehouse
3       progressivehouse
4       progressivehouse
              ...       
5575       tropicalhouse
5576       tropicalhouse
5577       tropicalhouse
5578       tropicalhouse
5579       tropicalhouse
Name: genre, Length: 5580, dtype: object

In [5]:
# Hold out 15% of the rows for testing. Stratifying on the encoded genre keeps
# the class proportions the same in both splits; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=1,
)

In [6]:
# Fit the min-max scaler on the training split only, so the test split does
# not influence the scaling parameters.
scaler = MinMaxScaler().fit(X_train)

# Rebuild DataFrames from the transformed values with the original feature
# names. The original row index is not carried over (a fresh 0..n-1 index is
# created).
X_train, X_test = (
    pd.DataFrame(scaler.transform(split), columns=X.columns)
    for split in (X_train, X_test)
)

In [7]:
# Random forest with manually chosen hyperparameters.
# random_state is fixed so the fitted forest (and the accuracies and confusion
# matrices derived from it) are reproducible on a fresh run; the value matches
# the seed already used for the train/test split.
rf = RandomForestClassifier(
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=1000,
    random_state=1,
)
rf.fit(X_train, y_train)

# Per-class probability estimates for every row of each split.
rf_train_probs = rf.predict_proba(X_train)
rf_test_probs = rf.predict_proba(X_test)
# First run, with manually chosen settings. The model seems to top out at
# roughly 67% test accuracy and 98% train accuracy.

In [8]:
# The predicted label is the index of the most probable class in each row.
# The column index equals the encoded label because the labels come from
# LabelEncoder (integers 0..n_classes-1).
train_pred = rf_train_probs.argmax(axis=-1)
train_score = accuracy_score(y_train, train_pred)
print("Train Accuracy:", train_score)

test_pred = rf_test_probs.argmax(axis=-1)
test_score = accuracy_score(y_test, test_pred)
print("Test Accuracy:", test_score)

Train Accuracy: 0.98123550495467
Test Accuracy: 0.6798088410991637


I ran two randomized searches: one over full ranges of values, and one over lists of values spaced in steps of roughly five or ten. Neither gave a meaningfully better test accuracy (the difference was less than 1%), and both substantially lowered train accuracy.

In [9]:
# ---------------------------------------------------------------------------
# Archived hyperparameter-search experiments (everything in this cell is
# commented out). Each of the first two sections defines a search space, runs
# a RandomizedSearchCV over it, and then refits a forest with a specific set
# of parameters and prints its train/test accuracy.
#
# NOTE(review), to check before re-enabling any of this:
#   * `min_samples_split` search spaces include 1; scikit-learn requires an
#     integer of at least 2 (or a float fraction), so those candidates are
#     expected to fail - confirm against the installed version.
#   * 5000 is passed positionally as the third argument of RandomizedSearchCV;
#     recent scikit-learn versions appear to make `n_iter` keyword-only -
#     verify and pass `n_iter=5000` explicitly.
#   * `range(1)` contains only 0, so 'bootstrap' and 'warm_start' in
#     `grid_params` each have a single candidate value.
#   * `grid_params` is defined but no search that uses it appears in this cell.
# ---------------------------------------------------------------------------
#import numpy as np
#params = {
#    'max_depth': range(1,25),
#    'n_estimators': range(1,1500),
#    'min_samples_split': range(1, 25),
#    'min_samples_leaf': range(1, 25),
#}

#rs = RandomizedSearchCV(rf, params, 5000, cv=3, random_state=1, n_jobs=4)
#rs.fit(X_train, y_train)

#rs.best_params_

#rf1 = RandomForestClassifier(max_depth=24, min_samples_leaf=3, min_samples_split=17, n_estimators=506)
#rf1.fit(X_train, y_train)

#rf1_train_probs = rf1.predict_proba(X_train)
#rf1_test_probs = rf1.predict_proba(X_test)

#rf1_train_pred = np.argmax(rf1_train_probs, axis=-1)
#rf1_test_pred = np.argmax(rf1_test_probs, axis=-1)
#rf1_train_score = accuracy_score(y_train, rf1_train_pred)
#rf1_test_score = accuracy_score(y_test, rf1_test_pred)
#print("Train Accuracy:", rf1_train_score)
#print("Test Accuracy:", rf1_test_score)

# Second search: discrete lists of candidate values instead of full ranges.
#params = {
#    'max_depth':[1,5,10,15,25],
#    'n_estimators': [200,400,600,800,1200,1400,1500],
#    'min_samples_split': [1,5,10,15,20,25],
#    'min_samples_leaf': [1,2,5,10]
#}

#rs2 = RandomizedSearchCV(rf, params, 5000, cv=3, random_state=1, n_jobs=4)
#rs2.fit(X_train, y_train)

#rs2.best_params_

#rf2 = RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=400)
#rf2.fit(X_train, y_train)

#rf2_train_probs = rf2.predict_proba(X_train)
#rf2_test_probs = rf2.predict_proba(X_test)

#rf2_train_pred = np.argmax(rf2_train_probs, axis=-1)
#rf2_test_pred = np.argmax(rf2_test_probs, axis=-1)
#rf2_train_score = accuracy_score(y_train, rf2_train_pred)
#rf2_test_score = accuracy_score(y_test, rf2_test_pred)
#print("Train Accuracy:", rf2_train_score)
#print("Test Accuracy:", rf2_test_score)

# Parameter grid for a further search (no search using it is shown here).
#grid_params = {
#    'max_depth':[23,27],
#    'min_samples_split': range(20, 28),
#    'min_samples_leaf': range(1, 8),
#    'max_features': [.4,.5,.6],
#    'bootstrap': range(1),
#    'warm_start': range(1),
#}

In [10]:
# Confusion matrix on the training split: rows are true labels, columns are
# predicted labels, both in encoded-label order.
confusion_matrix(y_true=y_train, y_pred=train_pred)

array([[467,   2,   0,   0,   0,   0,   0,   5,   0,   0],
       [  0, 449,  11,   0,   2,   0,  12,   0,   0,   0],
       [  0,   6, 469,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0, 468,   0,   2,   4,   0,   0,   0],
       [  2,   4,   0,   0, 457,   0,   5,   6,   0,   0],
       [  0,   0,   0,   0,   0, 474,   0,   0,   0,   0],
       [  0,   5,   0,   0,   1,   0, 466,   0,   3,   0],
       [  4,   0,   1,   0,   2,   0,   0, 467,   0,   0],
       [  0,   0,   1,   0,   2,   0,   9,   0, 462,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0, 475]], dtype=int64)

In [11]:
# Confusion matrix on the held-out test split: rows are true labels, columns
# are predicted labels, both in encoded-label order.
confusion_matrix(y_true=y_test, y_pred=test_pred)

array([[31,  5, 14,  0, 14,  0,  0, 19,  1,  0],
       [ 2, 41, 16,  3,  7,  0, 12,  0,  3,  0],
       [ 3, 17, 56,  0,  0,  0,  1,  5,  1,  0],
       [ 1,  0,  0, 67,  0, 10,  6,  0,  0,  0],
       [10,  3,  8,  0, 33,  0, 11, 17,  2,  0],
       [ 0,  2,  0,  8,  0, 73,  0,  0,  1,  0],
       [ 0,  9,  3,  1,  8,  0, 60,  0,  2,  0],
       [ 5,  1,  4,  0,  9,  0,  0, 63,  1,  1],
       [ 2,  9,  0,  4,  1,  0,  3,  3, 62,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 83]], dtype=int64)

In [12]:
# Persist the fitted random forest to disk.
# NOTE(review): only the classifier is written. The MinMaxScaler it was
# trained behind is not saved, so new data must be scaled the same way before
# this pickled model can be used for prediction.
with open('final_model.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)

In [13]:
# Load the separate validation table; the preview below shows the same columns as the main dataset.
validation_data = pd.read_csv('outputs/val_trimmed.csv')

In [14]:
# Preview the first five validation rows to confirm the columns line up.
validation_data.head(n=5)

Unnamed: 0,danceability,energy,loudness,speechiness,instrumentalness,liveness,valence,tempo,genre
0,0.665,0.93,-3.543,0.12,2.3e-05,0.178,0.461,127.985,progressivehouse
1,0.527,0.864,-3.115,0.0499,0.00599,0.207,0.422,129.2,progressivehouse
2,0.562,0.823,-5.471,0.0644,0.0,0.136,0.365,128.144,progressivehouse
3,0.546,0.912,-3.447,0.108,1e-06,0.333,0.469,127.984,progressivehouse
4,0.512,0.905,-4.296,0.128,1e-06,0.339,0.334,128.009,progressivehouse


In [15]:
# Same feature columns and genre names as used for the main dataset.
features = [
    'danceability', 'energy', 'loudness', 'tempo',
    'instrumentalness', 'liveness', 'speechiness', 'valence',
]
genres = [
    'progressivehouse', 'basshouse', 'techhouse', 'bigroom', 'trance',
    'futurebass', 'hardstyle', 'bounce', 'tropicalhouse', 'futurehouse',
]

# Encode the validation genres as integers with a fresh encoder.
# NOTE(review): the encoder is fitted on the validation file alone, so the
# codes match the main dataset's codes only if both files contain exactly the
# same set of genre names.
yval = LabelEncoder().fit_transform(validation_data['genre'])
Xval = validation_data[features]

In [16]:
# Split the validation table 85/15, stratified by genre, with a fixed seed.
Xval_train, Xval_test, yval_train, yval_test = train_test_split(
    Xval,
    yval,
    test_size=0.15,
    stratify=yval,
    random_state=1,
)

In [17]:
# Scale the validation features with a scaler fitted on the validation
# training split only. A dedicated name is used so the scaler fitted on the
# main dataset (`scaler`) is not overwritten.
val_scaler = MinMaxScaler().fit(Xval_train)

# Store the scaled frames back into the validation variables. Assigning them
# to X_train / X_test would overwrite the main dataset's splits and leave
# Xval_train / Xval_test unscaled for the model that is fitted on them next.
Xval_train = pd.DataFrame(val_scaler.transform(Xval_train), columns=Xval.columns)
Xval_test = pd.DataFrame(val_scaler.transform(Xval_test), columns=Xval.columns)

In [18]:
# Random forest fitted on the validation file's own training split.
# NOTE(review): this trains a separate model rather than scoring the forest
# fitted on the main dataset - confirm that is the intended validation.
#
# The model is bound to `rf_val` so that `rf` keeps referring to the forest
# that was pickled as final_model.pkl; re-running the save cell can then never
# write this validation model by accident. random_state is fixed so the
# reported accuracies are reproducible.
rf_val = RandomForestClassifier(
    max_depth=25,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=1200,
    random_state=1,
)
rf_val.fit(Xval_train, yval_train)

# Per-class probability estimates for every row of each validation split.
rf_val_train_probs = rf_val.predict_proba(Xval_train)
rf_val_test_probs = rf_val.predict_proba(Xval_test)

In [19]:
# The predicted label is the index of the most probable class in each row;
# accuracy is then computed per split.
val_train_pred = rf_val_train_probs.argmax(axis=-1)
val_train_score = accuracy_score(yval_train, val_train_pred)
print("Train Accuracy:", val_train_score)

val_test_pred = rf_val_test_probs.argmax(axis=-1)
val_test_score = accuracy_score(yval_test, val_test_pred)
print("Test Accuracy:", val_test_score)

Train Accuracy: 1.0
Test Accuracy: 0.782608695652174


In [20]:
# Confusion matrix on the validation training split (rows: true labels,
# columns: predicted labels).
confusion_matrix(y_true=yval_train, y_pred=val_train_pred)

array([[13,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 13,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 13,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 13,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 13,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 12,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 13,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 12,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 12,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 13]], dtype=int64)

In [21]:
# Confusion matrix on the validation test split (rows: true labels, columns:
# predicted labels).
confusion_matrix(y_true=yval_test, y_pred=val_test_pred)

array([[0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 2, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 3, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 3, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 2]], dtype=int64)