In [51]:
%matplotlib inline

from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.svm import SVC

In [52]:
# Load the dataset from a CSV file located relative to the working directory.
data = pd.read_csv(filepath_or_buffer="fusebox.csv")

In [53]:
# Columns that are min-max scaled, and columns that are one-hot encoded.
num_attrs = ["time_signature", "danceability", "energy", "loudness", "valence", "tempo", "acousticness", "instrumentalness"]
cat_attrs = ["key"]

# Kept as a (single-step) Pipeline so further numeric steps can be appended
# without changing the structure of the ColumnTransformer below.
num_pipeline = Pipeline([
    ("min_max", MinMaxScaler())
])

# Every column not listed in num_attrs / cat_attrs is dropped (remainder="drop").
# handle_unknown="ignore" makes the encoder emit an all-zero row for a category
# that was not present when it was fitted, instead of raising at transform
# time; this matters because the fitted transformer is reused on other data.
pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attrs),
], remainder="drop")

In [54]:
# Stratified 80/20 split on the "rate" label so both subsets keep the class
# proportions of the full dataset; the seed makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while `data` carries a default
# RangeIndex and would pick wrong rows (or raise) after any filtering/reindexing.
# n_splits=1, so the single split is taken directly instead of looping.
train_index, test_index = next(split.split(data, data["rate"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [55]:
# Separate the feature columns from the "rate" target for each subset.
# The targets are copied so they are independent of the split frames.
X_train, y_train = strat_train_set.drop(columns="rate"), strat_train_set["rate"].copy()
X_test, y_test = strat_test_set.drop(columns="rate"), strat_test_set["rate"].copy()

In [56]:
# Fit the preprocessing on the training features only, then apply the already
# fitted transformer to the test features.
X_train_prepared = pipeline.fit_transform(X=X_train)
X_test_prepared = pipeline.transform(X=X_test)

In [57]:
# Fit a support-vector classifier (library-default hyperparameters) on the
# prepared training features. SVC is imported in the notebook's import cell.
svc_clf = SVC()
svc_clf.fit(X_train_prepared, y_train)

# Predict once per subset and keep the predictions for the error rates below.
y_train_predictions = svc_clf.predict(X_train_prepared)
y_test_predictions = svc_clf.predict(X_test_prepared)

# Error rate = fraction of misclassified samples. Taking the mean of the
# boolean mismatch mask is vectorised, unlike the builtin sum() over a Series.
train_error = (y_train_predictions != y_train).mean()
test_error = (y_test_predictions != y_test).mean()

"train error: %.5f and test error %.5f" % (train_error, test_error)

'train error: 0.64773 and test error 0.82090'

In [58]:
# 2-fold cross-validated accuracy of the classifier on the prepared training
# features. NOTE(review): the features were transformed by a preprocessor
# fitted on the whole training set before the folds are made.
cross_val_score(
    estimator=svc_clf,
    X=X_train_prepared,
    y=y_train,
    scoring="accuracy",
    cv=2,
)

array([0.18939394, 0.24242424])

In [59]:
# Create the artefact directory first so the cell also works on a fresh
# checkout; joblib.dump does not create missing parent directories.
Path("models/classification/multiclass").mkdir(parents=True, exist_ok=True)

# Persist the fitted preprocessing and the fitted classifier as separate files.
joblib.dump(pipeline, "models/classification/multiclass/pipeline.joblib")
joblib.dump(svc_clf, "models/classification/multiclass/support_vector_machine.joblib")

['models/classification/multiclass/support_vector_machine.joblib']