In [46]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict

import joblib

In [47]:
data = pd.read_csv("fusebox.csv")

In [48]:
num_attrs = ["time_signature", "danceability", "energy", "loudness", "valence", "tempo", "acousticness", "instrumentalness"]
cat_attrs = ["key"]

num_pipeline = Pipeline([
    ("min_max", MinMaxScaler())
])

pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
], remainder="drop")

In [49]:
# stratify test sample so it represents the full dataset
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["rate"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [50]:
X_train = strat_train_set.drop("rate", axis=1)
X_test = strat_test_set.drop("rate", axis=1)
y_train = strat_train_set["rate"].copy()
y_test = strat_test_set["rate"].copy()

In [51]:
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

y_train_prepared = (y_train >= 5)
y_test_prepared = (y_test >= 5)

In [52]:
from sklearn.svm import SVC 

svc_clf = SVC()
svc_clf.fit(X_train_prepared, y_train_prepared)

y_test_predictions = svc_clf.predict(X_test_prepared)

test_error = sum(y_test_predictions != y_test_prepared)/len(y_test_predictions)
train_error = sum(sgd_clf.predict(X_train_prepared) != y_train_prepared)/len(y_train_prepared)

"train error: %.5f and test error %.5f" % (train_error, test_error)

'train error: 0.23913 and test error 0.24138'

In [53]:
cross_val_score(sgd_clf, X_train_prepared, y_train_prepared, cv=3, scoring="accuracy")

array([0.76623377, 0.75324675, 0.55263158])

In [54]:
y_train_predict = cross_val_predict(sgd_clf, X_train_prepared, y_train_prepared, cv=3)
confusion_matrix(y_train_prepared, y_train_predict)

array([[  7,  48],
       [ 23, 152]])

In [55]:
precision_score(y_train_prepared, y_train_predict)

0.76

In [56]:
recall_score(y_train_prepared, y_train_predict)

0.8685714285714285

In [57]:
f1_score(y_train_prepared, y_train_predict)

0.8106666666666666

In [64]:
joblib.dump(pipeline, "models/classification/binary/pipeline.joblib")
joblib.dump(sgd_clf, "models/classification/binary/stochastic_gradient_descent.joblib")
joblib.dump(random_forest_clf, "models/classification/binary/random_forest.joblib")

['models/classification/binary/random_forest.joblib']