In [24]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

import joblib

In [25]:
# Read the dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer="fusebox.csv")

In [26]:
# Columns handled by the numeric branch (min-max scaled).
num_attrs = [
    "time_signature",
    "danceability",
    "energy",
    "loudness",
    "valence",
    "tempo",
    "acousticness",
    "instrumentalness",
]
# Columns handled by the categorical branch (one-hot encoded).
cat_attrs = ["key"]

# Numeric branch: a single min-max scaling step.
num_pipeline = Pipeline(steps=[("min_max", MinMaxScaler())])

# Route each column group to its transformer; any column that appears in
# neither list is dropped from the output.
pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attrs),
        ("cat", OneHotEncoder(), cat_attrs),
    ],
    remainder="drop",
)

In [27]:
# Hold out 20% of the rows as a test set, stratified on "rate" so that both
# partitions follow the distribution of "rate" in the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row numbers, not index labels, so the rows must
# be selected with .iloc. Label-based .loc only gives the same rows while
# `data` still carries the default 0..n-1 index.
# n_splits=1, so the generator produces exactly one (train, test) pair.
train_index, test_index = next(split.split(data, data["rate"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [28]:
# Separate the feature columns from the "rate" target in each partition.
# The targets are copied so they are independent of the source frames.
X_train, y_train = strat_train_set.drop(columns="rate"), strat_train_set["rate"].copy()
X_test, y_test = strat_test_set.drop(columns="rate"), strat_test_set["rate"].copy()

In [29]:
# Fit the preprocessing pipeline on the training features only and transform
# them in the same call.
X_train_prepared = pipeline.fit_transform(X_train)

# Save the fitted pipeline to disk.
joblib.dump(value=pipeline, filename="pipeline.joblib")

# Reuse the already-fitted pipeline on the test features (transform only,
# no refit).
X_test_prepared = pipeline.transform(X_test)

In [30]:
# linear regression model
from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression().fit(X_train_prepared, y_train)

# RMSE of the fitted model on the held-out test set
y_test_predictions = linear_regression.predict(X_test_prepared)
test_error = np.sqrt(mean_squared_error(y_test, y_test_predictions))

# 10-fold cross-validation on the training data. The scorer returns negated
# MSE per fold, so flip the sign before taking the root. The value reported
# as "mean training error" is therefore the mean cross-validated RMSE.
scores = cross_val_score(
    linear_regression,
    X_train_prepared,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
pos_scores = np.sqrt(-scores)
mean_training_error = np.mean(pos_scores)

"test error: %.5f, mean training error: %.5f" % (test_error, mean_training_error)

'test error: 1.75309, mean training error: 1.86119'

In [31]:
# decision tree model
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so that a fresh "Restart & Run All" rebuilds the same model
# and reports the same errors; 42 matches the seed used for the train/test split.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train_prepared, y_train)

# RMSE of the fitted model on the held-out test set
y_test_predictions = decision_tree.predict(X_test_prepared)

test_error = np.sqrt(mean_squared_error(y_test, y_test_predictions))

# 10-fold cross-validation on the training data. The scorer returns negated
# MSE per fold, so flip the sign before taking the root. The value reported
# as "mean training error" is therefore the mean cross-validated RMSE.
scores = cross_val_score(decision_tree, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
pos_scores = np.sqrt(-scores)
mean_training_error = pos_scores.mean()

"test error: %.5f, mean training error: %.5f" % (test_error, mean_training_error)

'test error: 2.37897, mean training error: 2.27865'

In [32]:
# random forest model
from sklearn.ensemble import RandomForestRegressor

# A random forest is built from randomised trees; without a seed every run
# produces a different model and different error figures. Seed it so the
# notebook is reproducible (42 matches the seed used for the train/test split).
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train_prepared, y_train)

# RMSE of the fitted model on the held-out test set
y_test_predictions = random_forest.predict(X_test_prepared)

test_error = np.sqrt(mean_squared_error(y_test, y_test_predictions))

# 10-fold cross-validation on the training data. The scorer returns negated
# MSE per fold, so flip the sign before taking the root. The value reported
# as "mean training error" is therefore the mean cross-validated RMSE.
scores = cross_val_score(random_forest, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
pos_scores = np.sqrt(-scores)
mean_training_error = pos_scores.mean()

"test error: %.5f, mean training error: %.5f" % (test_error, mean_training_error)

'test error: 1.67094, mean training error: 1.71431'

In [33]:
# k-nearest-neighbours model
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=7).fit(X_train_prepared, y_train)

# RMSE of the fitted model on the held-out test set
y_test_predictions = knn.predict(X_test_prepared)
test_error = np.sqrt(mean_squared_error(y_test, y_test_predictions))

# 10-fold cross-validation on the training data. The scorer returns negated
# MSE per fold, so flip the sign before taking the root. The value reported
# as "mean training error" is therefore the mean cross-validated RMSE.
scores = cross_val_score(
    knn,
    X_train_prepared,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
pos_scores = np.sqrt(-scores)
mean_training_error = np.mean(pos_scores)

"test error: %.5f, mean training error: %.5f" % (test_error, mean_training_error)

'test error: 1.72084, mean training error: 1.91336'

In [34]:
# Persist every fitted regressor so it can be reloaded without retraining.
import os

# The dumps below fail with FileNotFoundError if the target directory is
# missing, so create it first (a no-op when it already exists).
os.makedirs("models/regression", exist_ok=True)

joblib.dump(linear_regression, "models/regression/linear_regression.joblib")
joblib.dump(decision_tree, "models/regression/decision_tree.joblib")
joblib.dump(random_forest, "models/regression/random_forest.joblib")
# Kept as the last expression so the cell still echoes the written path.
joblib.dump(knn, "models/regression/knn.joblib")

['models/regression/knn.joblib']