In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import MultiTaskLasso
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
import pickle

In [None]:
# Load the extracted-feature table and keep only the rows that have no missing values.
data = pd.read_csv("parser_output/extracted_features_split_4_18_upsample.csv").dropna()

In [None]:
# Display the loaded frame after the NaN rows were dropped in the previous cell.
data

In [None]:
# TODO: look into scikit-learn's feature-selection module - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
# TODO: consider a multi-output regressor - https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html#sklearn.multioutput.MultiOutputRegressor

In [None]:
# Targets: the nine audio-attribute columns that every model below predicts jointly.
y = data[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
          'instrumentalness', 'liveness', 'valence', 'tempo']]
# Candidate predictors: every column except the first one and the last 12.
# NOTE(review): presumably the trailing 12 columns hold the targets plus metadata -
# confirm against the CSV schema.
x = data[data.columns[1:len(data.columns)-12]]

# A fixed seed makes the split, and therefore every score and exported model that
# depends on it, reproducible under Restart Kernel -> Run All.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, which is an
# unusually large share - confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state=42)
# Drop the leading column from both predictor frames.
# NOTE(review): presumably an identifier/index column - confirm.
X_train = X_train.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]

In [None]:
# Display the training predictors produced by the split above.
X_train

## REGRESSION MODELS

In [None]:
# Linear regression baseline, fitted on all target columns at once.
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
ypred_train, ypred_test = reg_model.predict(X_train), reg_model.predict(X_test)
# Report R^2 on the training rows and on the held-out rows, one item per line.
print(
    "LINEAR REGRESSION",
    "Train Score: ",
    r2_score(y_train, ypred_train),
    "Test Score: ",
    r2_score(y_test, ypred_test),
    sep="\n",
)

In [None]:
# Display the fitted coefficients of the linear regression model.
reg_model.coef_

In [None]:
# K-nearest-neighbours regressor with scikit-learn's default settings.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)
ypred_train, ypred_test = knn_model.predict(X_train), knn_model.predict(X_test)
# Report R^2 on the training rows and on the held-out rows, one item per line.
print(
    "KNN REGRESSION",
    "Train Score: ",
    r2_score(y_train, ypred_train),
    "Test Score: ",
    r2_score(y_test, ypred_test),
    sep="\n",
)

In [None]:
# Linear support-vector regression. LinearSVR is wrapped in MultiOutputRegressor
# so that it can be fitted against the multi-column target frame.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html#sklearn.svm.LinearSVR
svr_model = MultiOutputRegressor(LinearSVR())
svr_model.fit(X_train, y_train)
ypred_train, ypred_test = svr_model.predict(X_train), svr_model.predict(X_test)
# Report R^2 on the training rows and on the held-out rows, one item per line.
print(
    "SVR REGRESSION",
    "Train Score: ",
    r2_score(y_train, ypred_train),
    "Test Score: ",
    r2_score(y_test, ypred_test),
    sep="\n",
)

In [None]:
# Decision-tree regressor.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
# random_state is fixed so that tie-breaking between equally good splits, and
# therefore the reported scores, are the same on every run.
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
ypred_train = tree_model.predict(X_train)
ypred_test = tree_model.predict(X_test)
# Report R^2 on the training rows and on the held-out rows.
print("TREE REGRESSION")
print("Train Score: ")
print(r2_score(y_train, ypred_train))
print("Test Score: ")
print(r2_score(y_test, ypred_test))

In [None]:
# Random-forest regressor with default hyper-parameters.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
# random_state is fixed so the bootstrap samples and feature draws, and hence the
# scores, are reproducible. rf_model is also the base estimator handed to RFECV in
# the feature-selection section, so the seed makes the selected feature set stable too.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
ypred_train = rf_model.predict(X_train)
ypred_test = rf_model.predict(X_test)
# Report R^2 on the training rows and on the held-out rows.
print("RANDOM FOREST REGRESSION")
print("Train Score: ")
print(r2_score(y_train, ypred_train))
print("Test Score: ")
print(r2_score(y_test, ypred_test))

# Feature Selection

In [None]:
# Recursive feature elimination with 5-fold cross-validation around the random
# forest, removing features in steps of 25 and using all available cores.
selected_RF = RFECV(estimator=rf_model, step=25, cv=5, n_jobs=-1).fit(X_train, y_train)
ypred_train, ypred_test = selected_RF.predict(X_train), selected_RF.predict(X_test)
# Report R^2 on the training rows and on the held-out rows, one item per line.
print(
    "RANDOM FOREST REGRESSION - SELECTED FEATURES",
    "Train Score: ",
    r2_score(y_train, ypred_train),
    "Test Score: ",
    r2_score(y_test, ypred_test),
    sep="\n",
)

In [None]:
# One-column frame (column label 0) of the feature names the selector saw at fit time.
names = pd.DataFrame(selected_RF.feature_names_in_)
# Rank 1 marks the features RFECV kept; the filtered frame is wrapped in a list.
best_features = [names.loc[selected_RF.ranking_ == 1]]
# The selected names are written to CSV in the last cell of the notebook.
best_features

In [None]:
# Restrict both predictor frames to the columns RFECV ranked 1 (i.e. kept).
_selected_cols = names.loc[selected_RF.ranking_ == 1, 0].to_numpy()
featureSelectedX_train = X_train[_selected_cols]
featureSelectedX_test = X_test[_selected_cols]

In [None]:
# Display the full training predictors again; the feature-selected subset is
# held separately in featureSelectedX_train.
X_train

In [None]:
# Display the names of the features with RFECV rank 1, as a NumPy array.
names.iloc[selected_RF.ranking_==1][0].to_numpy()

# Parameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# --- Search space for the random-forest hyper-parameters ---
# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(v) for v in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; on current versions
# every candidate that sampled it fails to fit. For RandomForestRegressor 'auto'
# meant "use all features", which 1.0 expresses on old and new versions alike.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited).
max_depth = [int(v) for v in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Create the random grid.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Base model to tune. Seeded so that, together with the search's own random_state,
# repeated runs evaluate the same candidates and produce the same scores.
rf = RandomForestRegressor(random_state=42)
# Random search over 100 sampled combinations with 3-fold cross-validation,
# using all available cores.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the search on the feature-selected training columns.
rf_random.fit(featureSelectedX_train, y_train)

# Score the search object on the same feature-selected columns it was fitted on.
ypred_train = rf_random.predict(featureSelectedX_train)
ypred_test = rf_random.predict(featureSelectedX_test)
print("RANDOM FOREST REGRESSION")
print("Train Score: ")
print(r2_score(y_train, ypred_train))
print("Test Score: ")
print(r2_score(y_test, ypred_test))

In [None]:
# Display the hyper-parameter combination chosen by the randomized search.
rf_random.best_params_

# Model Export

In [None]:
# Persist the fitted randomized-search object to disk.
filename = 'model_4_20.sav'
# The context manager flushes and closes the handle even if pickling raises;
# a bare open() inside the call left the file open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_random, model_file)

In [None]:
# Reload the pickled search object from disk.
filename = 'model_4_20.sav'
# WARNING: pickle.load can execute arbitrary code embedded in the file - only
# load pickles produced by this notebook or another trusted source.
# The context manager closes the handle as soon as loading finishes.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Display the best hyper-parameters stored on the reloaded search object.
model.best_params_

In [None]:
# Score the reloaded model. The pickled search object was fitted on the
# feature-selected columns (rf_random.fit(featureSelectedX_train, ...)), so it must
# be given the same columns here; passing the full X_train / X_test frames raises a
# feature mismatch error unless RFECV happened to keep every feature.
ypred_train = model.predict(featureSelectedX_train)
ypred_test = model.predict(featureSelectedX_test)
print("RANDOM FOREST REGRESSION")
print("Train Score: ")
print(r2_score(y_train, ypred_train))
print("Test Score: ")
print(r2_score(y_test, ypred_test))

In [None]:
# Final random forest trained on the feature-selected columns.
# NOTE(review): the hyper-parameters are hard-coded - presumably copied from an
# earlier best_params_ output; confirm they still match the current search result.
# random_state is fixed so the exported model can be rebuilt identically.
rf_final = RandomForestRegressor(
    n_estimators=400,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=None,
    bootstrap=False,
    random_state=42,
).fit(featureSelectedX_train, y_train)

In [None]:
# Score the final forest on the feature-selected train and test columns.
ypred_train, ypred_test = (
    rf_final.predict(featureSelectedX_train),
    rf_final.predict(featureSelectedX_test),
)
# Report R^2 on the training rows and on the held-out rows, one item per line.
print(
    "RANDOM FOREST REGRESSION",
    "Train Score: ",
    r2_score(y_train, ypred_train),
    "Test Score: ",
    r2_score(y_test, ypred_test),
    sep="\n",
)

In [None]:
# Persist the final random-forest model to disk.
filename = 'model_final.sav'
# The context manager flushes and closes the handle even if pickling raises;
# a bare open() inside the call left the file open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_final, model_file)

In [None]:
# Write the names of the RFECV-selected (rank 1) features to a CSV file.
_selected_feature_names = names.loc[selected_RF.ranking_ == 1, 0].to_numpy()
pd.DataFrame(_selected_feature_names).to_csv("Selected_Feature_List.csv")