# Read the data set

In [None]:
import pandas as pd
# Load the raw Spotify track table; the file is plain comma-separated,
# which is already the pandas default delimiter.
csv_path = 'Danceability_Prediction_Spotify.csv'
data = pd.read_csv(csv_path)

# Convert all dataset values to numeric data

In [None]:
# Coerce every column to a numeric dtype; any value that cannot be parsed
# as a number becomes NaN (errors='coerce') instead of raising.
data = data.apply(pd.to_numeric, errors='coerce')

# Scale all column values into the range 0 to 1


In [None]:
# Min-max scaling, column by column: (value - min) / (max - min).
column_min = data.min()
column_span = data.max() - column_min
df_norm = (data - column_min) / column_span
data = df_norm
# The 'dance' column is cast back to an integer dtype after scaling.
data['dance'] = data['dance'].astype(int)

# Correlation between features, scatterplot

In [None]:
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sb
# Columns shown in the pairwise scatter-plot grid (and reused by the
# correlation heat map in the following cell).
columns = [
    'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'danceability',
]
sb.set(style='whitegrid', context='notebook')
pairplot_frame = data[columns]
sb.pairplot(pairplot_frame)
plot.show()

In [None]:
import numpy as np
import seaborn as sb
# Correlation matrix between the selected columns: each column of the
# frame is treated as one variable (rowvar=False).
cor_matrix = np.corrcoef(data[columns].values, rowvar=False)
sb.set(font_scale=1.5)
cor_heat_map = sb.heatmap(
    cor_matrix,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 9},
    square=True,
    cbar=True,
    xticklabels=columns,
    yticklabels=columns,
)
plot.show()
# From here on the 'dance' column takes over the name 'danceability':
# the original 'danceability' column is removed and 'dance' is renamed.
data.drop(columns='danceability', inplace=True)
data.rename(columns={'dance': 'danceability'}, inplace=True)

# Correlation by computing the mean

In [None]:
# Mean of every remaining column within each 'danceability' group;
# as the last expression of the cell the result is displayed.
data.groupby(by='danceability').mean()

# Delete the columns whose values are all NaN

In [None]:
# Keep only the columns that are not entirely NaN.
not_all_nan = ~data.isnull().all()
data = data.loc[:, not_all_nan]

# Delete the rows that have NaN values

In [None]:
# DataFrame.dropna() returns a new frame and leaves `data` untouched, so
# the result has to be assigned back for the rows to actually be removed.
# Only the columns used for plotting/modelling (the `columns` list defined
# earlier, whose 'danceability' entry is now the class label) must be
# complete; restricting the check to them avoids discarding rows because
# of NaNs in unrelated columns.
data = data.dropna(subset=columns)

In [None]:
# Feature subset used for the 3D plot and the SVM experiments without PCA.
features = [
    'energy',
    'valence',
    'loudness',
    'acousticness',
]

# Selecting feature vectors and class label

In [None]:
# Class label (Series) and feature matrix (plain NumPy array, one column
# per entry of `features`, in that order).
y = data['danceability']
X = data.loc[:, features].values

# Plot features in multiple dimensions

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# 3D scatter of the first three feature columns of X; the fourth column
# drives the point colour.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')
ax.set_facecolor('blue')
ax.patch.set_alpha(0.2)
# Pass the colormap by name. plt.hot() returns None and only switches the
# global default colormap as a side effect, so `cmap=plt.hot()` worked by
# accident and silently changed the colours of every later plot.
sp = ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=X[:, 3], cmap='hot')
colour_bar = plt.colorbar(sp)
# X is built from `features`, whose fourth entry is 'acousticness'.
colour_bar.set_label('Acousticness')
ax.set_xlabel('Energy')
ax.set_ylabel('Valence')
ax.set_zlabel('Loudness')
plt.show()

# Apply kernels to the SVM classifier without PCA

In [None]:
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import decomposition

# Full list of audio feature columns (not used in this cell; it is the
# input of the PCA experiments further down).
columns = [
    'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Feature matrix restricted to `features`, plus the class label.
y = data['danceability']
X = data[features].values

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)

# Number of cross-validation folds, shared by the following cells.
CV = 5

linear_svc = svm.SVC(C=1, kernel='linear')
rbf_svc = svm.SVC(C=1, kernel='rbf', gamma=0.6)
poly_svc = svm.SVC(C=1, kernel='poly', degree=3)

# Mean cross-validated accuracy on the training split, one line per kernel.
for classifier, message in (
    (linear_svc, "Training Accuracy(Linear Kernel) without PCA : %0.2f "),
    (rbf_svc, "Training Accuracy(RBF Kernel) without PCA : %0.2f "),
    (poly_svc, "Training Accuracy(Polynomial Kernel) without PCA : %0.2f "),
):
    scores = cross_val_score(classifier, X_train, y_train, cv=CV)
    print(message % scores.mean())

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
# Test accuracy must come from a model trained on the training split and
# evaluated on the held-out test split. cross_val_predict(X_test, y_test)
# instead re-trained fresh copies of the model on folds of the test split
# and never used the training data at all (cross_val_score works on clones,
# so the estimators are still unfitted at this point).
linear_svc.fit(X_train, y_train)
linear_predicted = linear_svc.predict(X_test)
print("Testing Accuracy(Linear Kernel) without PCA : %0.2f" %metrics.accuracy_score(y_test, linear_predicted))


rbf_svc.fit(X_train, y_train)
rbf_predicted = rbf_svc.predict(X_test)
print("Testing Accuracy(RBF Kernel) without PCA : %0.2f" %metrics.accuracy_score(y_test, rbf_predicted))


poly_svc.fit(X_train, y_train)
poly_predicted = poly_svc.predict(X_test)
print("Testing Accuracy(Poly Kernel) without PCA : %0.2f" %metrics.accuracy_score(y_test, poly_predicted))

# Apply kernels to the SVM classifier with PCA

In [None]:
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import decomposition

# This time the feature matrix is built from the full `columns` list.
y = data['danceability']
X = data[columns].values

# Same reproducible 70/30 split as in the experiment without PCA.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)

CV = 5

# Fit the PCA on the training split only, then project the training split
# onto its components (fit() returns the fitted estimator itself).
pca = decomposition.PCA(n_components=10)
X_train = pca.fit(X_train).transform(X_train)

linear_svc = svm.SVC(C=1, kernel='linear')
rbf_svc = svm.SVC(C=1, kernel='rbf', gamma=0.6)
poly_svc = svm.SVC(C=1, kernel='poly', degree=3)

# Mean cross-validated accuracy on the projected training split.
for classifier, message in (
    (linear_svc, "Training Accuracy(Linear Kernel) with PCA : %0.2f "),
    (rbf_svc, "Training Accuracy(RBF Kernel) with PCA : %0.2f "),
    (poly_svc, "Training Accuracy(Polynomial Kernel) with PCA : %0.2f "),
):
    scores = cross_val_score(classifier, X_train, y_train, cv=CV)
    print(message % scores.mean())

In [None]:
# Project the test split with the PCA that was fitted on the training
# split in the previous cell. Fitting a second PCA on the test split
# produces a different set of axes, so the test features would no longer
# live in the space the classifiers are trained in.
X_test = pca.transform(X_test)

# Train on the (already projected) training split and score on the
# held-out test split; cross_val_predict on the test split re-trained the
# models inside the test data and ignored the training data.
linear_svc.fit(X_train, y_train)
linear_predicted = linear_svc.predict(X_test)
print("Testing Accuracy(Linear Kernel) with PCA : %0.2f" %metrics.accuracy_score(y_test, linear_predicted))

rbf_svc.fit(X_train, y_train)
rbf_predicted = rbf_svc.predict(X_test)
print("Testing Accuracy(RBF Kernel) with PCA : %0.2f" %metrics.accuracy_score(y_test, rbf_predicted))

poly_svc.fit(X_train, y_train)
poly_predicted = poly_svc.predict(X_test)
# Label corrected: this cell reports the result *with* PCA.
print("Testing Accuracy(Poly Kernel) with PCA : %0.2f" %metrics.accuracy_score(y_test, poly_predicted))


# Find the best parameters by applying Grid Search on the SVM classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Candidate hyper-parameters, one dictionary per kernel.
# gamma=0 was removed from the RBF grid: it is not a usable kernel width
# (depending on the scikit-learn version it is rejected with an error or
# collapses the kernel to a constant), so it only wasted fits or produced
# failed candidates. The order of the remaining values is unchanged.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-2, 0.1, 1e-4, 1e-5],
     'C': [1, 10, 100, 1000, 10000, 100000]},
    {'kernel': ['poly'], 'degree': [2, 3, 4],
     'C': [1, 10, 100]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Scoring metrics to tune for; one grid search is run per entry.
scores = ['accuracy']

X = data[features].values
y = data['danceability']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)

    # Exhaustive search with CV-fold cross-validation on the training split.
    clf = GridSearchCV(SVC(), tuned_parameters, cv=CV, scoring=score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on the dataset:")
    print(clf.best_params_)
    print("Grid scores on data set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # Report mean CV score with a +/- two-standard-deviation band.
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

    print("--------- Detailed classification report without PCA ---------")
    print("The scores are computed on the dataset.")
    # The report uses the held-out test split with the refitted best model.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("------------------------------------------------------")

# Apply the params found in the step above and find the accuracy without PCA

In [None]:
# Rebuild the feature matrix / label and the reproducible 70/30 split.
y = data['danceability']
X = data[features].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)

# RBF SVM with hard-coded hyper-parameters (gamma=0.01, C=10).
rbf_svc = svm.SVC(C=10, gamma=0.01, kernel='rbf')
scores = cross_val_score(rbf_svc, X_train, y_train, cv=CV)
mean_cv_accuracy = scores.mean()
print("Training Accuracy(RBF Kernel) after Grid Search without PCA : %0.2f " % mean_cv_accuracy)

In [None]:
# Evaluate on the held-out test split with a model trained on the training
# split. cross_val_predict(X_test, y_test) re-trained the model on folds of
# the test split and never used the training data.
rbf_svc.fit(X_train, y_train)
rbf_predicted = rbf_svc.predict(X_test)
print("Test Accuracy(RBF Kernel) after Grid Search without PCA : %0.2f" %metrics.accuracy_score(y_test, rbf_predicted))

# Apply the params found in the step above and find the accuracy with PCA

In [None]:
# Feature matrix from the full `columns` list, plus the class label.
y = data['danceability']
X = data[columns].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)

# Fit the PCA on the training split and project that split
# (fit() returns the fitted estimator itself).
pca = decomposition.PCA(n_components=10)
X_train = pca.fit(X_train).transform(X_train)

# RBF SVM with hard-coded hyper-parameters (gamma=0.01, C=10).
rbf_svc = svm.SVC(C=10, gamma=0.01, kernel='rbf')
scores = cross_val_score(rbf_svc, X_train, y_train, cv=CV)
mean_cv_accuracy = scores.mean()
print("Training Accuracy(RBF Kernel) after Grid Search with PCA : %0.2f " % mean_cv_accuracy)

In [None]:
# Project the test split with the PCA fitted on the training split in the
# previous cell; a PCA re-fitted on the test split would define different
# axes than the ones the classifier is trained on.
X_test = pca.transform(X_test)

# Train on the projected training split and score on the held-out test
# split instead of cross-validating inside the test split.
rbf_svc.fit(X_train, y_train)
rbf_predicted = rbf_svc.predict(X_test)
print("Test Accuracy(RBF Kernel) after Grid Search with PCA : %0.2f" %metrics.accuracy_score(y_test, rbf_predicted))