Read the CSV file into a DataFrame named `data` using pandas.

In [1]:
import pandas as pd
# Load the comma-separated dataset into a DataFrame.
csv_path = 'Danceability_Prediction_Spotify.csv'
data = pd.read_csv(csv_path, sep=',')


Check number of rows and columns

In [2]:
# data.shape is a (row_count, column_count) tuple.
rows, columns = data.shape

# %-formatting prints one plain line per call on both Python 2 and Python 3;
# print('label', value) under Python 2 prints a tuple instead.
print('Number of rows: %d' % rows)
print('Number of columns: %d' % columns)

('Number of rows:', 495)
('Number of columns:', 19)


Convert every column to a numeric type; values that cannot be parsed as numbers (for example strings) become NaN

In [3]:
# Coerce each column to a numeric dtype; entries that cannot be parsed become NaN.
data = data.apply(pd.to_numeric, errors='coerce')

Normalize the data
Rescale every column to the range 0 to 1 (min-max normalization)

In [None]:
# Min-max scaling: subtract each column's minimum and divide by its range.
column_min = data.min()
column_range = data.max() - column_min
df_norm = (data - column_min) / column_range
data = df_norm

# Cast the rescaled 'dance' column back to integers.
# NOTE(review): presumably 'dance' is a 0/1 class label - confirm against the CSV.
data['dance'] = data['dance'].astype(int)

Determine correlation between features using scatterplot

In [None]:
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sb
# Seaborn theme applied to the figures that follow.
sb.set(context='notebook', style='whitegrid')

# Columns compared pairwise here; the heatmap cell reuses this list for its axis labels.
columns = [
    'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'danceability',
]

# One scatterplot per pair of columns, with each column's distribution on the diagonal.
sb.pairplot(data.loc[:, columns])
plot.show()


Determine correlation between features using Heatmap

In [None]:
# Correlation coefficient for every pair of the selected columns.
# np.corrcoef treats each row as one variable, hence the transpose.
cor_matrix = np.corrcoef(data[columns].values.T)
sb.set(font_scale=2.5)
cor_heat_map = sb.heatmap(cor_matrix, cbar=True, annot=True, square=True, fmt='.2f',
                          annot_kws={'size': 9},
                          yticklabels=columns,
                          xticklabels=columns)
plot.show()

# Drop the original 'danceability' column and give the 'dance' column that name.
# The guard and the non-in-place calls keep this cell re-runnable: without them a
# second run would drop the freshly renamed column.
if 'dance' in data.columns:
    data = data.drop('danceability', axis=1).rename(columns={'dance': 'danceability'})


Determine correlation between features using mean

In [None]:
# Mean of every remaining column within each 'danceability' group.
mean_by_class = data.groupby('danceability').mean()
mean_by_class

Remove features whose values are all NaN, since such features cannot play any role in the prediction

In [None]:
# Keep only the columns that contain at least one non-null value.
is_all_nan = data.isnull().all()
data = data.loc[:, ~is_all_nan]

Remove rows that contain NaN values

In [None]:
# dropna() returns a new DataFrame; assign it back so the rows are actually removed.
data = data.dropna()
data.head()

Select the feature vector and the class label

In [None]:
# Columns used as model inputs, in the order they appear in X.
feature_names = ['valence', 'energy', 'loudness', 'acousticness']
label_name = 'danceability'

X = data.loc[:, feature_names].values
y = data.loc[:, label_name].values
y

Plot relation between features in multi dimensions

In [None]:
# NOTE: this cell is disabled - every statement is commented out, so it draws nothing.
# The three axes plot X[:, 0], X[:, 1], X[:, 2] and the point colour encodes X[:, 3];
# the axis labels below follow the column order of X (valence, energy, loudness).
# NOTE(review): plt.hot() is called for its side effect and its return value is passed
# as cmap; cmap='hot' is probably what was meant - confirm before re-enabling.
# from mpl_toolkits.mplot3d import Axes3D
# import matplotlib.pyplot as plt

# fig = plt.figure(figsize=(20,10))
# ax = fig.add_subplot(111, projection='3d')
# ax.set_facecolor('blue')
# ax.patch.set_alpha(0.2)
# sp = ax.scatter(X[:,0], X[:,1], X[:,2], c=X[:,3], cmap=plt.hot())
# plt.colorbar(sp)
# ax.set_xlabel('Valence')
# ax.set_ylabel('Energy')
# ax.set_zlabel('Loudness')
# plt.show()

Without PCA

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

CV = 5  # number of cross-validation folds

# Not used in this cell; it is created here because the "With PCA" cell reads `pca`.
pca = decomposition.PCA(n_components=4)

lr = LogisticRegression()

# Cross-validated accuracy computed on the training split only.
scores = cross_val_score(lr, X_train, y_train, cv=CV)
print("LR Training Accuracy without PCA : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Score the held-out rows with a model fitted on the training split.
# Cross-validating on the test split alone would never evaluate the trained model.
lr.fit(X_train, y_train)
predicted = lr.predict(X_test)
print("LR Testing Accuracy without PCA : %0.2f" % metrics.accuracy_score(y_test, predicted))





With PCA

In [None]:

# Features: every column except the last two; label: the last column.
# NOTE(review): ':-2' also leaves out the second-to-last column - confirm this is intended.
X = data.iloc[:, :-2].values
y = data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

CV = 5  # number of cross-validation folds

# Create the projection in this cell so it does not depend on leftover kernel state.
# Fit it on the training split only, then apply the same projection to both splits so
# the test rows live in the same 4-component space the model is trained in.
pca = decomposition.PCA(n_components=4)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

lr = LogisticRegression()

# Cross-validated accuracy computed on the projected training split only.
scores = cross_val_score(lr, X_train, y_train, cv=CV)
print("LR Training Accuracy with PCA : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Score the projected held-out rows with a model fitted on the projected training split.
lr.fit(X_train, y_train)
predicted = lr.predict(X_test)
print("LR Testing Accuracy with PCA : %0.2f" % metrics.accuracy_score(y_test, predicted))


Get classification report

In [None]:
from sklearn.metrics import classification_report
# sklearn.cross_validation was removed in scikit-learn 0.20; sklearn.model_selection,
# which the earlier cells already import from, provides cross_val_predict.
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every row (10 folds), compared against the true labels.
y_pred = cross_val_predict(lr, X, y, cv=10)
print(classification_report(y, y_pred))