In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

#  Explanation of Dataset

In [275]:
data = pd.read_csv('songs.csv')

In [276]:
data.shape

(1994, 15)

In [277]:
data.dtypes

Index                      int64
Title                     object
Artist                    object
Top Genre                 object
Year                       int64
Beats Per Minute (BPM)     int64
Energy                     int64
Danceability               int64
Loudness (dB)              int64
Liveness                   int64
Valence                    int64
Length (Duration)          int64
Acousticness               int64
Speechiness                int64
Popularity                 int64
dtype: object

In [278]:
data.describe()

Unnamed: 0,Index,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
count,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0
mean,997.5,1992.992979,120.215647,59.679539,53.238215,-9.008526,19.012036,49.408726,262.44333,28.858074,4.994985,59.52658
std,575.762538,16.116048,28.028096,22.154322,15.351507,3.647876,16.727378,24.858212,93.604387,29.011986,4.401566,14.3516
min,1.0,1956.0,37.0,3.0,10.0,-27.0,2.0,3.0,93.0,0.0,2.0,11.0
25%,499.25,1979.0,99.0,42.0,43.0,-11.0,9.0,29.0,212.0,3.0,3.0,49.25
50%,997.5,1993.0,119.0,61.0,53.0,-8.0,12.0,47.0,245.0,18.0,4.0,62.0
75%,1495.75,2007.0,136.0,78.0,64.0,-6.0,23.0,69.75,289.0,50.0,5.0,71.0
max,1994.0,2019.0,206.0,100.0,96.0,-2.0,99.0,99.0,1412.0,99.0,55.0,100.0


# Visualizing

In [279]:
#energy / numerical
plt.figure()

arr = data[['Year', 'Energy']]
bins = [1956, 1966, 1976, 1986, 1996, 2006, 2016, 2019]
labels = ['1956-1965','1966-1975','1976-1985','1986-1995','1996-2005','2006-2015', '2015+']
arr['Year'] = pd.cut(arr['Year'], bins = bins, labels = labels, include_lowest = True)
yearRng = pd.DataFrame({'count': arr.groupby(["Year"]).size()}).reset_index()
energyMean = arr[['Year','Energy']].groupby(['Year'], as_index = False).mean()
energyRng = range(len(energyMean))

x = plt.gca().xaxis
for item in x.get_ticklabels():
    item.set_rotation(90)
energyRng_bar = plt.bar(energyRng, energyMean['Energy'], width = 0.3)    
plt.xticks(energyRng, energyMean['Year'], size = 7)
plt.title("Energy Level By Years")
plt.xlabel('Energy Level')
plt.ylabel('Years')
plt.subplots_adjust(bottom = 0.25) 
plt.show()

<IPython.core.display.Javascript object>

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [199]:
#genres / categorical
plt.figure()

danceCnd = data['Danceability'] > 70  
arr = data[danceCnd][['Top Genre', 'Danceability']]
genres = pd.DataFrame({'count': arr.groupby(["Top Genre"]).size()}).reset_index()
genresMean = arr[['Top Genre','Danceability']].groupby(['Top Genre'], as_index = False).mean()
danceRng = range(len(genresMean))

x = plt.gca().xaxis
for item in x.get_ticklabels():
    item.set_rotation(90)
danceRng_bar = plt.bar(danceRng, genresMean['Danceability'], width = 0.8)    
plt.xticks(danceRng, genresMean['Top Genre'], size = 7)
plt.title("Danceability Level By Genres")
plt.subplots_adjust(bottom = 0.25) 
plt.show()

<IPython.core.display.Javascript object>

# Preprocessing

In [200]:
data.isnull().sum()

Index                     0
Title                     0
Artist                    0
Top Genre                 0
Year                      0
Beats Per Minute (BPM)    0
Energy                    0
Danceability              0
Loudness (dB)             0
Liveness                  0
Valence                   0
Length (Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64

In [227]:
dataCnd = data['Year'] > 1989
data = data[dataCnd]
data = data.drop(['Artist', 'Title', 'Year', 'Popularity', 'Index'], axis = 1)
data.shape

(1130, 10)

In [273]:
y_data = pd.DataFrame(data['Top Genre'])
X_data = data.drop(['Top Genre'], axis = 1)
df = pd.DataFrame()

# Clustering

In [266]:
#Partitioning Clustering / k-means
from sklearn.cluster import KMeans 
from adspy_shared_utilities import plot_labelled_scatter

XData = X_data[['Beats Per Minute (BPM)', 'Energy', 'Valence']]
scaler = StandardScaler()
XData = scaler.fit_transform(XData)

kmeans = KMeans(n_clusters = 3, random_state = 41, init = 'k-means++')
kmeans.fit(XData)
y_pred = kmeans.predict(XData)
plot_labelled_scatter(XData, kmeans.labels_, ['Cluster 1', 'Cluster 2', 'Cluster 3'])

[0 2 0 ... 2 1 0]


<IPython.core.display.Javascript object>

In [280]:
from sklearn.cluster import AgglomerativeClustering

XData = X_data[['Beats Per Minute (BPM)', 'Energy', 'Valence']]
scaler = StandardScaler()
XData = scaler.fit_transform(XData)  

cls = AgglomerativeClustering(n_clusters = 3)
cls_assignment = cls.fit_predict(XData)

plot_labelled_scatter(XData, cls_assignment, ['Cluster 1', 'Cluster 2', 'Cluster 3'])

<IPython.core.display.Javascript object>

In [281]:
from sklearn.cluster import DBSCAN

XData = X_data[['Beats Per Minute (BPM)', 'Energy', 'Valence']]
scaler = StandardScaler()
XData = scaler.fit_transform(XData)  

dbscan = DBSCAN(eps = 0.5, min_samples = 2)

cls = dbscan.fit_predict(XData)
print("Cluster membership values:\n{}".format(cls))

plot_labelled_scatter(XData, cls + 1, ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2'])

Cluster membership values:
[-1  0  0 ...  0  0  0]


<IPython.core.display.Javascript object>