## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


# Spotify Data Analysis

The datasets we are using are from a kaggle set that uses the Spotify API to query song data. https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

In [4]:
df = pd.read_csv("archive/data.csv")
df_artists = pd.read_csv("archive/data_by_artist.csv")
df_genres = pd.read_csv("archive/data_by_genres.csv")
df_year = pd.read_csv("archive/data_by_year.csv")
df_w_genres = pd.read_csv("archive/data_w_genres.csv")

In [28]:
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


Most of the other datasets are aggregations of this one.  The genre data is the only one that presents information that is not found in this dataset, and it provides aggregations of the data at the genre level or includes what genres an artist encapsualtes. 

In [29]:
df_artists.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,33.076923,5,1,26
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.285714,0,1,7
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.444444,0,1,27
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,232428.111111,0.429333,0.037534,0.216111,-11.447222,0.086,120.329667,0.458667,42.555556,11,1,9


In [30]:
df_genres.head()

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode
0,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.166667,5,1
1,[],0.679245,0.521473,229852.6,0.401522,0.196737,0.221586,-12.679076,0.112267,112.025168,0.51592,21.682005,7,1
2,a cappella,0.621532,0.577017,193652.2,0.345694,0.003799,0.127087,-12.770211,0.095324,111.81323,0.453186,43.351819,11,1
3,abstract,0.359395,0.4595,343018.5,0.487,0.7914,0.11948,-14.092,0.04342,124.7432,0.30499,41.5,1,1
4,abstract beats,0.353347,0.6944,233824.4,0.6134,0.349403,0.102453,-6.6998,0.143453,119.3984,0.634187,58.6,10,0


In [31]:
df_w_genres.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12,['show tunes']
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,33.076923,5,1,26,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.285714,0,1,7,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.444444,0,1,27,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,232428.111111,0.429333,0.037534,0.216111,-11.447222,0.086,120.329667,0.458667,42.555556,11,1,9,[]


In [32]:
df_year.head()

Unnamed: 0,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode
0,1921,0.895823,0.425661,229911.914062,0.236784,0.32233,0.215814,-17.095437,0.077258,100.397758,0.425495,0.351562,7,1
1,1922,0.939236,0.48,167904.541667,0.237026,0.44047,0.238647,-19.179958,0.115419,101.376139,0.534056,0.138889,10,1
2,1923,0.976329,0.568462,178356.301775,0.246936,0.401932,0.236656,-14.373882,0.098619,112.456598,0.624788,5.727811,0,1
3,1924,0.935575,0.548654,188461.649789,0.347033,0.583955,0.237875,-14.202304,0.09021,120.653359,0.668574,0.603376,10,1
4,1925,0.965422,0.57189,184130.69962,0.264373,0.408893,0.243094,-14.516707,0.115457,115.671715,0.61643,2.707224,5,1


### Verification that 4 other datasets are aggregations of the original dataset

In [8]:
df.groupby("year").mean().head()

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1921,0.895823,0.425661,229911.914062,0.236784,0.054688,0.32233,5.03125,0.215814,-17.095438,0.65625,0.351562,0.077258,100.397758,0.425495
1922,0.939236,0.48,167904.541667,0.237026,0.0,0.44047,5.236111,0.238647,-19.179958,0.625,0.138889,0.115419,101.376139,0.534056
1923,0.976329,0.568462,178356.301775,0.246936,0.0,0.401932,4.846154,0.236656,-14.373882,0.775148,5.727811,0.098619,112.456598,0.624788
1924,0.935575,0.548654,188461.649789,0.347033,0.0,0.583955,5.637131,0.237875,-14.202304,0.751055,0.603376,0.09021,120.653359,0.668574
1925,0.965422,0.57189,184130.69962,0.264373,0.0,0.408893,5.171103,0.243094,-14.516707,0.718631,2.707224,0.115457,115.671715,0.61643


The columns in this dataset mostly go over technical muscial information, more detail can be found at this link: https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/

This link contains a detailed description of the popularity variable
https://developer.spotify.com/documentation/web-api/reference/tracks/get-track/

In [55]:
#Create Dataframe for Predictors
X = df[['acousticness', 
       'danceability',
       'energy',
       'year', 
        'loudness',
       'explicit',
       'instrumentalness', 
       'key', 
       'liveness', 
       'mode', 
       'speechiness', 
       'tempo',
        'valence', 
        'key', 
        'mode', 
        'explicit', 
        'duration_ms']]
#Add 60, which is max range of loudness, to loudness column to avoid having any negative values, without affeting the data
X['loudness'] = X['loudness'] + 60

#Create dataframe for variable to be predicted
y = df['popularity']

#Split up the dataset into training data and test data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Gaussian Naive Bayes
#### Use all Features

In [56]:
#Gaussian Naive Bayes Model Predict, Fit, and Score
#Provide two metrics of accuracy determination: model.score & RMSE
#Note: Only parameter that can be changed in Gaussian is the var_smoothing parameter
v_smooth = [0.0000000001,0.000000001,0.00000001,0.0000001, 0.000001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 5, 10]
for i in v_smooth:
    nb_model = GaussianNB(var_smoothing= i)
    nb_model.fit(X_train, y_train)
    y_pred = nb_model.predict(X_test)
    print("var_smoothing parameter: ", i)
    print("Acuracy Score: ", nb_model.score(X_test, y_test))
    print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print("-" * 65)

    

var_smoothing parameter:  1e-10
Acuracy Score:  0.18683420634453535
RMSE: 11.914403238593655
-----------------------------------------------------------------
var_smoothing parameter:  1e-09
Acuracy Score:  0.18671649696898357
RMSE: 12.048189832072326
-----------------------------------------------------------------
var_smoothing parameter:  1e-08
Acuracy Score:  0.1831852157024307
RMSE: 15.598143212735602
-----------------------------------------------------------------
var_smoothing parameter:  1e-07
Acuracy Score:  0.16223294685421694
RMSE: 36.04884282357534
-----------------------------------------------------------------
var_smoothing parameter:  1e-06
Acuracy Score:  0.16196810075922546
RMSE: 38.243723238402694
-----------------------------------------------------------------
var_smoothing parameter:  0.0001
Acuracy Score:  0.16196810075922546
RMSE: 38.243723238402694
-----------------------------------------------------------------
var_smoothing parameter:  0.001
Acuracy Score: 

In [57]:
#Utilize GridSearchCV to find the most parameters
pipeline = Pipeline([
    ('clf', GaussianNB())
])

parameters = {
    'clf__priors': [None],
    'clf__var_smoothing': [0.00000001, 0.000000001, 0.00000001]
}

cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)

#Regular Accuracy Score
print("Accuracy Score:", cv.score(X_test, y_test))
print(cv.best_params_)
print("-" * 65)


#RMSE 
y_pred = cv.predict(X_test)
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))



Accuracy Score: 0.18671649696898357
{'clf__priors': None, 'clf__var_smoothing': 1e-09}
-----------------------------------------------------------------
RMSE:  12.048189832072326


### Pick new features 

In [58]:
#Look for correlation coefficients to identify strength of relationship between other musical features and popularity feature
new_x = df[['acousticness', 
       'danceability',
       'energy',
       'year', 
        'loudness',
       'instrumentalness', 
       'liveness', 
       'speechiness', 
       'tempo',
        'valence', 
        'key', 
        'mode', 
        'explicit', 
        'duration_ms','popularity']]

correlation = new_x.corr()
correlation[correlation.columns[-1]]

acousticness       -0.593345
danceability        0.221077
energy              0.497488
year                0.880724
loudness            0.466546
instrumentalness   -0.299829
liveness           -0.075293
speechiness        -0.135707
tempo               0.135047
valence             0.009327
key                 0.010675
mode               -0.032854
explicit            0.214044
duration_ms         0.063292
popularity          1.000000
Name: popularity, dtype: float64

Features to look at from correlation coefficient: year, acousticness, energy, loudness, instrumentalness?

### Gaussian Naive Bayes Model with new selection of features after looking at correlation coefficients

In [59]:
#Gaussian Naive Bayes Model with new selection of features after looking at correlation coefficients
new_data = df[['energy', 'year', 'loudness', 'acousticness']]
new_data['loudness'] = new_data['loudness'] + 60

#create new train and test data
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_data, y, test_size=0.2, random_state=42)

v_smooth = [0.0000000001,0.000000001,0.00000001,0.0000001, 0.000001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 5, 10]
for i in v_smooth:
    nb_model = GaussianNB(var_smoothing= i)
    nb_model.fit(new_X_train, new_y_train)
    new_y_pred = nb_model.predict(new_X_test)
    print("var_smoothing parameter: ", i)
    print("Acuracy: ", nb_model.score(new_X_test, new_y_test))
    print("RMSE:", np.sqrt(metrics.mean_squared_error(new_y_test, new_y_pred)))
    print("F1 Score is:", f1_score(new_y_pred, new_y_test, average= 'micro'))
    print("-" * 65)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


var_smoothing parameter:  1e-10
Acuracy:  0.18453887352127596
RMSE: 12.700355926846667
F1 Score is: 0.184538873521276
-----------------------------------------------------------------
var_smoothing parameter:  1e-09
Acuracy:  0.18453887352127596
RMSE: 12.697220572866666
F1 Score is: 0.184538873521276
-----------------------------------------------------------------
var_smoothing parameter:  1e-08
Acuracy:  0.18453887352127596
RMSE: 12.685169018831138
F1 Score is: 0.184538873521276
-----------------------------------------------------------------
var_smoothing parameter:  1e-07
Acuracy:  0.18453887352127596
RMSE: 12.67928922524797
F1 Score is: 0.184538873521276
-----------------------------------------------------------------
var_smoothing parameter:  1e-06
Acuracy:  0.1846271555529398
RMSE: 12.667627149081033
F1 Score is: 0.1846271555529398
-----------------------------------------------------------------
var_smoothing parameter:  0.0001
Acuracy:  0.18692248837619915
RMSE: 12.340167597

## Multinomial Naive Bayes

#### Use all features

In [60]:
#Create a list of the different alpha values for Multinomial Naive Bayes to find optimal parameter
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1,2,5,10]
for a in alphas:
    new_model = MultinomialNB(alpha = a)
    new_model.fit(X_train, y_train)
    y_pred = new_model.predict(X_test)
    print("Alpha value is: ", a)
    print("Accuracy Score:", new_model.score(X_test, y_test))
    print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print("-" * 65)


Alpha value is:  1e-05
Accuracy Score: 0.02224707197928315
RMSE:  49.3639977993659
-----------------------------------------------------------------
Alpha value is:  0.0001
Accuracy Score: 0.02224707197928315
RMSE:  49.45736415624814
-----------------------------------------------------------------
Alpha value is:  0.001
Accuracy Score: 0.02224707197928315
RMSE:  49.58867181319788
-----------------------------------------------------------------
Alpha value is:  0.01
Accuracy Score: 0.02221764463539521
RMSE:  49.75557258357397
-----------------------------------------------------------------
Alpha value is:  0.1
Accuracy Score: 0.02218821729150727
RMSE:  49.91289334330188
-----------------------------------------------------------------
Alpha value is:  0.5
Accuracy Score: 0.02221764463539521
RMSE:  49.78459279212164
-----------------------------------------------------------------
Alpha value is:  1
Accuracy Score: 0.02221764463539521
RMSE:  49.431118693209825
------------------------

Outcome: Accuracy is not good at all. 

#### Multinomial Naive Bayes Using Significant Features After Looking at Correlation Coefficients 

In [61]:
#Multinomial Naive Bayes Model utilizing new features dataset after eliminating non-significant features
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1,2,5,10]
for a in alphas:
    new_model = MultinomialNB(alpha = a)
    new_model.fit(new_X_train, new_y_train)
    new_y_pred = new_model.predict(new_X_test)
    print("Alpha value is: ", a)
    print("Accuracy Score:", new_model.score(new_X_test, new_y_test))
    print("RMSE: ", np.sqrt(metrics.mean_squared_error(new_y_test,new_y_pred)))
    print("-" * 65)

Alpha value is:  1e-05
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
-----------------------------------------------------------------
Alpha value is:  0.0001
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
-----------------------------------------------------------------
Alpha value is:  0.001
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
-----------------------------------------------------------------
Alpha value is:  0.01
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
-----------------------------------------------------------------
Alpha value is:  0.1
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
-----------------------------------------------------------------
Alpha value is:  0.5
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
-----------------------------------------------------------------
Alpha value is:  1
Accuracy Score: 0.16273321170031194
RMSE:  38.24200613069051
------------------------

Multinomial Naive Bayes Model Accuracy has improved significantly from previously.

## Neural Network

 reference: https://datascience.stackexchange.com/questions/36049/how-to-adjust-the-hyperparameters-of-mlp-classifier-to-get-more-perfect-performa

In [74]:
nn = MLPClassifier(max_iter = 100)

In [36]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh','relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}



#Find the best Neural Network
# best_nn = GridSearchCV(nn, parameter_space, n_jobs =-1, cv = 3)

best_nn.fit(X_train, y_train)
y_pred = best_nn.predict(X_test)

#print best parameter set
print('Best parameters found:\n', best_nn.best_params_)

#try on test set
print('Results on the test set:')
print(classification_report(y_test, y_pred))


In [None]:
#Ran above for a while. Going to try using StandardScaler to standardize the data since MLP is sensitive to feature scaling. 


 Future Reference: https://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2

In [33]:
#Try out different combinations of different features
#Try out different standardizations


In [46]:
# #create sample for experiment
# sample_df = df.sample(frac = 0.10, replace = True, random_state=1)
 
# #create sample x and y
# X = sample_df[['acousticness', 
#        'danceability',
#        'energy',
#        'year', 
#         'loudness',
#        'instrumentalness', 
#        'liveness', 
#        'speechiness', 
#        'tempo',
#         'valence', 
#         'key', 
#         'mode', 
#         'explicit', 
#         'duration_ms']]
# y = sample_df[['popularity']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# #create the Neural network classifier
# nn = MLPClassifier(max_iter = 100)

# #fit the data
# nn.fit(X_train, y_train)

# #predict the data
# y_pred = nn.predict(X_test)

# #try on test set
# print('Results on the test set:')
# print(classification_report(y_test, y_pred))


In [63]:
new_data = df[['energy', 'year', 'loudness', 'acousticness']]
new_data['loudness'] = new_data['loudness'] + 60
X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
#create sample for experiment
sample_df = df.sample(frac = 0.10, replace = True, random_state=1)
 
#create sample x and y
X = sample_df[['acousticness', 
       'energy',
       'year', 
        'loudness'
       ]]
y = sample_df[['popularity']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#create the Neural network classifier
nn = MLPClassifier(max_iter = 100)

#fit the data
nn.fit(X_train, y_train)

#predict the data
y_pred = nn.predict(X_test)

#try on test set
print('Results on the test set:')
print("Accuracy Score:", nn.score(X_test, y_test))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


  y = column_or_1d(y, warn=True)


Results on the test set:
Accuracy Score: 0.07178581935863489
RMSE:  20.543368974589473
