In [38]:
# Import dependencies
# Database Dependencies
from pymongo import MongoClient
import gridfs
import pandas as pd
from io import StringIO
# Machine Learning Dependencies
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics

# Database Connection and Loading Data

In [2]:
# Create a client instance
client = MongoClient()

In [3]:
# Note: `mongofiles` identifies files stored with whatever filepath
# was passed from the command line
# i.e $ mongofiles -d=Music_db put Data_Sample/features_3_sec.csv
# creates the identifier "Data_Sample/features_3_sec.csv" regardless
# of the relative path from this file. This is annoying, but the following
# code retrieves the proper identifier

In [4]:
# Create our database instance
db = client["Music_db"]

# Get our file identifiers
files = []
for doc in db.fs.files.find():
    files.append(doc["filename"])

In [5]:
# Create a gridfs instance
fs = gridfs.GridFS(db)

In [6]:
# Create GridFS file instance with fs.get_last_version
infile = files[0]
features_3_sec_raw = fs.get_last_version(infile)

In [7]:
# Read data as a byte string
bytes_string = features_3_sec_raw.read() 

In [8]:
# Decode our binary string containing music feature data
data_string = bytes_string.decode("utf-8")

In [9]:
# Add feature string to Pandas dataframe
data_string_IO = StringIO(data_string)

features_df = pd.read_csv(data_string_IO)
features_df

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.241280,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.001450,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.030830,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.004620,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.768610,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,rock.00099.5.wav,66149,0.349126,0.080515,0.050019,0.000097,1499.083005,164266.886443,1718.707215,85931.574523,...,42.485981,-9.094270,38.326839,-4.246976,31.049839,-5.625813,48.804092,1.818823,38.966969,rock
9986,rock.00099.6.wav,66149,0.372564,0.082626,0.057897,0.000088,1847.965128,281054.935973,1906.468492,99727.037054,...,32.415203,-12.375726,66.418587,-3.081278,54.414265,-11.960546,63.452255,0.428857,18.697033,rock
9987,rock.00099.7.wav,66149,0.347481,0.089019,0.052403,0.000701,1346.157659,662956.246325,1561.859087,138762.841945,...,78.228149,-2.524483,21.778994,4.809936,25.980829,1.775686,48.582378,-0.299545,41.586990,rock
9988,rock.00099.8.wav,66149,0.387527,0.084815,0.066430,0.000320,2084.515327,203891.039161,2018.366254,22860.992562,...,28.323744,-5.363541,17.209942,6.462601,21.442928,2.354765,24.843613,0.675824,12.787750,rock


# Build, Train, and Test Models to Predict `label` from feature data

## `DecisionTreeClassifier`

In [10]:
# Separate Features (X) from the Target (y)
y = features_df["label"]
X = features_df.drop(columns="label", axis=1)

In [11]:
# Drop `filename` (identifier/same as `label`) and `length` (same for all songs - 3 sec sample)
# from features X
X = X.drop(columns=["filename", "length"], axis=1)
X

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,3714.560359,1.080790e+06,...,-2.853603,39.687145,-3.241280,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767
1,0.343065,0.086147,0.112699,0.001450,1816.693777,90525.690866,2010.051501,65671.875673,3869.682242,6.722448e+05,...,4.074709,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.030830,5.784063,59.943081
2,0.346815,0.092243,0.132003,0.004620,1788.539719,111407.437613,2084.565132,75124.921716,3997.639160,7.907127e+05,...,4.806280,67.336563,-1.768610,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122
3,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,3568.300218,9.216524e+05,...,-1.359111,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678
4,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,3469.992864,6.102111e+05,...,2.092937,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,0.349126,0.080515,0.050019,0.000097,1499.083005,164266.886443,1718.707215,85931.574523,3015.559458,8.479527e+05,...,5.773784,42.485981,-9.094270,38.326839,-4.246976,31.049839,-5.625813,48.804092,1.818823,38.966969
9986,0.372564,0.082626,0.057897,0.000088,1847.965128,281054.935973,1906.468492,99727.037054,3746.694524,1.170890e+06,...,2.074155,32.415203,-12.375726,66.418587,-3.081278,54.414265,-11.960546,63.452255,0.428857,18.697033
9987,0.347481,0.089019,0.052403,0.000701,1346.157659,662956.246325,1561.859087,138762.841945,2442.362154,2.602871e+06,...,-1.005473,78.228149,-2.524483,21.778994,4.809936,25.980829,1.775686,48.582378,-0.299545,41.586990
9988,0.387527,0.084815,0.066430,0.000320,2084.515327,203891.039161,2018.366254,22860.992562,4313.266226,4.968878e+05,...,4.123402,28.323744,-5.363541,17.209942,6.462601,21.442928,2.354765,24.843613,0.675824,12.787750


In [12]:
# Encode class labels in target y
genre_nums = {}
cnt = 1
for genre in y.unique():
    genre_nums[genre] = cnt
    cnt += 1

y = y.apply(lambda x: genre_nums[x])
genre_nums

{'blues': 1,
 'classical': 2,
 'country': 3,
 'disco': 4,
 'hiphop': 5,
 'jazz': 6,
 'metal': 7,
 'pop': 8,
 'reggae': 9,
 'rock': 10}

In [13]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [14]:
# Instantitae DecisionTreeClassifier
tree_model = DecisionTreeClassifier()

# Fit the model
tree_model = tree_model.fit(X_train, y_train)

# Make predictions using the testing data
tree_pred = tree_model.predict(X_test)
results = pd.DataFrame({
    "Prediction": tree_pred,
    "Actual": y_test
}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,3,9
1,9,9
2,10,10
3,7,1
4,7,10
...,...,...
2493,2,2
2494,10,4
2495,3,3
2496,1,1


In [15]:
# Evaluate model
cm = sklearn.metrics.confusion_matrix(y_test, tree_pred)
cm

array([[133,   2,  20,  19,  10,  14,  15,   1,   6,  16],
       [  3, 215,   4,   1,   0,  14,   1,   1,   0,   8],
       [ 28,   8, 140,  13,   6,  28,   9,   8,   8,  15],
       [  8,   2,  10, 128,  15,   4,   9,  17,  14,  22],
       [ 11,   1,   7,  17, 172,   3,   9,  22,  14,   6],
       [ 15,  21,  21,   7,   7, 149,   3,   5,   7,  16],
       [  8,   1,   4,   7,  18,   2, 198,   1,   3,  17],
       [  1,   1,   9,  13,  12,   6,   0, 167,  22,  11],
       [  9,   1,   8,  15,  21,   8,   4,  13, 175,  11],
       [ 21,   7,  22,  14,   9,  14,  16,   8,  18, 115]])

In [16]:
# Calculate Accuracy Score
acc_score = sklearn.metrics.accuracy_score(y_test, tree_pred)
acc_score

0.6373098478783027

In [17]:
# Print classification report
print(sklearn.metrics.classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           1       0.56      0.56      0.56       236
           2       0.83      0.87      0.85       247
           3       0.57      0.53      0.55       263
           4       0.55      0.56      0.55       229
           5       0.64      0.66      0.65       262
           6       0.62      0.59      0.60       251
           7       0.75      0.76      0.76       259
           8       0.69      0.69      0.69       242
           9       0.66      0.66      0.66       265
          10       0.49      0.47      0.48       244

    accuracy                           0.64      2498
   macro avg       0.63      0.64      0.63      2498
weighted avg       0.64      0.64      0.64      2498



## `KNN`

In [26]:
# Instantiate SVM
knn_model = KNeighborsClassifier(n_neighbors=21)

# Fit the model
knn_model = knn_model.fit(X_train, y_train)

# Make predicitions using the testing data
knn_pred = knn_model.predict(X_test)
results = pd.DataFrame({
    "Prediction": knn_pred,
    "Actual": y_test
}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,10,9
1,2,9
2,4,10
3,7,1
4,7,10
...,...,...
2493,2,2
2494,3,4
2495,4,3
2496,10,1


In [27]:
# Evaluate model
cm = sklearn.metrics.confusion_matrix(y_test, knn_pred)
cm

array([[ 33,  14,  25,  25,  19,  43,  41,   9,  11,  16],
       [ 11, 167,   7,   1,   0,  16,  38,   0,   4,   3],
       [ 21,  17,  50,  44,  12,  25,  28,  16,  28,  22],
       [ 26,   2,  19,  53,  33,   4,  19,  19,  27,  27],
       [ 10,   2,  16,  43,  40,   1,  23,  59,  53,  15],
       [ 38,  24,  31,  21,   6,  53,  34,  14,  10,  20],
       [ 26,  31,  21,  22,   9,  11, 111,   0,   4,  24],
       [  7,   3,  10,  30,  38,  13,   5,  91,  39,   6],
       [ 11,   3,  27,  32,  22,   7,  17,  45,  90,  11],
       [ 20,  12,  37,  47,  22,  22,  32,  17,  19,  16]])

In [28]:
# Calculate Accuracy Score
acc_score = sklearn.metrics.accuracy_score(y_test, knn_pred)
acc_score

0.28182546036829464

In [29]:
# Print classification report
print(sklearn.metrics.classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           1       0.16      0.14      0.15       236
           2       0.61      0.68      0.64       247
           3       0.21      0.19      0.20       263
           4       0.17      0.23      0.19       229
           5       0.20      0.15      0.17       262
           6       0.27      0.21      0.24       251
           7       0.32      0.43      0.37       259
           8       0.34      0.38      0.36       242
           9       0.32      0.34      0.33       265
          10       0.10      0.07      0.08       244

    accuracy                           0.28      2498
   macro avg       0.27      0.28      0.27      2498
weighted avg       0.27      0.28      0.27      2498



## Naive-Bayes Classifier

In [32]:
# Instantiate the model
gnb_model = GaussianNB()

# Fit the model
gnb_model = gnb_model.fit(X_train, y_train)

# Make predicitions using the testing data
gnb_pred = gnb_model.predict(X_test)
results = pd.DataFrame({
    "Prediction": gnb_pred,
    "Actual": y_test
}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,4,9
1,8,9
2,5,10
3,7,1
4,7,10
...,...,...
2493,6,2
2494,4,4
2495,10,3
2496,7,1


In [35]:
# Evaluate model
cm = sklearn.metrics.confusion_matrix(y_test, gnb_pred)
cm

array([[ 56,  15,  33,   4,   1,  27,  78,   0,  16,   6],
       [  1, 220,   2,   0,   0,   8,  10,   1,   3,   2],
       [ 16,  15,  84,  37,   4,   9,  58,   1,  22,  17],
       [  8,   2,   7,  92,  10,   2,  76,   5,  16,  11],
       [  7,   0,  25,  47,  63,   1,  50,  22,  42,   5],
       [ 22,  52,  12,  26,   0,  56,  48,   6,   6,  23],
       [  1,   1,   1,  10,   7,   0, 232,   1,   2,   4],
       [  1,   2,   6,  57,  10,   3,  19, 116,  23,   5],
       [ 25,   1,  31,  30,  21,   1,   7,  19, 122,   8],
       [  3,  12,  27,  36,  11,   2, 108,   4,  17,  24]])

In [36]:
# Calculate Accuracy Score
acc_score = sklearn.metrics.accuracy_score(y_test, gnb_pred)
acc_score

0.42634107285828665

In [37]:
# Print classification report
print(sklearn.metrics.classification_report(y_test, gnb_pred))

              precision    recall  f1-score   support

           1       0.40      0.24      0.30       236
           2       0.69      0.89      0.78       247
           3       0.37      0.32      0.34       263
           4       0.27      0.40      0.32       229
           5       0.50      0.24      0.32       262
           6       0.51      0.22      0.31       251
           7       0.34      0.90      0.49       259
           8       0.66      0.48      0.56       242
           9       0.45      0.46      0.46       265
          10       0.23      0.10      0.14       244

    accuracy                           0.43      2498
   macro avg       0.44      0.42      0.40      2498
weighted avg       0.44      0.43      0.40      2498



## `RandomForestClassifier`

In [41]:
# Instantiate the model
rf_model = RandomForestClassifier(n_estimators=500)

# Fit the model
rf_model = rf_model.fit(X_train, y_train)

# Make predicitions using the testing data
rf_pred = rf_model.predict(X_test)
results = pd.DataFrame({
    "Prediction": rf_pred,
    "Actual": y_test
}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,9,9
1,9,9
2,10,10
3,1,1
4,7,10
...,...,...
2493,2,2
2494,4,4
2495,3,3
2496,1,1


In [43]:
# Evaluate model
cm = sklearn.metrics.confusion_matrix(y_test, rf_pred)
cm

array([[216,   0,   6,   4,   1,   5,   4,   0,   0,   0],
       [  0, 238,   2,   0,   0,   5,   0,   0,   0,   2],
       [ 10,   1, 225,   3,   0,   9,   2,   1,   9,   3],
       [  3,   4,   4, 193,   8,   0,   3,   2,   5,   7],
       [  1,   1,   2,   8, 223,   0,   5,  14,   4,   4],
       [  4,  13,   8,   2,   1, 223,   0,   0,   0,   0],
       [  1,   0,   0,   2,   5,   1, 240,   0,   1,   9],
       [  0,   2,  10,   7,   6,   0,   0, 208,   7,   2],
       [  2,   2,   8,   4,   5,   2,   0,   7, 234,   1],
       [  8,   5,  10,  13,   1,   8,  13,   1,   7, 178]])

In [44]:
# Calculate Accuracy Score
acc_score = sklearn.metrics.accuracy_score(y_test, rf_pred)
acc_score

0.8718975180144115

In [45]:
# Print classification report
print(sklearn.metrics.classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           1       0.88      0.92      0.90       236
           2       0.89      0.96      0.93       247
           3       0.82      0.86      0.84       263
           4       0.82      0.84      0.83       229
           5       0.89      0.85      0.87       262
           6       0.88      0.89      0.88       251
           7       0.90      0.93      0.91       259
           8       0.89      0.86      0.88       242
           9       0.88      0.88      0.88       265
          10       0.86      0.73      0.79       244

    accuracy                           0.87      2498
   macro avg       0.87      0.87      0.87      2498
weighted avg       0.87      0.87      0.87      2498

