In [1]:
# Import dependencies
# Database Dependencies
from pymongo import MongoClient
import gridfs
import pandas as pd
from io import StringIO
# Machine Learning Dependencies
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics

# Database Connection and Loading Data

In [2]:
# Connect to the MongoDB server; host and port are pymongo's defaults, written out explicitly
client = MongoClient(host="localhost", port=27017)

In [3]:
# Note: `mongofiles` stores each file under exactly the filepath that was
# passed on the command line.
# e.g. $ mongofiles -d=Music_db put Data_Sample/features_3_sec.csv
# creates the identifier "Data_Sample/features_3_sec.csv", regardless of
# where that file sits relative to this notebook. This is annoying, so the
# following code looks up the stored identifier instead of hard-coding it.

In [4]:
# Handle to the database that holds the GridFS-stored files
db = client["Music_db"]

# Collect the identifier ("filename" field) of every document in the `fs.files` collection
files = [file_doc["filename"] for file_doc in db.fs.files.find()]

In [5]:
# Wrap the database in a GridFS interface so stored files can be fetched by name
fs = gridfs.GridFS(database=db)

In [6]:
# Create GridFS file instance with fs.get_last_version
# Select the identifier by name instead of by position: `files[0]` depends on
# the order in which files were uploaded, and fails with a bare IndexError
# when the database holds no files at all.
target_name = "features_3_sec.csv"
matches = [name for name in files if name.endswith(target_name)]
if not matches:
    raise FileNotFoundError(
        f"No GridFS file ending in {target_name!r} found in Music_db; "
        f"available identifiers: {files}"
    )
infile = matches[0]

# Fetch the most recently uploaded version stored under that identifier
features_3_sec_raw = fs.get_last_version(infile)

In [7]:
# Read data as a byte string
# `read()` is called with no size argument, so nothing here limits how much is
# read — presumably the whole file is loaded into memory at once (confirm
# against the gridfs `GridOut.read` documentation).
bytes_string = features_3_sec_raw.read() 

In [8]:
# Turn the raw bytes holding the music feature data into text, interpreting them as UTF-8
data_string = str(bytes_string, encoding="utf-8")

In [9]:
# Wrap the decoded text in a file-like object so pandas can parse it as CSV
data_string_IO = StringIO(data_string)

# Parse the CSV text into a DataFrame and display it
features_df = pd.read_csv(filepath_or_buffer=data_string_IO)
features_df

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.241280,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.001450,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.030830,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.004620,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.768610,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,rock.00099.5.wav,66149,0.349126,0.080515,0.050019,0.000097,1499.083005,164266.886443,1718.707215,85931.574523,...,42.485981,-9.094270,38.326839,-4.246976,31.049839,-5.625813,48.804092,1.818823,38.966969,rock
9986,rock.00099.6.wav,66149,0.372564,0.082626,0.057897,0.000088,1847.965128,281054.935973,1906.468492,99727.037054,...,32.415203,-12.375726,66.418587,-3.081278,54.414265,-11.960546,63.452255,0.428857,18.697033,rock
9987,rock.00099.7.wav,66149,0.347481,0.089019,0.052403,0.000701,1346.157659,662956.246325,1561.859087,138762.841945,...,78.228149,-2.524483,21.778994,4.809936,25.980829,1.775686,48.582378,-0.299545,41.586990,rock
9988,rock.00099.8.wav,66149,0.387527,0.084815,0.066430,0.000320,2084.515327,203891.039161,2018.366254,22860.992562,...,28.323744,-5.363541,17.209942,6.462601,21.442928,2.354765,24.843613,0.675824,12.787750,rock


# Build, Train, and Test Models to Predict `label` from feature data

## `DecisionTreeClassifier`

In [10]:
# Target (y) is the `label` column; Features (X) are every other column
X = features_df.drop(columns=["label"])
y = features_df["label"]

In [11]:
# Drop `filename` (identifier/same as `label`) and `length` (same for all songs - 3 sec sample)
# from features X
# X is rebuilt from `features_df` (dropping the target `label` as well) instead
# of from X itself, so re-running this cell does not raise a KeyError for
# columns that an earlier run already removed.
X = features_df.drop(columns=["label", "filename", "length"])
X

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,3714.560359,1.080790e+06,...,-2.853603,39.687145,-3.241280,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767
1,0.343065,0.086147,0.112699,0.001450,1816.693777,90525.690866,2010.051501,65671.875673,3869.682242,6.722448e+05,...,4.074709,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.030830,5.784063,59.943081
2,0.346815,0.092243,0.132003,0.004620,1788.539719,111407.437613,2084.565132,75124.921716,3997.639160,7.907127e+05,...,4.806280,67.336563,-1.768610,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122
3,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,3568.300218,9.216524e+05,...,-1.359111,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678
4,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,3469.992864,6.102111e+05,...,2.092937,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,0.349126,0.080515,0.050019,0.000097,1499.083005,164266.886443,1718.707215,85931.574523,3015.559458,8.479527e+05,...,5.773784,42.485981,-9.094270,38.326839,-4.246976,31.049839,-5.625813,48.804092,1.818823,38.966969
9986,0.372564,0.082626,0.057897,0.000088,1847.965128,281054.935973,1906.468492,99727.037054,3746.694524,1.170890e+06,...,2.074155,32.415203,-12.375726,66.418587,-3.081278,54.414265,-11.960546,63.452255,0.428857,18.697033
9987,0.347481,0.089019,0.052403,0.000701,1346.157659,662956.246325,1561.859087,138762.841945,2442.362154,2.602871e+06,...,-1.005473,78.228149,-2.524483,21.778994,4.809936,25.980829,1.775686,48.582378,-0.299545,41.586990
9988,0.387527,0.084815,0.066430,0.000320,2084.515327,203891.039161,2018.366254,22860.992562,4313.266226,4.968878e+05,...,4.123402,28.323744,-5.363541,17.209942,6.462601,21.442928,2.354765,24.843613,0.675824,12.787750


In [12]:
# Encode class labels in target y
# The labels are taken from `features_df["label"]` rather than from `y`, so
# re-running this cell does not depend on `y` still being the raw label Series.
labels = features_df["label"]

# Number the genres 1..N in order of first appearance
genre_nums = {genre: num for num, genre in enumerate(labels.unique(), start=1)}

# Replace each genre name with its number, then shape the result as a column vector
y = labels.map(genre_nums)
y = y.values.reshape(-1, 1)

In [13]:
# Split features and target into training and testing sets; the fixed
# random_state keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Report the shape of each split, in the order X_train, X_test, y_train, y_test
# (no reshape is needed here: the previous bare `reshape` calls discarded their results)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(7492, 57)
(2498, 57)
(7492, 1)
(2498, 1)


In [14]:
# Instantiate DecisionTreeClassifier
# A fixed random_state makes the fitted tree, and therefore the predictions
# and metrics computed from it, repeatable across runs
model = DecisionTreeClassifier(random_state=0)

# Fit the model
model = model.fit(X_train, y_train)

# Make predictions using the testing data
predictions = model.predict(X_test)
predictions

array([ 9,  9, 10, ...,  3,  1,  4])

In [15]:
# Build the confusion matrix comparing the true test labels with the model's predictions
cm = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[128,   1,  30,  18,  11,  13,  16,   2,   5,  12],
       [  2, 215,   3,   1,   0,  12,   1,   0,   2,  11],
       [ 27,   8, 136,  19,   6,  19,   4,  10,   9,  25],
       [  9,   3,  10, 121,  18,   5,  10,  20,  16,  17],
       [  9,   1,   6,  23, 170,   0,  13,  20,  15,   5],
       [ 16,  22,  22,   8,   3, 150,   3,   4,   7,  16],
       [ 10,   1,   5,   8,  11,   0, 197,   1,   3,  23],
       [  2,   1,  11,  17,  12,   7,   0, 160,  18,  14],
       [  8,   1,   8,  14,  20,   7,   4,  14, 178,  11],
       [ 18,   5,  27,  19,   9,  10,  16,   8,  16, 116]])

In [16]:
# Fraction of test samples whose predicted label matches the true label
acc_score = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.6289031224979984

## `SVM`