In [1]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC

import lightgbm as lgbm
import math
import pickle

# Loading original data

In [2]:
data_train = pd.read_csv('data_train.csv')
data_train.head()

Unnamed: 0,MFCC1_Mean,MFCC2_Mean,MFCC3_Mean,MFCC4_Mean,MFCC5_Mean,MFCC6_Mean,MFCC7_Mean,MFCC8_Mean,MFCC9_Mean,MFCC10_Mean,...,MFCC12_Var,MFCC13_Var,MFCC14_Var,MFCC15_Var,MFCC16_Var,MFCC17_Var,MFCC18_Var,MFCC19_Var,MFCC20_Var,Label
0,-93.411186,147.278046,-97.258049,83.586334,-14.802794,20.762394,-7.277999,9.453241,-0.376516,7.87562,...,43.238956,26.841871,24.928793,19.325878,27.467104,24.696417,24.23526,21.569134,23.072239,metal
1,-65.97541,114.01535,-91.296211,75.249908,-29.545425,21.402174,-17.291033,16.866028,-15.112152,12.151247,...,53.351036,41.479321,17.116251,29.222637,29.792177,17.065321,17.538721,20.774614,16.81094,metal
2,-61.660713,112.881172,-86.35849,69.170464,-25.941076,24.364878,-22.086647,18.880112,-13.000805,14.005173,...,36.167828,30.252613,23.398046,19.78895,23.08149,17.972075,23.284164,16.86795,15.886907,metal
3,-85.775391,139.275513,-91.946304,86.010201,-18.682989,25.84544,-7.294381,15.457644,-0.646857,9.919909,...,47.799232,23.859961,25.833527,18.115795,24.416447,19.360317,23.314762,18.214552,21.407824,metal
4,-104.304657,149.740524,-104.292702,81.813568,-8.93132,19.425995,-3.741026,5.96945,3.799001,7.449512,...,39.067505,31.955067,32.847073,31.047714,35.233013,16.906719,17.978216,21.101961,15.177757,metal


In [3]:
data_train = data_train.sample(frac=1, random_state = 101).reset_index(drop=True)

In [4]:
classes = {'blues': 0, 'classical': 1, 'country': 2, 'disco': 3, 'hiphop': 4, 'jazz': 5, 'metal': 6, 'pop': 7, 'reggae': 8, 'rock': 9}

In [5]:
data_train.Label = [classes[l] for l in data_train.Label]

In [6]:
x_data = data_train[data_train.columns[:-1]]
y_data = data_train['Label']

In [7]:
x_train  = x_data[data_train.columns[:-1]][:8000]
x_val  = x_data[data_train.columns[:-1]][8000:]

y_train  = y_data[:8000]
y_val  = y_data[8000:]

# GridSearch SVM

In [8]:
params = {
    "cls__C": [5],
    "cls__kernel": ['rbf'],
}

pipe_svm = Pipeline([
    ('scale', StandardScaler()),
    ('var_tresh', VarianceThreshold(threshold=0.1)),
    ('cls', SVC())
])

grid_svm = GridSearchCV(pipe_svm, params, scoring='accuracy', n_jobs=-1, cv=9,verbose=2)
grid_svm.fit(x_train, y_train)

Fitting 9 folds for each of 1 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    7.5s remaining:    9.4s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    9.4s finished


GridSearchCV(cv=9,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('var_tresh',
                                        VarianceThreshold(threshold=0.1)),
                                       ('cls', SVC())]),
             n_jobs=-1, param_grid={'cls__C': [5], 'cls__kernel': ['rbf']},
             scoring='accuracy', verbose=2)

In [9]:
preds_val = grid_svm.predict(x_val)
print("Best score on train set (accuracy) = {:.4f}".format(grid_svm.best_score_))
print("Best score on validation set (accuracy) = {:.4f}".format(accuracy_score(y_val, preds_val)))

Best score on train set (accuracy) = 0.8437
Best score on validation set (accuracy) = 0.8779


# Test on YouTube data

In [10]:
data_test = pd.read_csv('data_test.csv')
data_test.head()

Unnamed: 0,MFCC1_Mean,MFCC2_Mean,MFCC3_Mean,MFCC4_Mean,MFCC5_Mean,MFCC6_Mean,MFCC7_Mean,MFCC8_Mean,MFCC9_Mean,MFCC10_Mean,...,MFCC12_Var,MFCC13_Var,MFCC14_Var,MFCC15_Var,MFCC16_Var,MFCC17_Var,MFCC18_Var,MFCC19_Var,MFCC20_Var,Label
0,-249.010376,149.310394,-38.809299,42.263432,-14.468389,10.295317,-13.667592,4.414966,-4.66299,0.519707,...,122.881081,72.274254,66.570541,259.968506,53.388855,69.137527,52.009602,167.365768,55.598461,hiphop
1,-279.474091,167.203583,-20.340523,25.866331,-15.994143,0.329006,-22.783136,-1.330112,-11.841547,2.925979,...,186.408676,110.577431,178.660934,108.807304,61.083157,99.472855,71.585411,124.691971,158.015259,hiphop
2,-296.589569,145.014938,-8.389028,32.571354,-3.660239,20.281359,-22.27586,-11.310889,-22.031324,-7.625155,...,294.384308,259.512238,209.804428,81.502647,135.450653,67.957977,68.078758,237.000793,172.157974,hiphop
3,-240.589905,161.670792,-22.104473,38.9333,-15.202853,16.041809,-19.434389,0.025945,-13.720518,-4.024367,...,76.631096,91.791725,118.611588,73.517639,123.737152,55.229832,44.574848,99.319275,107.875664,hiphop
4,-141.305054,122.189499,-54.743061,65.195763,-40.119255,14.402914,-40.423481,3.981862,-8.399135,0.515828,...,143.140717,90.704193,92.015656,54.689579,74.343674,59.26936,64.264503,74.729507,127.474457,hiphop


In [11]:
data_test.Label = [classes[l] for l in data_test.Label]

In [12]:
x_test = data_test[data_test.columns[:-1]]
y_test = data_test['Label']

In [13]:
preds_test = grid_svm.predict(x_test)
print("Best score on test set (accuracy) = {:.4f}".format(accuracy_score(y_test, preds_test)))

Best score on test set (accuracy) = 0.2014


# Saving model

In [14]:
filename = 'SVM.sav'
pickle.dump(grid_svm, open(filename, 'wb'))

In [15]:
print('done')

done
