# First - Import full library

In [110]:
import pandas as pd
import numpy as np
from sklearn import  preprocessing, cross_validation, neighbors, metrics, svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
songs_dataset = pd.read_json('MasterSongList.json')

In [7]:
songs_dataset.head(1)

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089


In [8]:
songs_dataset.loc[:,'genres'] = songs_dataset['genres'].apply(''.join)
def consolidateGenre(genre):
    if(len(genre) > 0):
        return genre.split(':')[0]
    else:
        return genre
songs_dataset.loc[:,'genres'] = songs_dataset['genres'].apply(consolidateGenre)

In [10]:
audio_feature_list = []
for audio_feature in songs_dataset['audio_features']:
    audio_feature_list.append(audio_feature)

In [13]:
audio_feature_header = ['key','energy','liveliness','tempo','speechiness','acousticness','instrumentalness','time_signature'
                         ,'duration','loudness','valence','danceability','mode','time_signature_confidence','tempo_confidence'
                         ,'key_confidence','mode_confidence']

df_feature =  pd.DataFrame(audio_feature_list, columns = audio_feature_header)
df_feature.loc[:,].dropna(axis=0,how='all',inplace=True)

df_feature['genres'] = songs_dataset['genres']
df_feature.head(10)

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton
5,8.0,0.733856,0.093043,174.952,0.058137,0.293325,0.0,0.0,4.0,188.94281,-4.312,0.953424,0.785102,0.177,0.404,0.826,1.0,
6,8.0,0.777375,0.054104,104.946,0.029302,0.13035,0.0,1.0,4.0,228.29333,-5.112,0.525632,0.729051,0.817,0.672,0.394,0.963,r&b
7,7.0,0.585564,0.108297,120.014,0.038924,0.011707,5e-06,1.0,4.0,193.57333,-6.583,0.622176,0.781822,0.97,0.861,0.792,1.0,pop
8,0.0,0.418212,0.105322,129.054,0.045461,0.548385,0.00012,0.0,4.0,248.28,-7.284,0.44079,0.841824,0.011,0.303,0.843,1.0,
9,4.0,0.81403,0.079196,124.991,0.07244,0.005355,0.0,1.0,4.0,198.25333,-1.985,0.838512,0.70086,0.429,0.375,0.74,1.0,pop


# Consolidatin addition feature

In [71]:
# Dance
dance_df = df_feature[(songs_dataset['genres'] == 'dance') & (songs_dataset['yt_views'] > 1000)]
dance_new = pd.DataFrame(dance_df).dropna()
print("Dance")
print(dance_new.shape)

# Jazz
jazz_df = df_feature[(songs_dataset['genres'] == 'jazz') & (songs_dataset['yt_views'] > 1000)]
jazz_new = pd.DataFrame(jazz_df).dropna()
print("Jazz")
print(jazz_new.shape)

# Rock
rock_df = df_feature[(songs_dataset['genres'] == 'rock') & (songs_dataset['yt_views'] > 1000)]
rock_new = pd.DataFrame(rock_df).dropna()
print("Rock")
print(rock_new.shape)

# Rap
rap_df = df_feature[(songs_dataset['genres'] == 'rap') & (songs_dataset['yt_views'] > 1000)]
rap_new = pd.DataFrame(rap_df).dropna()
print("Rap")
print(rap_new.shape)

Dance
(1938, 18)
Jazz
(1677, 18)
Rock
(6245, 18)
Rap
(2411, 18)


  
  
  


In [72]:
data_add_feature = pd.concat((dance_new.sample(n=1500, replace=True), jazz_new.sample(n=1500, replace=True), rock_new.sample(n=1500, replace=True), rap_new.sample(n=1500, replace=True)), axis=0)
print(data_add_feature.shape)
data_add_feature.head(5)

(6000, 18)


Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
14103,9.0,0.92194,0.220017,170.44,0.138252,0.075693,0.000195,1.0,4.0,354.64,-5.816,0.842493,0.690519,0.42,0.421,0.47,1.0,dance
1953,0.0,0.807279,0.093509,130.002,0.10788,0.22162,0.0,0.0,4.0,208.62018,-3.621,0.613644,0.71587,0.573,0.65,0.926,1.0,dance
11245,7.0,0.870477,0.117017,128.03,0.038113,0.015035,0.800117,1.0,4.0,300.5859,-3.845,0.389359,0.724536,1.0,0.674,0.884,1.0,dance
6588,4.0,0.753022,0.196292,129.061,0.117276,0.409805,0.366476,0.0,4.0,89.2,-6.139,0.778334,0.69869,0.496,0.461,0.83,1.0,dance
1298,1.0,0.946412,0.355363,127.992,0.068679,0.00075,0.077679,1.0,4.0,292.90667,-2.876,0.332467,0.61569,0.41,0.436,0.779,0.948,dance


In [73]:
df_labels = data_add_feature['genres']
df_features = data_add_feature.drop('genres', axis=1)
df_labels.unique()

array(['dance', 'jazz', 'rock', 'rap'], dtype=object)

In [79]:
scander_scaler = StandardScaler()
df_features_results = scander_scaler.fit_transform(df_features)

In [81]:
from sklearn.cross_validation import train_test_split
X = df_features_results
y = df_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Suport Vector Classifier

In [101]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from nltk import ConfusionMatrix


model_loreg = LogisticRegression(solver='saga',multi_class='multinomial')
model_loreg.fit(X_train, y_train)
predict = model_loreg.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#LogisticRegression")
print(classifi_report)


model_svc = SVC(C=1, gamma=1)
model_svc.fit(X_train, y_train)
predict = model_svc.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#SVC")
print(classifi_report)


model_rf = RandomForestClassifier(n_estimators=5,min_samples_split=2, max_features='log2')
model_rf.fit(X_train, y_train)
predict = model_rf.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#Random Forest")
print(classifi_report)
# confus_mat = ConfusionMatrix(list(y_test), list(predict))

#LogisticRegression
             precision    recall  f1-score   support

      dance       0.71      0.75      0.73       455
       jazz       0.81      0.85      0.83       437
        rap       0.78      0.74      0.76       455
       rock       0.75      0.70      0.73       453

avg / total       0.76      0.76      0.76      1800

#SVC
             precision    recall  f1-score   support

      dance       0.91      0.64      0.75       455
       jazz       0.97      0.56      0.71       437
        rap       0.95      0.58      0.72       455
       rock       0.45      0.95      0.61       453

avg / total       0.82      0.68      0.70      1800

#Random Forest
             precision    recall  f1-score   support

      dance       0.78      0.86      0.82       455
       jazz       0.85      0.89      0.87       437
        rap       0.85      0.84      0.85       455
       rock       0.83      0.72      0.77       453

avg / total       0.83      0.83      0.83      180

# SVM - GridSearchCV

In [106]:
from  sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10], 'gamma': [1,0.1,0.01,0.001]}

In [107]:
grid = GridSearchCV(model_svc, param_grid, verbose=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ........ C=0.1, gamma=1, score=0.25320970042796004, total=   0.6s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ........ C=0.1, gamma=1, score=0.25303788420300216, total=   0.6s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV] ........ C=0.1, gamma=1, score=0.25303788420300216, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7489300998573466, total=   0.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7591136526090064, total=   0.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7576840600428878, total=   0.4s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7517831669044223, total=   0.4s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7626876340243031, total=   0.3s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7684060042887777, total=   0.4s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   22.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [105]:
grid.best_params_

{'C': 1, 'gamma': 0.1}

In [108]:
svc_best = SVC(C=1, gamma=0.1)
svc_best.fit(X_train, y_train)
svc_predictions = svc_best.predict(X_test)
print(confusion_matrix(y_test, svc_predictions))
print(classification_report(list(y_test), list(svc_predictions)))

NameError: name 'confusion_matrix' is not defined