# First - Import full library

In [155]:
import pandas as pd
import numpy as np
from sklearn import  preprocessing, cross_validation, neighbors, metrics, svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

In [156]:
songs_dataset = pd.read_json('MasterSongList.json')

In [157]:
songs_dataset.head(1)

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089


In [158]:
songs_dataset.loc[:,'genres'] = songs_dataset['genres'].apply(''.join)
def consolidateGenre(genre):
    if(len(genre) > 0):
        return genre.split(':')[0]
    else:
        return genre
songs_dataset.loc[:,'genres'] = songs_dataset['genres'].apply(consolidateGenre)

In [159]:
audio_feature_list = []
for audio_feature in songs_dataset['audio_features']:
    audio_feature_list.append(audio_feature)

In [160]:
audio_feature_header = ['key','energy','liveliness','tempo','speechiness','acousticness','instrumentalness','time_signature'
                         ,'duration','loudness','valence','danceability','mode','time_signature_confidence','tempo_confidence'
                         ,'key_confidence','mode_confidence']

df_feature =  pd.DataFrame(audio_feature_list, columns = audio_feature_header)
df_feature.loc[:,].dropna(axis=0,how='all',inplace=True)

df_feature['genres'] = songs_dataset['genres']
df_feature.head(10)

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton
5,8.0,0.733856,0.093043,174.952,0.058137,0.293325,0.0,0.0,4.0,188.94281,-4.312,0.953424,0.785102,0.177,0.404,0.826,1.0,
6,8.0,0.777375,0.054104,104.946,0.029302,0.13035,0.0,1.0,4.0,228.29333,-5.112,0.525632,0.729051,0.817,0.672,0.394,0.963,r&b
7,7.0,0.585564,0.108297,120.014,0.038924,0.011707,5e-06,1.0,4.0,193.57333,-6.583,0.622176,0.781822,0.97,0.861,0.792,1.0,pop
8,0.0,0.418212,0.105322,129.054,0.045461,0.548385,0.00012,0.0,4.0,248.28,-7.284,0.44079,0.841824,0.011,0.303,0.843,1.0,
9,4.0,0.81403,0.079196,124.991,0.07244,0.005355,0.0,1.0,4.0,198.25333,-1.985,0.838512,0.70086,0.429,0.375,0.74,1.0,pop


# Consolidatin addition feature

In [161]:
# Dance
dance_df = df_feature[(songs_dataset['genres'] == 'dance') & (songs_dataset['yt_views'] > 1000)]
dance_new = pd.DataFrame(dance_df).dropna()
print("Dance")
print(dance_new.shape)

# Jazz
jazz_df = df_feature[(songs_dataset['genres'] == 'jazz') & (songs_dataset['yt_views'] > 1000)]
jazz_new = pd.DataFrame(jazz_df).dropna()
print("Jazz")
print(jazz_new.shape)

# Rock
rock_df = df_feature[(songs_dataset['genres'] == 'rock') & (songs_dataset['yt_views'] > 1000)]
rock_new = pd.DataFrame(rock_df).dropna()
print("Rock")
print(rock_new.shape)

# Rap
rap_df = df_feature[(songs_dataset['genres'] == 'rap') & (songs_dataset['yt_views'] > 1000)]
rap_new = pd.DataFrame(rap_df).dropna()
print("Rap")
print(rap_new.shape)

Dance
(1938, 18)
Jazz
(1677, 18)
Rock
(6245, 18)
Rap
(2411, 18)


  
  
  


In [162]:
data_add_feature = pd.concat((dance_new.sample(n=1500, replace=True), jazz_new.sample(n=1500, replace=True), rock_new.sample(n=1500, replace=True), rap_new.sample(n=1500, replace=True)), axis=0)
print(data_add_feature.shape)
data_add_feature.head(5)

(6000, 18)


Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
26872,7.0,0.413997,0.088377,119.965,0.04634,0.905124,0.929734,1.0,4.0,210.10667,-14.422,0.197229,0.694723,0.089,0.304,0.605,0.023,dance
4582,1.0,0.689778,0.283752,129.482,0.041365,0.011664,0.85427,1.0,4.0,349.52,-7.279,0.971121,0.659309,0.604,0.577,0.858,0.744,dance
5038,4.0,0.738654,0.044702,116.792,0.063966,0.044469,0.01771,0.0,4.0,280.46667,-7.814,0.854048,0.781668,0.371,0.324,0.914,1.0,dance
28575,1.0,0.441488,0.08811,127.994,0.04983,0.000493,0.024812,1.0,4.0,382.5,-2.997,0.405731,0.654535,0.517,0.3,0.79,1.0,dance
29054,10.0,0.772109,0.121702,123.166,0.046866,0.003274,0.890622,0.0,4.0,296.41333,-11.219,0.510498,0.792937,0.64,0.692,0.795,1.0,dance


In [163]:
df_labels = data_add_feature['genres']
df_features = data_add_feature.drop('genres', axis=1)
df_labels.unique()

array(['dance', 'jazz', 'rock', 'rap'], dtype=object)

In [164]:
scander_scaler = StandardScaler()
df_features_results = scander_scaler.fit_transform(df_features)

In [165]:
from sklearn.cross_validation import train_test_split
X = df_features_results
y = df_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# 2) Build the following classifiers

In [166]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from nltk import ConfusionMatrix


model_loreg = LogisticRegression(solver='saga',multi_class='multinomial')
model_loreg.fit(X_train, y_train)
predict = model_loreg.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#LogisticRegression")
print(classifi_report)


model_svc = SVC(C=1, gamma=1)
model_svc.fit(X_train, y_train)
predict = model_svc.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#SVC")
print(classifi_report)


model_rf = RandomForestClassifier(n_estimators=5,min_samples_split=2, max_features='log2')
model_rf.fit(X_train, y_train)
predict = model_rf.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#Random Forest")
print(classifi_report)
# confus_mat = ConfusionMatrix(list(y_test), list(predict))

#LogisticRegression
             precision    recall  f1-score   support

      dance       0.72      0.71      0.72       455
       jazz       0.82      0.80      0.81       437
        rap       0.79      0.77      0.78       455
       rock       0.68      0.73      0.70       453

avg / total       0.75      0.75      0.75      1800

#SVC
             precision    recall  f1-score   support

      dance       0.91      0.62      0.74       455
       jazz       0.96      0.51      0.67       437
        rap       0.94      0.60      0.74       455
       rock       0.44      0.94      0.60       453

avg / total       0.81      0.67      0.69      1800

#Random Forest
             precision    recall  f1-score   support

      dance       0.76      0.81      0.78       455
       jazz       0.83      0.89      0.86       437
        rap       0.82      0.82      0.82       455
       rock       0.77      0.68      0.72       453

avg / total       0.80      0.80      0.80      180

# 3) Successfully find the best values for the following classifier parameters using GridSearchCV

# SVC

In [167]:
from  sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10], 'gamma': [1,0.1,0.01,0.001]}

In [168]:
grid = GridSearchCV(model_svc, param_grid, verbose=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ........ C=0.1, gamma=1, score=0.25320970042796004, total=   0.7s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ........ C=0.1, gamma=1, score=0.25303788420300216, total=   1.4s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.0s remaining:    0.0s


[CV] ........ C=0.1, gamma=1, score=0.25303788420300216, total=   0.7s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7475035663338089, total=   0.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7412437455325233, total=   0.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7569692637598284, total=   0.6s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7432239657631954, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ....... C=0.1, gamma=0.01, score=0.729807005003574, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7648320228734811, total=   1.1s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   27.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [169]:
grid.best_params_

{'C': 1, 'gamma': 0.1}

In [170]:
svc_best = SVC(C=1, gamma=0.1)
svc_best.fit(X_train, y_train)
svc_predictions = svc_best.predict(X_test)
print(confusion_matrix(y_test, svc_predictions))
print(classification_report(list(y_test), list(svc_predictions)))

[[344   7  52  52]
 [ 11 383   6  37]
 [ 47  21 368  19]
 [ 44  54  20 335]]
             precision    recall  f1-score   support

      dance       0.77      0.76      0.76       455
       jazz       0.82      0.88      0.85       437
        rap       0.83      0.81      0.82       455
       rock       0.76      0.74      0.75       453

avg / total       0.79      0.79      0.79      1800



# Logistic Regression

In [178]:
param_grid_loreg = {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class':['ovr', 'multinomial']}
grid_loreg = GridSearchCV(model_loreg, param_grid_loreg, verbose=3)
grid_loreg.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] multi_class=ovr, solver=newton-cg ...............................
[CV]  multi_class=ovr, solver=newton-cg, score=0.7439372325249644, total=   0.3s
[CV] multi_class=ovr, solver=newton-cg ...............................
[CV]  multi_class=ovr, solver=newton-cg, score=0.7526804860614725, total=   0.2s
[CV] multi_class=ovr, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  multi_class=ovr, solver=newton-cg, score=0.773409578270193, total=   0.1s
[CV] multi_class=ovr, solver=lbfgs ...................................
[CV]  multi_class=ovr, solver=lbfgs, score=0.7439372325249644, total=   0.1s
[CV] multi_class=ovr, solver=lbfgs ...................................
[CV]  multi_class=ovr, solver=lbfgs, score=0.7526804860614725, total=   0.1s
[CV] multi_class=ovr, solver=lbfgs ...................................
[CV]  multi_class=ovr, solver=lbfgs, score=0.773409578270193, total=   0.1s
[CV] multi_class=ovr, solver=sag .....................................
[CV]  multi_class=ovr, solver=sag, score=0.7439372325249644, total=   0.2s
[CV] multi_class=ovr, solver=sag .....................................
[CV]  multi_class=ovr, solver=sag, score=0.7526804860614725, total=   0.2s
[CV] multi_class=ovr, solver=sag .....................................
[CV]  multi_class=ovr, solver=sag, score=0.773409578270193, total=   0.2s
[CV] multi_class=ovr, solver=saga ......

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    4.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class': ['ovr', 'multinomial']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [179]:
grid_loreg.best_params_

{'multi_class': 'ovr', 'solver': 'newton-cg'}

In [185]:
model_loreg_best = LogisticRegression(multi_class='ovr', solver='newton-cg')
model_loreg_best.fit(X_train, y_train)
predict = model_loreg_best.predict(X_test)
classifi_report = classification_report(y_test, predict)
print("#LogisticRegression")
print(classifi_report)
# better a litter

#LogisticRegression
             precision    recall  f1-score   support

      dance       0.72      0.70      0.71       455
       jazz       0.81      0.84      0.82       437
        rap       0.78      0.78      0.78       455
       rock       0.71      0.72      0.71       453

avg / total       0.76      0.76      0.76      1800



# Random Forest

In [186]:
param_grid_rf = {'n_estimators':[5, 10, 100], 'min_samples_split':[2, 3, 4, 5, 10], 'max_features': ['sqrt', 'log2', 'auto']}
grid_rf = GridSearchCV(model_rf, param_grid_rf, verbose=3)
grid_rf.fit(X_train, y_train)

Fitting 3 folds for each of 45 candidates, totalling 135 fits
[CV] max_features=sqrt, min_samples_split=2, n_estimators=5 ..........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=5, score=0.7731811697574893, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=5 ..........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=5, score=0.7634024303073624, total=   0.0s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=5 ..........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=5, score=0.7848463187991422, total=   0.0s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=10 .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_split=2, n_estimators=10, score=0.8017118402282454, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=10, score=0.8112937812723374, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=10, score=0.7941386704789135, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=100, score=0.8266761768901569, total=   1.2s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=100, score=0.8270192994996426, total=   0.8s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=100, score=0.823445318084346, total=   0.9s
[CV] max_features=sqrt, m

[CV]  max_features=log2, min_samples_split=2, n_estimators=100, score=0.8252496433666191, total=   0.8s
[CV] max_features=log2, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=log2, min_samples_split=2, n_estimators=100, score=0.8263045032165832, total=   0.8s
[CV] max_features=log2, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=log2, min_samples_split=2, n_estimators=100, score=0.8248749106504646, total=   1.1s
[CV] max_features=log2, min_samples_split=3, n_estimators=5 ..........
[CV]  max_features=log2, min_samples_split=3, n_estimators=5, score=0.776034236804565, total=   0.1s
[CV] max_features=log2, min_samples_split=3, n_estimators=5 ..........
[CV]  max_features=log2, min_samples_split=3, n_estimators=5, score=0.7726947819871337, total=   0.2s
[CV] max_features=log2, min_samples_split=3, n_estimators=5 ..........
[CV]  max_features=log2, min_samples_split=3, n_estimators=5, score=0.773409578270193, total=   0.1s
[CV] max_features=log2, min_s

[CV]  max_features=auto, min_samples_split=3, n_estimators=10, score=0.7945791726105563, total=   0.1s
[CV] max_features=auto, min_samples_split=3, n_estimators=10 .........
[CV]  max_features=auto, min_samples_split=3, n_estimators=10, score=0.7962830593280915, total=   0.1s
[CV] max_features=auto, min_samples_split=3, n_estimators=10 .........
[CV]  max_features=auto, min_samples_split=3, n_estimators=10, score=0.8084345961401, total=   0.1s
[CV] max_features=auto, min_samples_split=3, n_estimators=100 ........
[CV]  max_features=auto, min_samples_split=3, n_estimators=100, score=0.8245363766048502, total=   1.1s
[CV] max_features=auto, min_samples_split=3, n_estimators=100 ........
[CV]  max_features=auto, min_samples_split=3, n_estimators=100, score=0.82987848463188, total=   0.9s
[CV] max_features=auto, min_samples_split=3, n_estimators=100 ........
[CV]  max_features=auto, min_samples_split=3, n_estimators=100, score=0.8148677626876341, total=   0.7s
[CV] max_features=auto, min_s

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:   54.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 100], 'min_samples_split': [2, 3, 4, 5, 10], 'max_features': ['sqrt', 'log2', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [187]:
grid_rf.best_params_

{'max_features': 'log2', 'min_samples_split': 3, 'n_estimators': 100}

In [189]:
model_rf_best = RandomForestClassifier(n_estimators=100,min_samples_split=3, max_features='log2')
model_rf_best.fit(X_train, y_train)
predict = model_rf_best.predict(X_test)
classifi_report = classification_report(y_test, predict)
print(classifi_report)

             precision    recall  f1-score   support

      dance       0.86      0.84      0.85       455
       jazz       0.89      0.90      0.90       437
        rap       0.88      0.89      0.88       455
       rock       0.81      0.81      0.81       453

avg / total       0.86      0.86      0.86      1800



# 4) Successfully find the best audio_features for all classifiers using the following feature selection methods:

# Support Vector Machine - SelectKBest

In [142]:
selectK = SelectKBest(k=5)
X_new = selectK.fit_transform(X, y)
X_new.shape

(6000, 5)

In [191]:
df_features.columns[selectK.get_support(indices=True)].tolist()

['key', 'energy', 'liveliness', 'tempo', 'speechiness']

# Logistic Regression - SelectFromModel - RFE

In [145]:
from numpy import sort
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel, RFE

In [147]:
rfe = RFE(rfc_original, 5)
rfc_5 = rfe.fit(X, y)
print ('5 importance features: ')
df_audio_features_jdrr_final.columns[rfe.get_support(importances)].tolist()

NameError: name 'rfc_original' is not defined