# Decision Tree Classifier

In [52]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Read dataset

In [76]:
df = pd.read_csv('/Users/haochenyang/Desktop/EECS545/Project/data_merged_quartile.csv')
# mean_popularity = 44.2
# df["popularity"] = [ 1 if i >= mean_popularity else 0 for i in df.popularity ]

In [12]:
df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,Minor,Major
0,0,0.0127,0.622,218293.0,0.89,0.95,0.124,-7.043,0.03,115.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0.00306,0.62,215613.0,0.755,0.0118,0.534,-4.617,0.0345,127.994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,0.0254,0.774,166875.0,0.7,0.00253,0.157,-4.498,0.239,128.014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,0.00465,0.638,222369.0,0.587,0.909,0.157,-6.266,0.0413,145.036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,0.0289,0.572,214408.0,0.803,8e-06,0.106,-4.294,0.351,149.995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Split train and test data

In [77]:
X  = df.drop(['popularity'], axis=1)
y = df['popularity']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

In [79]:
X_train.shape, X_test.shape

((32444, 34), (8112, 34))

## Grid Search

In [80]:
criterion = ['gini', 'entropy']
# max_depth = [2,5,10,15]
# min_samples_split = [2,3,4,6,10,15]

# max_depth = [10,12,15]
# min_samples_split = [6,10,15]

# max_depth = [8,10,12]
# min_samples_split = [8,10,12]
# min_samples_leaf=[1,2,3,4]

max_depth = [8,10,12]
min_samples_split = [8,10,12]
min_samples_leaf=[1,2,3,4]

In [81]:
parameters = dict(max_depth=max_depth,
                 min_samples_split=min_samples_split,
                 min_samples_leaf=min_samples_leaf)

In [82]:
tree_clf = DecisionTreeClassifier()
# tree_clf.fit(X_train,y_train)

In [83]:
tree_clf_gs = GridSearchCV(tree_clf, parameters)
tree_clf_gs.fit(X_train,y_train)

## Best parameters

In [84]:
# print('Best Criterion:', tree_clf_gs.best_estimator_.get_params()['criterion'])
print('Best max_depth:', tree_clf_gs.best_estimator_.get_params()['max_depth'])
print('Best min_samples_split:', tree_clf_gs.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf:', tree_clf_gs.best_estimator_.get_params()['min_samples_leaf'])
# print(); 
# print(clf_GS.best_estimator_.get_params()['dec_tree'])

Best max_depth: 8
Best min_samples_split: 8
Best min_samples_leaf: 1


In [85]:
X_train.columns[2]

'duration_ms'

In [86]:
tree_clf = DecisionTreeClassifier(max_depth=8, max_features='auto', criterion='gini', min_samples_leaf=1,
                               min_samples_split=8)
tree_clf.fit(X_train,y_train)

# from joblib import dump
# dump(tree_clf, 'Saved models/mlspotify_DT')



## Results

In [87]:
y_pred = tree_clf.predict(X_test)

In [88]:
accuracy_score(y_test, y_pred)

0.5479536489151874

In [89]:
y_pred_train = tree_clf.predict(X_train)

In [90]:
accuracy_score(y_pred_train, y_train)

0.5555418567377636

In [91]:
print(tree_clf.tree_.max_depth)

8


In [92]:
print(confusion_matrix(y_test, y_pred))
print("Classification report\n")
print(classification_report(y_test, y_pred))

[[1547  380  199    6]
 [ 530  938  538    3]
 [ 146  445  996  543]
 [  39  162  676  964]]
Classification report

              precision    recall  f1-score   support

         0.0       0.68      0.73      0.70      2132
         1.0       0.49      0.47      0.48      2009
         2.0       0.41      0.47      0.44      2130
         3.0       0.64      0.52      0.57      1841

    accuracy                           0.55      8112
   macro avg       0.56      0.55      0.55      8112
weighted avg       0.55      0.55      0.55      8112



## Plotting Graphs

In [73]:
from sklearn.metrics import roc_curve, roc_auc_score
# predict probabilities
lr_probs = tree_clf.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
lr_auc = roc_auc_score(y_test, lr_probs)
lr_auc

ValueError: multi_class must be in ('ovo', 'ovr')

In [74]:
from matplotlib import pyplot
from numpy import sqrt, argmax, save
import numpy as np
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test, lr_probs)
np.save("fpr_DT", fpr)
np.save("tpr_DT", tpr)
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()

ValueError: multiclass format is not supported

In [75]:
from sklearn.metrics import auc, precision_recall_curve, f1_score
from matplotlib import pyplot
lr_precision, lr_recall, thresholds = precision_recall_curve(y_test, lr_probs)
np.save("lrp_DT", lr_precision)
np.save("lrr_DT", lr_recall)
lr_f1, lr_auc = f1_score(y_test, y_pred), auc(lr_recall, lr_precision)
# summarize scores
print('SVM: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
fscore = (2 * lr_precision * lr_recall) / (lr_precision + lr_recall)
# plot the roc curve for the model
no_skill = len(y_test[y_test==1]) / len(y_test)
print(no_skill)
pyplot.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
# show the plot
pyplot.show()

ValueError: multiclass format is not supported