<a href="https://colab.research.google.com/github/tannerskluz/SpotifyClassifier/blob/main/base_nb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load song dataset

In [None]:
# Read the song dataset CSV straight from the project's GitHub repository.
url = 'https://raw.githubusercontent.com/tannerskluz/SpotifyClassifier/main/data.csv'
df = pd.read_csv(url)
# Trailing expression: the notebook displays the first rows as a quick preview.
df.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Per-column missing-value count. NOTE: in pandas `isnull` is an alias of `isna`,
# so this yields the same result as an `isnull().sum()` check.
df.isna().sum()

In [None]:
# Frequency of each value in the `explicit` column, to check the class balance.
df['explicit'].value_counts() # very imbalanced

In [None]:
# Remove the identifier / free-text columns, then preview the remaining frame.
metadata_columns = ['id', 'name', 'artists', 'release_date']
df = df.drop(columns=metadata_columns)
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the remaining columns.
corr_matrix = df.corr()
plt.figure(figsize=(14, 12))
sns.heatmap(data=corr_matrix, annot=True, cmap='cubehelix_r')
plt.show()

In [None]:
# Further feature selection driven by the correlation heatmap: these columns were
# judged to have less than .15 absolute correlation and are removed.
weak_features = [
    'duration_ms', 'key', 'liveness', 'mode',
    'tempo', 'valence', 'loudness', 'instrumentalness',
]
df = df.drop(columns=weak_features)

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from imblearn.over_sampling import SMOTE # pip3 install imblearn delayed

import pydotplus
from sklearn.tree import export_graphviz

# helper function for rendering tree
def tree_graph_to_png(tree, feature_names, png_file_to_save):
    """Render a decision tree to a PNG image on disk.

    Args:
        tree: decision tree estimator handed to ``export_graphviz``.
        feature_names: feature names forwarded to ``export_graphviz`` for
            labelling the split nodes.
        png_file_to_save: path of the PNG file to write.
    """
    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_source = export_graphviz(tree, out_file=None, filled=True,
                                 feature_names=feature_names)
    pydotplus.graph_from_dot_data(dot_source).write_png(png_file_to_save)

In [None]:
# imblearn's Pipeline (unlike sklearn's) accepts samplers as steps and applies
# them only while fitting. Imported here so this cell is self-contained.
from imblearn.pipeline import Pipeline

# split dataset into features and target variable
X = df.drop(columns='explicit')
y = df.explicit

# generate synthetic samples from minority class using kNN
sm = SMOTE(random_state=0, sampling_strategy='minority')

# Resampled copy of the full dataset, kept for inspection (e.g. checking the new
# class balance). It is deliberately NOT used for cross-validation: oversampling
# before splitting lets synthetic points interpolated from held-out rows leak
# into the training folds, which inflates every test metric.
X_res, y_res = sm.fit_resample(X, y)

# instantiate tree
clf_tree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)

# Oversample inside the pipeline so SMOTE is re-fit on each training fold only,
# and each test fold is scored on original, un-resampled data.
clf_pipeline = Pipeline(steps=[('smote', sm), ('tree', clf_tree)])

scoring = {'acc' : 'accuracy',  # dict of metrics
           'prec': 'precision',
           'rec' : 'recall',
           'f1'  : 'f1'}

# 5-fold cv on the original (imbalanced) data; resampling happens per fold
scores = cross_validate(estimator=clf_pipeline, X=X, y=y, cv=5, scoring=scoring,
                        return_train_score=True, return_estimator=True)

# cross_validate returns the fitted pipelines; expose the fitted trees themselves
# so scores['estimator'][i] can still be passed straight to the tree renderer.
scores['estimator'] = [fitted.named_steps['tree'] for fitted in scores['estimator']]

# print eval metrics for each fold
print("Accuracy:", scores['test_acc'])
print("Precision:", scores['test_prec'])
print("Recall:", scores['test_rec'])
print("f-measure:", scores['test_f1'])

In [None]:
# visualize decision tree
from IPython.display import HTML, Image

PATH = 'explicit_clf_tree.png'

# Render the fold whose tree has the best test f-measure, chosen from the scores
# instead of a hard-coded index so it stays correct if the CV results change.
# (Open question from the original notebook: should other metrics be considered?)
best_fold = int(scores['test_f1'].argmax())

# Feature names must follow the column order the tree was fit on: export_graphviz
# matches names to features by position, so sorting them could mislabel nodes.
tree_graph_to_png(tree=scores['estimator'][best_fold], feature_names=list(X.columns),
                  png_file_to_save=PATH)

# Trailing expression: the notebook displays the rendered PNG inline.
Image(filename=PATH)