In [127]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import pickle
import numpy as np
from dtreeviz.trees import dtreeviz
import matplotlib.pyplot as plt



In [70]:
# Spotify track data, read from the byui-cse/cse450-course GitHub repository.
SONG_DATA_URL = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/spotify/data.csv"
song_data = pd.read_csv(SONG_DATA_URL)

In [71]:
# Discard every row that has at least one missing value, then preview the result.
song_data = song_data.dropna(axis=0, how="any")
song_data.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [72]:
# Number of tracks per release year, most frequent year first.
songs_per_year = song_data["year"].value_counts()
songs_per_year

1970    2000
1984    2000
1968    2000
1969    2000
2019    2000
        ... 
1925     263
1924     237
1923     169
1921     128
1922      72
Name: year, Length: 100, dtype: int64

In [73]:
# Print each column's dtype and non-null count for the frame after dropna().
song_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169909 entries, 0 to 169908
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      169909 non-null  float64
 1   artists           169909 non-null  object 
 2   danceability      169909 non-null  float64
 3   duration_ms       169909 non-null  int64  
 4   energy            169909 non-null  float64
 5   explicit          169909 non-null  int64  
 6   id                169909 non-null  object 
 7   instrumentalness  169909 non-null  float64
 8   key               169909 non-null  int64  
 9   liveness          169909 non-null  float64
 10  loudness          169909 non-null  float64
 11  mode              169909 non-null  int64  
 12  name              169909 non-null  object 
 13  popularity        169909 non-null  int64  
 14  release_date      169909 non-null  object 
 15  speechiness       169909 non-null  float64
 16  tempo             16

In [74]:
# helper function that assigns a binned popularity label to one row
def popularity_calc(data):
    """Map a row's "popularity" value to a category label.

    Args:
        data: Mapping-like row (e.g. a pandas Series) with a "popularity" entry.

    Returns:
        "hit" for values >= 67, "mid" for values in (33, 66], "flop" for
        values in [0, 33]. Any value outside those ranges (negative, or
        strictly between 66 and 67) yields None.
    """
    score = data["popularity"]
    if score >= 67:
        return "hit"
    if 33 < score <= 66:
        return "mid"
    if 0 <= score <= 33:
        return "flop"
    return None




In [75]:
# Label every track as "hit", "mid" or "flop".
# Only the "popularity" column is handed to the row-wise apply: popularity_calc
# reads nothing else, and building a full 19-column row object for each of the
# ~170k tracks is needlessly slow. The resulting labels are unchanged.
# (The column name keeps its original spelling because later cells refer to it.)
song_data["popularity_catagory"] = song_data[["popularity"]].apply(popularity_calc, axis=1)


In [76]:
# Preview the first rows, now including the popularity_catagory column.
song_data.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,popularity_catagory
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928,flop
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928,flop
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928,flop
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928,flop
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928,flop


In [77]:
# Keep only the two extreme classes; "mid" tracks are excluded from here on.
is_flop_or_hit = song_data["popularity_catagory"].isin(["flop", "hit"])
flops_hits = song_data[is_flop_or_hit]

In [78]:
# Class sizes once the "mid" tracks have been removed.
flop_hit_counts = flops_hits["popularity_catagory"].value_counts()
flop_hit_counts

flop    84966
hit      7297
Name: popularity_catagory, dtype: int64

In [80]:
# Restrict the data to tracks released after 1970 so the analysis
# looks at relevant modern music only.
released_after_1970 = flops_hits["year"].gt(1970)
modern = flops_hits.loc[released_after_1970]

In [81]:
# Drop the artists column, since we only want to look at musical attributes.
# drop() names the column on `modern` itself (the previous mask was built from
# song_data.columns, a different frame, and only worked while both frames had
# identical columns) and returns a new DataFrame rather than a .loc slice.
modern_data = modern.drop(columns=["artists"])

In [82]:
# Class sizes within the post-1970 subset.
modern_class_counts = modern_data["popularity_catagory"].value_counts()
modern_class_counts

flop    21067
hit      7118
Name: popularity_catagory, dtype: int64

In [83]:
# make a new encoded column for popularity 0 means flop and 1 means hit

def pop_encoder(data):
    """Encode a row's popularity category as an integer.

    Args:
        data: Mapping-like row with a "popularity_catagory" entry.

    Returns:
        1 when the category equals "hit", 0 for anything else.
    """
    return 1 if data["popularity_catagory"] == "hit" else 0

# Attach the 0/1 target column.
# assign() builds a new frame instead of writing a column into modern_data in
# place, which avoids pandas' SettingWithCopyWarning on a frame derived by
# slicing. Only the category column is passed to the row-wise apply because
# pop_encoder reads nothing else.
modern_data = modern_data.assign(
    pop_encoded=modern_data[["popularity_catagory"]].apply(pop_encoder, axis=1)
)



In [84]:
# The encoded target should mirror the flop/hit counts (0 = flop, 1 = hit).
encoded_counts = modern_data["pop_encoded"].value_counts()
encoded_counts

0    21067
1     7118
Name: pop_encoded, dtype: int64

In [85]:
# Choose the model inputs and the encoded target, then split them into
# separate feature and label frames.
features = [
    "acousticness",
    "danceability",
    "energy",
    "explicit",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "valence",
]
target = ["pop_encoded"]

# Remove any rows that still contain missing values before modelling.
modern_data = modern_data.dropna()

X_raw = modern_data.loc[:, features]
y = modern_data.loc[:, target]


In [86]:
# Rescale every feature to the [0, 1] range (min-max scaling).
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_raw)
X = scaler.transform(X_raw)

In [93]:
# Oversample the minority ("hit") class so we have a balanced dataset.
# random_state pins the otherwise random choice of duplicated rows, so a
# re-run of the notebook reproduces the same resampled data.
random_oversampler = RandomOverSampler(random_state=42)
X_final, y_final = random_oversampler.fit_resample(X, y)
# y is a single-column frame; flatten the labels to 1-D, the shape
# scikit-learn classifiers expect (a column vector makes
# KNeighborsClassifier.fit emit a DataConversionWarning).
y_final = np.asarray(y_final).ravel()



In [94]:
# Sanity check: after resampling, each class should appear equally often.
hits = pd.DataFrame(data=y_final)
hits.value_counts()

1    21067
0    21067
dtype: int64

In [95]:
# Features and labels must have the same number of rows.
for resampled in (X_final, y_final):
    print(len(resampled))

42134
42134


In [97]:
# Split into training and testing sets (33% held out for testing).
# random_state fixes the shuffle so the split, and the scores computed from it,
# are reproducible across runs.
# NOTE(review): X_final / y_final were oversampled *before* this split, so
# copies of the same "hit" row can land in both the training and the test set.
# That leaks test rows into training and makes the reported accuracy
# optimistic. Consider splitting first and oversampling the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.33, random_state=42
)

In [98]:
# Train a k-nearest-neighbours classifier (k=5, KD-tree neighbour search)
# and save the fitted model to disk.
kd_tree = KNeighborsClassifier(n_neighbors=5, algorithm="kd_tree")
kd_tree.fit(X_train, y_train)
kd_file = "kd_tree.sav"
# The context manager closes the file once the model is written; the previous
# bare open() left the handle open for the lifetime of the kernel.
with open(kd_file, mode="wb") as kd_out:
    pickle.dump(kd_tree, file=kd_out, protocol=5)



In [108]:
# Train a depth-limited decision tree so we can see the most important
# features, and save the fitted model to disk.
# random_state makes tie-breaking between equally good splits deterministic,
# so the same tree is rebuilt on every run.
d_tree = DecisionTreeClassifier(max_depth=7, random_state=42)
d_tree.fit(X_train, y_train)
d_file = "d_tree.sav"
# The context manager closes the file once the model is written; the previous
# bare open() left the handle open for the lifetime of the kernel.
with open(d_file, mode="wb") as d_out:
    pickle.dump(d_tree, file=d_out, protocol=5)

In [140]:
# Report the mean accuracy of both models on the held-out test set.
kd_accuracy = kd_tree.score(X_test, y_test)
d_accuracy = d_tree.score(X_test, y_test)

print("kd_tree score: %.2f,  d_tree score: %.2f" % (kd_accuracy, d_accuracy))

kd_tree score: 0.82,  d_tree score: 0.83


In [138]:
# Visualize the fitted decision tree to see which features drive the splits.
fig, ax = plt.subplots(figsize=(40, 20))
plot_tree(
    d_tree,
    fontsize=12,
    # Pass a plain list: some scikit-learn releases validate feature_names as a
    # list and reject a pandas Index.
    feature_names=list(X_raw.columns),
    # class_names must follow ascending label order: 0 = "flop", 1 = "hit".
    # The previous ["hit", "flop"] labelled every node with the wrong class.
    class_names=["flop", "hit"],
    # Draw on the axes created above rather than the implicit current axes.
    ax=ax,
)
# Save before show(): once show() has displayed the figure, a following
# plt.savefig() writes an empty canvas.
fig.savefig("decision_tree.png")
plt.show()