In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style
from matplotlib import pyplot as plt

### Data Preprocessing

In [16]:
raw_df = pd.read_csv('spotify_tracks_dataset.csv')

In [17]:
df = raw_df.copy()

# Remove null row
df.dropna(inplace=True)

# Drop Unnamed field and time_signature field
df = df.drop(columns = ["Unnamed: 0","track_id", "artists", "album_name","track_name","time_signature"])

# Map explicit field to binary
df['explicit'] = df['explicit'].apply(lambda x: 1 if x == True else 0)

# One hot encode key field
df = pd.get_dummies(df, columns=['key'], dtype=int)
df = df.rename(columns={"key_0":"C","key_1":"C#","key_2":"D","key_3":"D#","key_4":"E","key_5":"F",
                        "key_6":"F#","key_7":"G","key_8":"G#","key_9":"A","key_10":"A#","key_11":"B"})

df.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,D,D#,E,F,F#,G,G#,A,A#,B
0,73,230666,0,0.676,0.461,-6.746,0,0.143,0.0322,1e-06,...,0,0,0,0,0,0,0,0,0,0
1,55,149610,0,0.42,0.166,-17.235,1,0.0763,0.924,6e-06,...,0,0,0,0,0,0,0,0,0,0
2,57,210826,0,0.438,0.359,-9.734,1,0.0557,0.21,0.0,...,0,0,0,0,0,0,0,0,0,0
3,71,201933,0,0.266,0.0596,-18.515,1,0.0363,0.905,7.1e-05,...,0,0,0,0,0,0,0,0,0,0
4,82,198853,0,0.618,0.443,-9.681,1,0.0526,0.469,0.0,...,1,0,0,0,0,0,0,0,0,0


### Randomize

In [18]:
np.random.seed(0)
indices = np.arange(df.shape[0])
shuffled_indices = np.random.permutation(indices)

df = df.reindex(shuffled_indices)
display(df)

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,D,D#,E,F,F#,G,G#,A,A#,B
45620,51.0,262106.0,0.0,0.621,0.113,-16.842,0.0,0.0491,0.970000,0.834000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8391,0.0,209293.0,0.0,0.649,0.399,-7.894,1.0,0.0256,0.184000,0.000108,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56,51.0,234473.0,0.0,0.526,0.333,-13.020,1.0,0.0314,0.910000,0.168000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82777,22.0,324493.0,0.0,0.552,0.571,-8.283,1.0,0.0286,0.472000,0.000063,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
57024,74.0,262248.0,0.0,0.633,0.550,-9.965,1.0,0.0924,0.649000,0.000363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,0.0,224000.0,0.0,0.555,0.820,-2.663,1.0,0.0782,0.121000,0.000004,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
45891,21.0,166573.0,0.0,0.448,0.150,-22.779,1.0,0.0385,0.772000,0.885000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42613,12.0,71240.0,1.0,0.239,0.961,-5.569,1.0,0.1850,0.000004,0.151000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43567,62.0,202633.0,0.0,0.637,0.936,-5.274,0.0,0.0371,0.046500,0.000150,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Feature Selection

In [19]:
df.columns

Index(['popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'track_genre', 'C', 'C#', 'D', 'D#',
       'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'],
      dtype='object')

In [20]:
columns = ['danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
           'liveness', 'valence', 'tempo', 'track_genre', 'C', 'C#', 'D', 'D#',
           'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
df = df[columns]
display(df)

Unnamed: 0,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,D,D#,E,F,F#,G,G#,A,A#,B
45620,0.621,0.113,-16.842,0.0,0.0491,0.970000,0.834000,0.1160,0.203,107.750,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8391,0.649,0.399,-7.894,1.0,0.0256,0.184000,0.000108,0.1130,0.342,97.498,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56,0.526,0.333,-13.020,1.0,0.0314,0.910000,0.168000,0.1110,0.159,94.951,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82777,0.552,0.571,-8.283,1.0,0.0286,0.472000,0.000063,0.0857,0.331,92.623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
57024,0.633,0.550,-9.965,1.0,0.0924,0.649000,0.000363,0.1470,0.333,147.099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,0.555,0.820,-2.663,1.0,0.0782,0.121000,0.000004,0.1290,0.968,113.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
45891,0.448,0.150,-22.779,1.0,0.0385,0.772000,0.885000,0.1190,0.125,93.857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42613,0.239,0.961,-5.569,1.0,0.1850,0.000004,0.151000,0.3220,0.210,121.329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43567,0.637,0.936,-5.274,0.0,0.0371,0.046500,0.000150,0.1040,0.497,126.027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
##GIVE CREDIT TO KAGGLE HERE
genre_to_category = {
    # EDM
    'edm': 'Electronic Dance Music', 
    'house': 'Electronic Dance Music', 
    'electro': 'Electronic Dance Music', 
    'trance': 'Electronic Dance Music', 
    'techno': 'Electronic Dance Music', 
    'dubstep': 'Electronic Dance Music', 
    'drum-and-bass': 'Electronic Dance Music', 
    'deep-house': 'Electronic Dance Music', 
    'detroit-techno': 'Electronic Dance Music', 
    'minimal-techno': 'Electronic Dance Music', 
    'progressive-house': 'Electronic Dance Music', 
    'breakbeat': 'Electronic Dance Music',
    
    # Rock
    'alt-rock': 'Rock', 
    'rock': 'Rock', 
    'indie': 'Rock', 
    'indie-pop': 'Rock', 
    'punk': 'Rock', 
    'punk-rock': 'Rock', 
    'hard-rock': 'Rock', 
    'metal': 'Rock', 
    'heavy-metal': 'Rock', 
    'black-metal': 'Rock', 
    'death-metal': 'Rock', 
    'grunge': 'Rock',
    
    # Hip-Hop and R&B
    'hip-hop': 'Hip-Hop and R&B', 
    'r-n-b': 'Hip-Hop and R&B', 
    'trap': 'Hip-Hop and R&B',
    
    # Pop
    'pop': 'Pop', 
    'electro-pop': 'Pop', 
    'synth-pop': 'Pop', 
    'k-pop': 'Pop', 
    'pop-film': 'Pop', 
    'power-pop': 'Pop',
    
    # Latin & Reggae/Dancehall
    'latin': 'Latin & Reggae/Dancehall', 
    'reggaeton': 'Latin & Reggae/Dancehall', 
    'salsa': 'Latin & Reggae/Dancehall', 
    'samba': 'Latin & Reggae/Dancehall', 
    'reggae': 'Latin & Reggae/Dancehall', 
    'dancehall': 'Latin & Reggae/Dancehall',
    
    # Funk and Disco
    'funk': 'Funk and Disco', 
    'disco': 'Funk and Disco', 
    'groove': 'Funk and Disco',
}

# Map each track to a category
df['music_category'] = df['track_genre'].apply(lambda x: genre_to_category.get(x, 'Other'))
df['music_category']

45620                       Other
8391                        Other
56                          Other
82777                         Pop
57024                        Rock
                   ...           
21243    Latin & Reggae/Dancehall
45891                       Other
42613                       Other
43567              Funk and Disco
68268                       Other
Name: music_category, Length: 113999, dtype: object

In [22]:
features = ['danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
           'liveness', 'valence', 'tempo', 'C', 'C#', 'D', 'D#',
           'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
label = ['music_category']
X = df[features]
Y = df[label]

In [23]:
#converting Y into label values
Y['music_category'].astype(str)
label_names = list(Y['music_category'].unique())
Y['music_category'] = Y['music_category'].apply(label_names.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['music_category'] = Y['music_category'].apply(label_names.index)


### Train Test Split

In [24]:
X_train_temp, X_val, Y_train_temp, Y_val = train_test_split(X, Y, test_size = int(0.2*len(X)), random_state = 42)
X_train, X_test, Y_train, Y_test = train_test_split(X_train_temp, Y_train_temp, test_size = int(0.2*len(X)), random_state = 42) #needs to be 20% of original size

In [25]:
print(len(X_train))

68401


In [26]:
print(len(X_val))

22799


In [27]:
print(len(X_test))

22799


In [28]:
#save everything to csv
X_val.to_csv('data/processed/X_val.csv', index=False)
Y_val.to_csv('data/processed/Y_val.csv', index=False)
X_train.to_csv('data/processed/X_train.csv', index=False)
Y_train.to_csv('data/processed/Y_train.csv', index=False)
X_test.to_csv('data/processed/X_test.csv', index=False)
Y_test.to_csv('data/processed/Y_test.csv', index=False)

### Modeling Next Steps
* Decision Trees
* KNN
* Logistic Regression / NN
