## Data Processing and Visualization
### CPE 695 Final Project - Genre Prediction

Spring 2023

In [1]:
import pandas as pd

DATA_PATH = '../data_processed/MSD_Desired_Features_With_Labels.csv'

def load_data(data_path=DATA_PATH):
    return pd.read_csv(data_path)

music = load_data()
music.head()

Unnamed: 0.1,Unnamed: 0,Genre,Segment Pitch 1 Average,Segment Timbre 1 Average,Segment Pitch 1-1 Covariance,Segment Timbre 1-1 Covariance,Segment Pitch 1-2 Covariance,Segment Timbre 1-2 Covariance,Segment Pitch 1-3 Covariance,Segment Timbre 1-3 Covariance,...,Segment Pitch 11 Average,Segment Timbre 11 Average,Segment Pitch 11-11 Covariance,Segment Timbre 11-11 Covariance,Segment Pitch 11-12 Covariance,Segment Timbre 11-12 Covariance,Segment Pitch 12 Average,Segment Timbre 12 Average,Segment Pitch 12-12 Covariance,Segment Timbre 12-12 Covariance
0,0,hip hop,0.471181,41.512778,0.088775,24.473705,0.025206,59.758068,0.008047,193.941455,...,0.30668,-13.56328,0.08351,601.389784,0.00833,-23.752621,0.324174,5.44248,0.114662,314.55638
1,1,blue-eyed soul,0.207747,43.071036,0.034785,34.255008,0.026356,-9.004907,0.011916,50.112214,...,0.218373,-2.752638,0.02655,259.467441,0.011148,-37.508893,0.291395,1.729396,0.093845,146.412662
2,2,latin,0.407313,45.130815,0.119181,17.941269,0.044909,90.896563,0.010286,98.513702,...,0.453607,8.442069,0.121259,360.639376,0.01183,11.974532,0.197391,4.477171,0.037107,403.882977
3,3,glam rock,0.497093,45.800256,0.103584,42.411902,0.013785,-56.275854,-0.022079,-23.051592,...,0.253177,0.261616,0.037001,156.177563,0.026112,29.548883,0.413878,-2.427598,0.077062,347.652176
4,4,alternative metal,0.409174,50.251554,0.090325,17.901274,0.03492,-8.965879,0.007081,46.602124,...,0.3383,5.448426,0.055032,212.20117,0.011682,4.401661,0.278859,1.704516,0.030253,119.497075


In [2]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9423 entries, 0 to 9422
Columns: 182 entries, Unnamed: 0 to Segment Timbre 12-12 Covariance
dtypes: float64(180), int64(1), object(1)
memory usage: 13.1+ MB


## Drop rows with missing data

In [4]:
import numpy as np

# Drop columns we won't use
# music_clean = music.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Artist Name", "Release", "Song Title", "Artist Location"])
music_clean = music.drop(columns=["Unnamed: 0"])

# Drop rows of data based on missing data or low confidence
# music_clean = music_clean[music_clean.loc[:, "MB Genre"] != "0"] # Dropped to 7416 instances
# music_clean = music_clean[music_clean.loc[:, "Key Confidence"] >= 0.001] # Dropped to 7035 instances
# music_clean = music_clean[music_clean.loc[:, "Mode Confidence"] >= 0.001] # Dropped to 7019
# music_clean = music_clean[music_clean.loc[:, "Tempo"] != 0.0] # Dropped to 7007
# music_clean = music_clean[music_clean.loc[:, "Time Signature Confidence"] != 0.0] # Dropped to 5465

music_clean = music_clean[music_clean.loc[:, "Genre"] != "0"] 
# print(music_clean.loc[:, "End of Fade In"].value_counts())
music_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7417 entries, 0 to 9422
Columns: 181 entries, Genre to Segment Timbre 12-12 Covariance
dtypes: float64(180), object(1)
memory usage: 10.3+ MB


In [27]:
# print(music_clean.loc[:, "MB Genre"].value_counts())
# common_genres = music_clean.loc[:, "MB Genre"].value_counts().index[music_clean.loc[:, "MB Genre"].value_counts() > 31]
common_genres = music_clean.loc[:, "Genre"].value_counts().index[music_clean.loc[:, "Genre"].value_counts() > 31]

# music_clean = music_clean[music_clean["MB Genre"].isin(common_genres)] # Drops to 3980
music_clean = music_clean[music_clean["Genre"].isin(common_genres)]

music_clean.info()
#print(music_clean.loc[:, "MB Genre"].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5821 entries, 0 to 9422
Columns: 181 entries, Genre to Segment Timbre 12-12 Covariance
dtypes: float64(180), object(1)
memory usage: 8.1+ MB


In [None]:

import matplotlib.pyplot as plt
music_clean.hist(bins=50, figsize=(20,15))
#save_fig("music_histogram_plots")
plt.show()

from pandas.plotting import scatter_matrix

attributes = ["Key", "Duration (sec)", "Mode",
              "Tempo", "Interval 1 Count"]
scatter_matrix(music[attributes], figsize=(12, 8))
#save_fig("scatter_matrix_plot")

## Run Data Through Pipeline
Encode genres into one hot encoded lists

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Create and transform columns
# full_pipeline = ColumnTransformer([
#         ("label_encoder_num", OrdinalEncoder(), ["MB Genre"]),
#         ("label_encoder_1hot", OneHotEncoder(), ["MB Genre"]),
#         ('duration_scaler', StandardScaler(), ["Duration (sec)"]),
#         ('key_mode_passthrough', 'passthrough', ["Key", "Key Confidence","Mode", "Mode Confidence"]),
#         ('loudness_tempo_scaler', StandardScaler(), ["Loudness", "Tempo"]),
#         ('signature_passthrough', 'passthrough', ["Time Signature", "Time Signature Confidence"]),
#         ('fade_count_scaler', StandardScaler(), np.r_[10:23])
# ])

full_pipeline = ColumnTransformer([
        ("label_encoder_num", OrdinalEncoder(), ["Genre"]),
        ("label_encoder_1hot", OneHotEncoder(), ["Genre"]),
        ("all_features_scaler", StandardScaler(), np.r_[1:180])
])

# Apply transformations
music_prepared = pd.DataFrame(full_pipeline.fit_transform(music_clean))

# Generate column names for export
column_names = ["Genre Num"] + sorted(common_genres.values.tolist()) + music_clean.columns[1:-1].tolist()
display(column_names)
music_prepared.columns = column_names
display(music_prepared)

# Re-index pre-transformed data for export
music_pre_pipeline = music_clean.reset_index(drop=True)
display(music_pre_pipeline)

['Genre Num',
 'alternative hip hop',
 'alternative metal',
 'alternative rock',
 'ambient',
 'arena rock',
 'ballad',
 'black metal',
 'bluegrass',
 'blues',
 'blues rock',
 'bossa nova',
 'breakbeat',
 'celtic',
 'classic rock',
 'classical',
 'contemporary christian',
 'contemporary jazz',
 'contemporary r&b',
 'country',
 'dance',
 'dance-pop',
 'dancehall',
 'death metal',
 'disco',
 'downtempo',
 'dub',
 'east coast hip hop',
 'easy listening',
 'electro',
 'electronic',
 'folk',
 'folk rock',
 'funk',
 'gangsta rap',
 'garage rock',
 'gospel',
 'hard rock',
 'hardcore punk',
 'heavy metal',
 'hip hop',
 'indie rock',
 'jazz',
 'latin',
 'pop',
 'pop rock',
 'punk',
 'reggae',
 'rock',
 'soul',
 'Segment Pitch 1 Average',
 'Segment Timbre 1 Average',
 'Segment Pitch 1-1 Covariance',
 'Segment Timbre 1-1 Covariance',
 'Segment Pitch 1-2 Covariance',
 'Segment Timbre 1-2 Covariance',
 'Segment Pitch 1-3 Covariance',
 'Segment Timbre 1-3 Covariance',
 'Segment Pitch 1-4 Covariance',

Unnamed: 0,Genre Num,alternative hip hop,alternative metal,alternative rock,ambient,arena rock,ballad,black metal,bluegrass,blues,...,Segment Timbre 10-12 Covariance,Segment Pitch 11 Average,Segment Timbre 11 Average,Segment Pitch 11-11 Covariance,Segment Timbre 11-11 Covariance,Segment Pitch 11-12 Covariance,Segment Timbre 11-12 Covariance,Segment Pitch 12 Average,Segment Timbre 12 Average,Segment Pitch 12-12 Covariance
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.509079,-0.006214,-2.824227,0.518256,1.430809,-0.158348,-0.520864,-0.083278,0.330686,1.193979
1,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.133209,1.354770,1.974628,1.798187,0.225926,0.054966,0.269193,-1.196825,0.213932,-1.320867
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.408016,0.286686,1.321784,-0.447297,-0.516962,0.045937,0.101730,-0.481287,-0.121419,-1.543109
3,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.303474,-0.213833,0.622588,1.204787,-0.524490,-0.234485,0.183555,-1.146397,0.814155,-1.020907
4,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.606367,-0.837243,0.223482,-0.892706,-0.048770,0.267275,0.943548,-0.821958,-0.138298,-1.226174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5816,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.067620,-1.123068,0.911064,-1.122541,0.381602,-0.207647,-1.474779,-0.130574,1.492298,0.276209
5817,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.140604,-1.014443,1.623150,-1.563237,-0.797037,-0.579343,0.630262,-0.070500,0.891745,-0.179953
5818,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.923528,-0.698677,0.622171,0.381122,-0.320124,0.483333,0.358271,-1.186754,1.305178,-0.856783
5819,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.525673,-0.935141,-0.737309,-0.214280,1.941355,-1.701072,-2.144900,1.378608,-0.831515,1.267504


Unnamed: 0,Genre,Segment Pitch 1 Average,Segment Timbre 1 Average,Segment Pitch 1-1 Covariance,Segment Timbre 1-1 Covariance,Segment Pitch 1-2 Covariance,Segment Timbre 1-2 Covariance,Segment Pitch 1-3 Covariance,Segment Timbre 1-3 Covariance,Segment Pitch 1-4 Covariance,...,Segment Pitch 11 Average,Segment Timbre 11 Average,Segment Pitch 11-11 Covariance,Segment Timbre 11-11 Covariance,Segment Pitch 11-12 Covariance,Segment Timbre 11-12 Covariance,Segment Pitch 12 Average,Segment Timbre 12 Average,Segment Pitch 12-12 Covariance,Segment Timbre 12-12 Covariance
0,hip hop,0.471181,41.512778,0.088775,24.473705,0.025206,59.758068,0.008047,193.941455,0.008531,...,0.306680,-13.563280,0.083510,601.389784,0.008330,-23.752621,0.324174,5.442480,0.114662,314.556380
1,latin,0.407313,45.130815,0.119181,17.941269,0.044909,90.896563,0.010286,98.513702,-0.005027,...,0.453607,8.442069,0.121259,360.639376,0.011830,11.974532,0.197391,4.477171,0.037107,403.882977
2,alternative metal,0.409174,50.251554,0.090325,17.901274,0.034920,-8.965879,0.007081,46.602124,0.010843,...,0.338300,5.448426,0.055032,212.201170,0.011682,4.401661,0.278859,1.704516,0.030253,119.497075
3,gospel,0.602210,43.398847,0.119990,37.504854,0.028724,53.487959,-0.014175,68.094775,-0.004426,...,0.284266,2.242232,0.103758,210.697038,0.007081,8.101885,0.203133,9.439756,0.046357,347.196470
4,pop,0.468588,45.012354,0.105784,34.059246,0.026610,129.908289,-0.032692,149.439266,0.004605,...,0.216965,0.412119,0.041895,305.751772,0.015314,42.469511,0.240072,1.564965,0.040027,432.302136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5816,blues,0.398470,34.874122,0.081013,26.720687,0.040225,-140.878550,-0.011674,146.610013,0.004146,...,0.186108,3.565054,0.035116,391.745285,0.007521,-66.889565,0.318789,15.046568,0.086359,228.320156
5817,rock,0.368072,44.935159,0.129540,67.778171,0.045689,585.433711,-0.001274,-56.590727,-0.004793,...,0.197835,6.830350,0.022119,156.238759,0.001422,28.302421,0.325629,10.081257,0.072291,423.222029
5818,easy listening,0.413326,38.707117,0.128164,33.881589,0.031499,24.367544,-0.005635,177.943323,-0.029212,...,0.231924,2.240321,0.079465,251.531843,0.018860,16.002692,0.198538,13.499479,0.051419,381.583623
5819,reggae,0.579638,36.633664,0.108169,41.737965,0.077773,-74.742156,0.024128,247.879996,-0.005260,...,0.206396,-3.993628,0.061904,703.403120,-0.016985,-97.193059,0.490617,-4.166480,0.116930,356.456204


## Split Test/Train Data

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

music_train, music_test = train_test_split(music_prepared, test_size=0.2, random_state=42)

display(music_test)

Unnamed: 0,Genre Num,alternative hip hop,alternative metal,alternative rock,ambient,arena rock,ballad,black metal,bluegrass,blues,...,Segment Timbre 10-12 Covariance,Segment Pitch 11 Average,Segment Timbre 11 Average,Segment Pitch 11-11 Covariance,Segment Timbre 11-11 Covariance,Segment Pitch 11-12 Covariance,Segment Timbre 11-12 Covariance,Segment Pitch 12 Average,Segment Timbre 12 Average,Segment Pitch 12-12 Covariance
544,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.768299,0.006903,-1.171889,1.043059,0.288374,0.443404,0.594411,0.244136,-0.065682,-0.390663
1892,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.226787,0.287600,-0.708549,-0.040690,1.374864,0.642651,-0.919379,0.106176,-0.623598,-0.199669
4198,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.290686,0.358253,-1.665067,1.091802,0.420739,0.152013,-0.310933,0.592877,0.955860,1.152511
5426,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.342854,0.920468,0.193604,0.678623,0.391734,1.007236,-0.635892,-0.471710,0.883130,-0.292494
1632,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.077847,-0.857098,0.439596,-1.766072,-0.990130,-0.178303,0.173161,-0.426050,0.683969,-1.419930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2291,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-2.244357,-1.401580,-0.807840,-0.594629,-0.212808,1.453344,-0.498756,-0.105682,-0.212156,1.034545
4284,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.394001,0.074592,0.185310,0.248543,-1.043039,-0.032863,0.641315,-1.110749,-0.560447,-1.575299
5139,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.638047,0.284317,0.958043,-0.159630,-0.265785,0.171531,-0.531368,0.696474,0.128301,-0.508797
252,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.839716,-0.505885,0.394820,-1.449876,-1.162723,0.233986,0.604337,0.280947,-0.718003,-0.759241


## Export Data

In [37]:
from pathlib import Path  

# Export pre-transformed data
data_cleaned = Path('../data_processed/data_cleaned_NEW.csv')
music_clean.to_csv(path_or_buf=data_cleaned, index=False)

# Export train and test data
training_data = Path('../data_processed/training_data_NEW.csv')
test_data = Path('../data_processed/test_data_NEW.csv')

music_train.to_csv(path_or_buf=training_data, index=False)
music_test.to_csv(path_or_buf=test_data, index=False)