## Data Processing and Visualization
### CPE 695 Final Project - Genre Prediction

Spring 2023

In [1]:
import pandas as pd

DATA_PATH = '../data_processed/Labeled_Data.csv'


def load_data(data_path=DATA_PATH):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the CSV file to read. Defaults to ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with default options.
    """
    loaded_frame = pd.read_csv(data_path)
    return loaded_frame

# Load the labeled dataset from its default location and preview the first five rows
music = load_data(DATA_PATH)
music.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,MB Genre,Artist Name,Release,Song Title,Artist Location,Duration (sec),Key,Key Confidence,...,Interval 3 Count,Interval 4 Count,Interval 5 Count,Interval 6 Count,Interval 7 Count,Interval 8 Count,Interval 9 Count,Interval 10 Count,Interval 11 Count,Interval 12 Count
0,0,0,hip hop,Casual,Fear Itself,I Didn't Mean To,California - LA,218.93179,1,0.736,...,70.0,119.0,38.0,97.0,21.0,51.0,42.0,43.0,90.0,18.0
1,1,1,blue-eyed soul,The Box Tops,Dimensions,Soul Deep,"Memphis, TN",148.03546,6,0.169,...,68.0,36.0,25.0,59.0,5.0,36.0,34.0,15.0,32.0,4.0
2,2,2,latin,Sonora Santanera,Las Numero 1 De La Sonora Santanera,Amor De Cabaret,,177.47546,8,0.643,...,43.0,51.0,23.0,27.0,38.0,38.0,23.0,23.0,21.0,2.0
3,3,3,glam rock,Adam Ant,Friend Or Foe,Something Girls,"London, England",233.40363,0,0.751,...,77.0,54.0,65.0,65.0,13.0,47.0,6.0,21.0,13.0,25.0
4,4,4,alternative metal,Gob,Muertos Vivos,Face the Ashes,,209.60608,2,0.092,...,65.0,34.0,27.0,108.0,20.0,36.0,29.0,24.0,3.0,0.0


In [2]:
# Summarize the raw frame: per-column non-null counts and dtypes
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9421 entries, 0 to 9420
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 9421 non-null   int64  
 1   Unnamed: 0.1               9421 non-null   int64  
 2   MB Genre                   9421 non-null   object 
 3   Artist Name                9421 non-null   object 
 4   Release                    9421 non-null   object 
 5   Song Title                 9421 non-null   object 
 6   Artist Location            5552 non-null   object 
 7   Duration (sec)             9421 non-null   float64
 8   Key                        9421 non-null   int64  
 9   Key Confidence             9421 non-null   float64
 10  Mode                       9421 non-null   int64  
 11  Mode Confidence            9421 non-null   float64
 12  Loudness                   9421 non-null   float64
 13  Tempo                      9421 non-null   float

## Drop unused columns and rows with missing or low-confidence data

In [60]:
import numpy as np

# Build the cleaned frame in one chain: first discard the columns we won't use,
# then filter rows on missing data or low confidence.
# The trailing counts are the rows remaining after each successive filter.
music_clean = (
    music
    .drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Artist Name", "Release", "Song Title", "Artist Location"])
    .loc[lambda songs: songs["MB Genre"] != "0"]                    # Dropped to 7416 instances
    .loc[lambda songs: songs["Key Confidence"] >= 0.001]            # Dropped to 7035 instances
    .loc[lambda songs: songs["Mode Confidence"] >= 0.001]           # Dropped to 7019
    .loc[lambda songs: songs["Tempo"] != 0.0]                       # Dropped to 7007
    .loc[lambda songs: songs["Time Signature Confidence"] != 0.0]   # Dropped to 5465
)
#print(music_clean.loc[:, "End of Fade In"].value_counts())
#music_clean.info()

In [85]:
# print(music_clean.loc[:, "MB Genre"].value_counts())
# Genres that occur more than 31 times in the cleaned data
common_genres = (
    music_clean["MB Genre"]
    .value_counts()
    .loc[lambda genre_counts: genre_counts > 31]
    .index
)

# Keep only the songs whose genre is one of the common ones
music_clean = music_clean.loc[lambda songs: songs["MB Genre"].isin(common_genres)] # Drops to 3980

#music_clean.info()
#print(music_clean.loc[:, "MB Genre"].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3980 entries, 0 to 9420
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   MB Genre                   3980 non-null   object 
 1   Duration (sec)             3980 non-null   float64
 2   Key                        3980 non-null   int64  
 3   Key Confidence             3980 non-null   float64
 4   Mode                       3980 non-null   int64  
 5   Mode Confidence            3980 non-null   float64
 6   Loudness                   3980 non-null   float64
 7   Tempo                      3980 non-null   float64
 8   Time Signature             3980 non-null   int64  
 9   Time Signature Confidence  3980 non-null   float64
 10  End of Fade In             3980 non-null   float64
 11  Interval 1 Count           3980 non-null   float64
 12  Interval 2 Count           3980 non-null   float64
 13  Interval 3 Count           3980 non-null   float

In [None]:

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Histogram of every numeric column in the cleaned data
music_clean.hist(bins=50, figsize=(20,15))
#save_fig("music_histogram_plots")
plt.show()

# Pairwise scatter plots for a handful of attributes.
# Plot the cleaned frame rather than the raw `music` frame so this figure covers
# the same rows as the histograms drawn from `music_clean` in this cell.
attributes = ["Key", "Duration (sec)", "Mode",
              "Tempo", "Interval 1 Count"]
scatter_matrix(music_clean[attributes], figsize=(12, 8))
#save_fig("scatter_matrix_plot")
plt.show()  # render the figure and suppress the array-of-Axes repr

## Run Data Through Pipeline
Encode each genre both as an ordinal number and as a one-hot vector, and standardize selected numeric features.

In [148]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Create and transform columns.
# The genre label is encoded twice (as an ordinal number and as a one-hot vector);
# numeric features are either standardized or passed through unchanged.
full_pipeline = ColumnTransformer(
    [
        ("label_encoder_num", OrdinalEncoder(), ["MB Genre"]),
        ("label_encoder_1hot", OneHotEncoder(), ["MB Genre"]),
        ('duration_scaler', StandardScaler(), ["Duration (sec)"]),
        ('key_mode_passthrough', 'passthrough', ["Key", "Key Confidence","Mode", "Mode Confidence"]),
        ('loudness_tempo_scaler', StandardScaler(), ["Loudness", "Tempo"]),
        ('signature_passthrough', 'passthrough', ["Time Signature", "Time Signature Confidence"]),
        # Positional selection: columns 10 through 22 of music_clean
        ('fade_count_scaler', StandardScaler(), np.r_[10:23]),
    ],
    # OneHotEncoder emits a sparse matrix. With the default threshold the combined
    # output turns sparse whenever its density is low enough, which would break the
    # DataFrame construction below; 0 always yields a dense array.
    sparse_threshold=0,
)

# Apply transformations
music_prepared = pd.DataFrame(full_pipeline.fit_transform(music_clean))

# Generate column names for export.
# The one-hot labels are read from the fitted encoder so they always match the
# order of the encoded columns.
one_hot_genres = full_pipeline.named_transformers_["label_encoder_1hot"].categories_[0].tolist()
column_names = ["Genre Num"] + one_hot_genres + music_clean.columns[1:23].tolist()
music_prepared.columns = column_names
display(music_prepared.iloc[:, 0:45])

# Re-index pre-transformed data for export
music_pre_pipeline = music_clean.reset_index(drop=True)
display(music_pre_pipeline)

Unnamed: 0,MB Genre,Duration (sec),Key,Key Confidence,Mode,Mode Confidence,Loudness,Tempo,Time Signature,Time Signature Confidence,...,Interval 3 Count,Interval 4 Count,Interval 5 Count,Interval 6 Count,Interval 7 Count,Interval 8 Count,Interval 9 Count,Interval 10 Count,Interval 11 Count,Interval 12 Count
0,hip hop,218.93179,1,0.736,0,0.636,-11.197,92.198,4,0.778,...,70.0,119.0,38.0,97.0,21.0,51.0,42.0,43.0,90.0,18.0
1,alternative metal,209.60608,2,0.092,1,0.371,-4.501,129.738,4,0.562,...,65.0,34.0,27.0,108.0,20.0,36.0,29.0,24.0,3.0,0.0
2,gospel,267.70240,5,0.635,1,0.557,-9.323,147.782,3,0.454,...,57.0,30.0,48.0,114.0,20.0,13.0,16.0,32.0,25.0,5.0
3,pop,245.21098,7,0.070,1,0.686,-7.545,117.975,4,0.835,...,129.0,42.0,42.0,67.0,28.0,66.0,20.0,60.0,1.0,0.0
4,contemporary r&b,307.38240,3,0.524,1,0.533,-8.346,125.197,3,0.211,...,107.0,105.0,28.0,82.0,119.0,21.0,26.0,14.0,7.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3975,country,141.73995,2,0.101,1,0.394,-11.756,119.271,4,0.150,...,45.0,87.0,30.0,95.0,14.0,47.0,38.0,8.0,11.0,0.0
3976,rock,386.19383,7,0.374,1,0.540,-8.087,140.185,4,0.099,...,122.0,36.0,33.0,45.0,14.0,53.0,16.0,19.0,1.0,20.0
3977,easy listening,168.01914,8,0.223,1,0.398,-14.517,77.072,3,0.597,...,43.0,28.0,22.0,34.0,5.0,18.0,13.0,14.0,11.0,8.0
3978,reggae,193.72363,1,0.931,1,0.565,-12.087,118.123,4,0.205,...,86.0,74.0,36.0,134.0,38.0,22.0,66.0,18.0,64.0,43.0


Unnamed: 0,Genre Num,alternative hip hop,alternative metal,alternative rock,ambient,ballad,black metal,blues,blues rock,breakbeat,...,punk,reggae,rock,soul,Duration (sec),Key,Key Confidence,Mode,Mode Confidence,Loudness
0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.277421,1.0,0.736,0.0,0.636,-0.210269
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.359554,2.0,0.092,1.0,0.371,1.081623
2,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.152108,5.0,0.635,1.0,0.557,0.151291
3,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.045977,7.0,0.070,1.0,0.686,0.494329
4,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.501575,3.0,0.524,1.0,0.533,0.339788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3975,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.957259,2.0,0.101,1.0,0.394,-0.318120
3976,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.195677,7.0,0.374,1.0,0.540,0.389758
3977,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.725815,8.0,0.223,1.0,0.398,-0.850813
3978,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-0.499432,1.0,0.931,1.0,0.565,-0.381982


## Split Test/Train Data

In [150]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the cleaned (untransformed) songs as the test set;
# the fixed random_state keeps the split reproducible across runs.
music_train, music_test = train_test_split(
    music_clean,
    test_size=0.2,
    random_state=42,
)

# Preview the held-out rows
display(music_test)

Unnamed: 0,MB Genre,Duration (sec),Key,Key Confidence,Mode,Mode Confidence,Loudness,Tempo,Time Signature,Time Signature Confidence,...,Interval 3 Count,Interval 4 Count,Interval 5 Count,Interval 6 Count,Interval 7 Count,Interval 8 Count,Interval 9 Count,Interval 10 Count,Interval 11 Count,Interval 12 Count
938,soul,239.07220,8,0.512,1,0.655,-8.246,90.915,4,0.496,...,143.0,100.0,112.0,121.0,47.0,103.0,69.0,40.0,31.0,4.0
7787,alternative rock,332.79955,4,0.753,1,0.707,-10.372,141.946,4,0.311,...,96.0,44.0,48.0,113.0,16.0,126.0,24.0,13.0,24.0,10.0
1540,electro,385.22730,1,0.060,1,0.273,-6.965,94.005,4,0.704,...,103.0,119.0,134.0,251.0,55.0,81.0,172.0,90.0,113.0,29.0
7378,ballad,260.96281,10,0.397,1,0.429,-3.941,96.087,4,1.000,...,102.0,123.0,97.0,83.0,40.0,68.0,41.0,51.0,27.0,10.0
5460,alternative rock,389.06730,8,0.610,1,0.583,-7.667,196.096,4,0.154,...,104.0,72.0,196.0,192.0,256.0,73.0,35.0,20.0,20.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,downtempo,263.73179,1,0.621,1,0.588,-10.235,86.684,5,1.000,...,72.0,75.0,100.0,67.0,34.0,52.0,30.0,14.0,16.0,1.0
5829,folk rock,299.31057,7,0.439,1,0.491,-5.157,165.278,4,0.383,...,101.0,70.0,65.0,80.0,41.0,86.0,28.0,20.0,4.0,5.0
1140,rock,192.26077,0,1.000,1,0.887,-13.626,160.663,4,0.130,...,50.0,24.0,36.0,85.0,10.0,84.0,6.0,9.0,4.0,7.0
9411,alternative rock,162.95138,5,0.860,1,0.562,-12.673,162.133,4,0.668,...,58.0,22.0,17.0,45.0,11.0,41.0,10.0,36.0,32.0,19.0


## Export Data

In [153]:
from pathlib import Path  

# Destination files for the pre-transformed data and its train/test partitions
data_cleaned = Path('../data_processed/data_cleaned.csv')
training_data = Path('../data_processed/training_data.csv')
test_data = Path('../data_processed/test_data.csv')

# Write each frame as CSV, leaving the DataFrame index out of the file
music_clean.to_csv(data_cleaned, index=False)
music_train.to_csv(training_data, index=False)
music_test.to_csv(test_data, index=False)