In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
#read data from csv files -> na values are '?', '0/4', -1 as identified in eda

alternative = pd.read_csv('training-data/alternative.csv', na_values=['?', '0/4', -1])
blues = pd.read_csv('training-data/blues.csv', na_values=['?', '0/4', -1])
classical = pd.read_csv('training-data/classical.csv', na_values=['?', '0/4', -1])
comedy = pd.read_csv('training-data/comedy.csv', na_values=['?', '0/4', -1])
folk = pd.read_csv('training-data/folk.csv', na_values=['?', '0/4', -1])
hiphop = pd.read_csv('training-data/hip-hop.csv', na_values=['?', '0/4', -1])
jazz = pd.read_csv('training-data/jazz.csv', na_values=['?', '0/4', -1])
opera = pd.read_csv('training-data/opera.csv', na_values=['?', '0/4', -1])
pop = pd.read_csv('training-data/pop.csv', na_values=['?', '0/4', -1])
rnb = pd.read_csv('training-data/rb.csv', na_values=['?', '0/4', -1])

df = pd.concat([alternative, blues, classical, comedy, folk, hiphop, jazz, opera, pop, rnb], axis=0) #concatenate all genre dataframes

In [3]:
df = df.drop(['instance_id', 'track_id', 'track_name', 'key', 'mode'], axis=1) #drop columns that are not useful for classification
train_data = df.drop(['genre'], axis=1) #drop target column
test_data = df['genre'] #target column

In [4]:
#identify numerical and cateogircal features for encoding

numerical_features = ['popularity', 'acousticness', 'danceability',
                      'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'duration_ms']

categorical_features = ['artist_name', 'time_signature']

In [5]:
# perform imputation scaling encoding

si_num = SimpleImputer(strategy='median') #initialize imputer for numerical features
scaler = StandardScaler() #initialize standard scaler for numerical features
si_cat = SimpleImputer(strategy='most_frequent') #initialize imputer for categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False) #initialize one hot encoder for categorical features

train_data[numerical_features] = si_num.fit_transform(train_data[numerical_features]) #impute missing values in numerical features
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features]) #scale numerical features

train_data[categorical_features] = si_cat.fit_transform(train_data[categorical_features]) #impute missing values in categorical features
encoded_cat_features = ohe.fit_transform(train_data[categorical_features]) #encode categorical features
encoded_cat_df = pd.DataFrame(encoded_cat_features, columns=ohe.get_feature_names_out(categorical_features)) #convert encoded features to dataframe

train_data_numerical_df = pd.DataFrame(train_data[numerical_features]).reset_index(drop=True) #sort index issues
encoded_cat_df = encoded_cat_df.reset_index(drop=True) #sort index issues

processed_features = pd.concat([train_data_numerical_df, encoded_cat_df], axis=1) #concatenate numerical and categorical features

In [6]:
#split training into train/test data

x_train, x_test, y_train, y_test = train_test_split(processed_features, test_data, test_size=0.2, random_state=42)

In [7]:
#encode genre values in target test sets

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [8]:
#initialize classifiers

rf = RandomForestClassifier(random_state=309)
knn = KNeighborsClassifier(n_neighbors=3)
gnb = GradientBoostingClassifier(n_estimators=20)
mlpc = MLPClassifier(max_iter=1000)

In [9]:
#training rf model

rf.fit(x_train, y_train) #fit rf
rf_pred = rf.predict(x_test) #predict rf
rf_acc = accuracy_score(y_test, rf_pred) #calc accuracy

In [10]:
#training gnb model

gnb.fit(x_train, y_train) #fit gnb
gnb_pred = gnb.predict(x_test) #predict gnb
gnb_acc = accuracy_score(y_test, gnb_pred) #calc accuracy

In [11]:
#training mlpc model

mlpc.fit(x_train, y_train) #fit mlpc
mlpc_pred = mlpc.predict(x_test) #predict mlpc
mlpc_acc = accuracy_score(y_test, mlpc_pred) #calc accuracy

In [12]:
#training stacking model

combination = StackingClassifier(estimators=[('rf', rf), ('gnb', gnb), ('mlpc', mlpc)], cv="prefit") #stack rf, gnb, mlpc in model with cv as prefit
combination.fit(x_train, y_train) #fit model
combination_predictions = combination.predict(x_test) #predict model
combination_accuracy = accuracy_score(y_test, combination_predictions) #calc accuracy

In [13]:
#print out accuracies

print("Random Forest Accuracy: ", rf_acc)
print("Gradient Boosting Accuracy: ", gnb_acc)
print("MLPC Accuracy: ", mlpc_acc)
print("Stacking Accuracy: ", combination_accuracy)

# NORMAL with removed duration_ms, key, mode
# Random Forest Accuracy: 0.6234
# Gradient Boosting Accuracy: 0.6268
# MLPC Accuracy: 0.6515
# Stacking Accuracy: 0.6223

# NORMAL with removed duration_ms
# Random Forest Accuracy: 0.6233
# Gradient Boosting Accuracy: 0.626
# MLPC Accuracy: 0.6366
# Stacking Accuracy: 0.6223

# NORMAL with removed key, mode
# Random Forest Accuracy: 0.6271
# Gradient Boosting Accuracy: 0.6269
# MLPC Accuracy: 0.6525
# Stacking Accuracy: 0.6266

# NORMAL with everything
# Random Forest Accuracy: 0.6276
# Gradient Boosting Accuracy: 0.6269
# MLPC Accuracy: 0.6399
# Stacking Accuracy: 0.6288

Random Forest Accuracy:  0.6772
Gradient Boosting Accuracy:  0.6436
MLPC Accuracy:  0.7801
Stacking Accuracy:  0.7509


In [14]:
#read testing data -> na values are '?', '0/4', -1 as identified in eda

test_df = pd.read_csv('testing-data/testing-instances.csv', na_values=['?', '0/4', -1])
id = test_df['instance_id']
testing = test_df.drop(['instance_id', 'track_id', 'track_name', 'key', 'mode'], axis=1) #drop columns that are not useful for classification

In [15]:
#perform the same preprocessing as the training data

testing[numerical_features] = si_num.transform(testing[numerical_features]) #impute missing values in numerical features
testing[numerical_features] = scaler.transform(testing[numerical_features]) #scale numerical features

testing[categorical_features] = si_cat.transform(testing[categorical_features]) #impute missing values in categorical features
encoded_testing_cat_features = ohe.transform(testing[categorical_features]) #encode categorical features
encoded_testing_cat_df = pd.DataFrame(encoded_testing_cat_features, columns=ohe.get_feature_names_out(categorical_features)) #convert encoded features to dataframe

testing_numerical_df = pd.DataFrame(testing[numerical_features]).reset_index(drop=True) #sort index issues
encoded_testing_cat_df = encoded_testing_cat_df.reset_index(drop=True) #sort index issues

processed_testing_features = pd.concat([testing_numerical_df, encoded_testing_cat_df], axis=1) #concatenate numerical and categorical features

In [16]:
#use mlpc model as it had the highest accuracy

mlpc_test_pred = mlpc.predict(processed_testing_features) #predict mlpc on testing data
mlpc_test_pred = le.inverse_transform(mlpc_test_pred) #inverse transform encoded genre values

In [17]:
#print first 10 values to check genre predictions of model

print('MLPC Predictions: ', mlpc_test_pred[:10])

MLPC Predictions:  ['Classical' 'Hip-Hop' 'Blues' 'Jazz' 'Jazz' 'R&B' 'R&B' 'Pop' 'Opera'
 'Jazz']


In [18]:
#dataFrame with test-instance ids and pred-genre

test_mlpc_normal = pd.DataFrame(id)
test_mlpc_normal['genre'] = mlpc_test_pred

In [19]:
#output to csv

test_mlpc_normal.to_csv('output-data/test_mlpc_normal.csv', index=False)