In [88]:
import pandas as pd
from scipy.stats import skew
from scipy.stats import kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, confusion_matrix, classification_report


# Exploratory Data Analysis

I will load in the data for each genre and then look at it's *info* to check the structure, null values and data types.

In [91]:
alternative = pd.read_csv("/Users/viyankamoodley/Downloads/alternative.csv",keep_default_na=False,na_values=['?'])
blues = pd.read_csv("/Users/viyankamoodley/Downloads/blues.csv",keep_default_na=False,na_values=['?'])
classical = pd.read_csv("/Users/viyankamoodley/Downloads/classical.csv",keep_default_na=False,na_values=['?'])
comedy = pd.read_csv("/Users/viyankamoodley/Downloads/comedy.csv",keep_default_na=False,na_values=['?'])
folk = pd.read_csv("/Users/viyankamoodley/Downloads/folk.csv",keep_default_na=False,na_values=['?'])
hip_hop = pd.read_csv("/Users/viyankamoodley/Downloads/hip-hop.csv",keep_default_na=False,na_values=['?'])
jazz = pd.read_csv("/Users/viyankamoodley/Downloads/jazz.csv",keep_default_na=False,na_values=['?'])
opera = pd.read_csv("/Users/viyankamoodley/Downloads/opera.csv",keep_default_na=False,na_values=['?'])
pop = pd.read_csv("/Users/viyankamoodley/Downloads/pop.csv",keep_default_na=False,na_values=['?'])
rb = pd.read_csv("/Users/viyankamoodley/Downloads/rb.csv",keep_default_na=False,na_values=['?'])

In [92]:
music_df = pd.concat([alternative, blues, classical, comedy, folk, hip_hop, jazz, opera, pop, rb])
music_df.shape

(50000, 19)

In [95]:
music_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  int64  
 1   artist_name       50000 non-null  object 
 2   track_name        50000 non-null  object 
 3   track_id          50000 non-null  object 
 4   popularity        50000 non-null  int64  
 5   acousticness      50000 non-null  float64
 6   danceability      50000 non-null  float64
 7   duration_ms       50000 non-null  int64  
 8   energy            50000 non-null  float64
 9   instrumentalness  50000 non-null  float64
 10  key               50000 non-null  object 
 11  liveness          50000 non-null  float64
 12  loudness          50000 non-null  float64
 13  mode              50000 non-null  object 
 14  speechiness       50000 non-null  float64
 15  tempo             42499 non-null  float64
 16  time_signature    50000 non-null  object 
 17 

In [97]:
music_df.head()

Unnamed: 0,instance_id,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre
0,134768,Freddie Gibbs,Triple Threat,12NwYmQT1Mm7gkrCjIuq0d,45,0.0252,0.85,178422,0.412,0.0,D,0.0948,-7.845,Major,0.236,130.99,04-Apr,0.663,Alternative
1,97297,Three Days Grace,No More,4yDs61mzPcTod9qrBUE3H9,45,0.00261,0.271,225493,0.916,0.0,F#,0.156,-3.141,Minor,0.0581,197.817,04-Apr,0.286,Alternative
2,88412,Solange,Don't You Wait,0AmkrjMDff4ICVNxQhppZA,45,0.279,0.833,245907,0.579,0.692,C,0.0997,-6.568,Major,0.08,,04-Apr,0.514,Alternative
3,172572,Amy Grant,"Baby, Baby",25AmLrQC1b3Hz9FUGZXF1S,58,0.426,0.702,236973,0.901,0.00153,C#,0.0368,-4.45,Major,0.0264,97.862,04-Apr,0.919,Alternative
4,194415,empty_field,Livin Right,6KmnejQFYdWUVUJh6zwEce,46,0.00114,0.539,209760,0.915,0.0,C#,0.0675,-4.251,Minor,0.0601,88.051,04-Apr,0.435,Alternative


At a first glance, the data has some interesting obervations. I have noticed that time_signature is as 04-April, which is odd given that time signatures represent how many notes are within a bar(usually about four normal counts). I assume this is because time_signature has been accidentally formatted as a date, so 04-April should really be 04/04. 

In [100]:
music_df.describe()

Unnamed: 0,instance_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,42499.0,50000.0
mean,118922.89754,41.67002,0.474821,0.533403,196006.5,0.515805,0.149052,0.244182,-11.013448,0.166576,113.68117,0.419746
std,57198.25002,18.496416,0.376703,0.188077,156616.9,0.273039,0.300495,0.234041,6.706273,0.255489,30.622296,0.248419
min,20004.0,0.0,1e-06,0.0582,-1.0,0.000243,0.0,0.00967,-47.599,0.0226,34.151,0.0
25%,69247.75,29.0,0.0801,0.406,118792.5,0.296,0.0,0.0996,-14.15,0.0382,89.028,0.21
50%,118822.5,44.0,0.442,0.546,204433.5,0.543,6.6e-05,0.133,-8.906,0.0522,110.043,0.4
75%,168614.25,55.0,0.867,0.672,259333.0,0.735,0.0468,0.299,-6.161,0.141,134.2065,0.607
max,217850.0,99.0,0.996,0.98,5552917.0,0.998,0.994,1.0,3.744,0.965,239.848,0.986


In [102]:
print(music_df.isnull().sum())

instance_id            0
artist_name            0
track_name             0
track_id               0
popularity             0
acousticness           0
danceability           0
duration_ms            0
energy                 0
instrumentalness       0
key                    0
liveness               0
loudness               0
mode                   0
speechiness            0
tempo               7501
time_signature         0
valence                0
genre                  0
dtype: int64


In [104]:
print(music_df['time_signature'].unique())

['04-Apr' '03-Apr' '01-Apr' '05-Apr' '0/4']


In [50]:
!pip install lightgbm




# Preprocessing

In [112]:
%pip install lightgbm #using lightgbm because it is less time consuming than regular g
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# function to correct the time signature 
def correct_time_signature_format(ts):
    if isinstance(ts, str):
        parts = ts.split('-')
        if len(parts) == 2:
            ts_corrected = f"{int(parts[0])}/{parts[1].replace('Apr', '4')}"
            ts_corrected = ts_corrected.replace('03/4', '3/4').replace('04/4', '4/4').replace('05/4', '5/4').replace('01/4', '1/4')
            return ts_corrected
        return ts
    return ts

# applying the function to the training and test dataset
music_df['time_signature'] = music_df['time_signature'].apply(correct_time_signature_format)

X_test = pd.read_csv('/Users/viyankamoodley/Downloads/testing-instances.csv', keep_default_na=False, na_values=['?'])
X_test['time_signature'] = X_test['time_signature'].apply(correct_time_signature_format)

# handling nas in duration
music_df['duration_ms'] = music_df['duration_ms'].replace(-1, np.nan)
X_test['duration_ms'] = X_test['duration_ms'].replace(-1, np.nan)

# feature engineering 
music_df['acousticness_energy'] = music_df['acousticness'] * music_df['energy']
music_df['danceability_valence'] = music_df['danceability'] * music_df['valence']
X_test['acousticness_energy'] = X_test['acousticness'] * X_test['energy']
X_test['danceability_valence'] = X_test['danceability'] * X_test['valence']

# redefining num and cat features
numerical_features2 = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 
                      'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence', 
                      'acousticness_energy', 'danceability_valence']
categorical_features = ['key', 'mode', 'time_signature']

# using MinMaxScaler since there are several values that are not normally distrubuted
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), #chose median rather than mean
    ('scaler', MinMaxScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features2)
    ])

# splitting the data, using a 10% validation set
X = music_df.drop("genre", axis=1)
y = music_df["genre"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=309)

# applying the preprocessing after the split
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)


models = {
    'DecisionTree': DecisionTreeClassifier(random_state=309),
    'MLP': MLPClassifier(max_iter=200, random_state=309),  
    'LightGBM': LGBMClassifier(random_state=309),
    'RandomForest': RandomForestClassifier(random_state=309),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=309)
}

param_grids = {
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'MLP': {
        'hidden_layer_sizes': [(50,)],
        'activation': ['relu'],
        'solver': ['adam'],
        'alpha': [0.0001],
        'learning_rate': ['constant'],
        'max_iter': [200]
    },
    'LightGBM': {
        'n_estimators': [100, 150],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50]
    },
    'RandomForest': {
        'n_estimators': [100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'penalty': ['l2']
    }
}

best_models = {}

for model_name, model in models.items():
    print(f"Training and optimizing {model_name}...")
    grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grids[model_name], #had to change to randomised because grid was too computationally expenny
                                     n_iter=10,   
                                     cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=309), #started w 5 splits, but run time was very long, so moved to 3
                                     n_jobs=-1, scoring='f1_weighted', random_state=309)
    grid_search.fit(X_train_preprocessed, y_train)
    
    # getting the best model based on the randomised search
    best_models[model_name] = grid_search.best_estimator_
    
    # assessing the optimised model
    y_val_pred = grid_search.predict(X_val_preprocessed)
    
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred, average='weighted')
    
    print(f"{model_name} Validation Accuracy: {accuracy:.4f}")
    print(f"{model_name} Validation F1 Score: {f1:.4f}")
    print(f"{model_name} Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
    print(f"{model_name} Classification Report:\n{classification_report(y_val, y_val_pred)}\n")

# choosing the best model based on validation f1 score
best_model_name = max(best_models, key=lambda name: f1_score(
    y_val, best_models[name].predict(X_val_preprocessed), average='weighted')
)
best_model = best_models[best_model_name]

print(f"Best model selected: {best_model_name}")

# making predicitions on the test data
y_test_pred = best_model.predict(X_test_preprocessed)

# storing instance id and the predicted genre into a df for my submissions
submission_df = pd.DataFrame({
    'instance_id': X_test['instance_id'],
    'genre': y_test_pred
})

# saving the submission file
submission_df.to_csv('/Users/viyankamoodley/Desktop/submission.csv', index=False)

print("Model training and prediction completed. Submission file created as 'submission.csv'.")

Note: you may need to restart the kernel to use updated packages.
Training and optimizing DecisionTree...
DecisionTree Validation Accuracy: 0.6288
DecisionTree Validation F1 Score: 0.6288
DecisionTree Confusion Matrix:
[[222   8   7   1  83  37   9   0  50  90]
 [ 31 306  11   4  55   0  51   4   9  36]
 [ 10  26 334   1  16   0  21  42   0   2]
 [  1  12   0 483   0   0   2   4   0   2]
 [ 57   8   7   0 248   7  38   0  53  55]
 [ 16   0   0   0   1 288   3   0 117  68]
 [ 23  61  34   1 116  14 216   7  12  58]
 [  0   7  33   3   1   0   7 471   0   2]
 [ 23   2   1   0  25  54   1   0 369  18]
 [ 31  11   0   2  80  80  16   1  77 207]]
DecisionTree Classification Report:
              precision    recall  f1-score   support

 Alternative       0.54      0.44      0.48       507
       Blues       0.69      0.60      0.65       507
   Classical       0.78      0.74      0.76       452
      Comedy       0.98      0.96      0.97       504
        Folk       0.40      0.52      0.45



MLP Validation Accuracy: 0.6510
MLP Validation F1 Score: 0.6476
MLP Confusion Matrix:
[[228  13   3   0  93  44  12   0  49  65]
 [ 22 347  13   3  46   0  55   4   8   9]
 [ 10  26 347   0   8   0  17  42   1   1]
 [  0  21   0 473   0   0   5   5   0   0]
 [ 53  21  10   0 263   4  38   0  48  36]
 [ 11   0   0   0   3 361   2   0  89  27]
 [ 11  83  43   3  92  16 250   1   4  39]
 [  0   3  52   0   0   0   4 465   0   0]
 [ 22   1   1   1  25  93   2   0 340   8]
 [ 34  13   0   1  71 109  30   1  65 181]]
MLP Classification Report:
              precision    recall  f1-score   support

 Alternative       0.58      0.45      0.51       507
       Blues       0.66      0.68      0.67       507
   Classical       0.74      0.77      0.75       452
      Comedy       0.98      0.94      0.96       504
        Folk       0.44      0.56      0.49       473
     Hip-Hop       0.58      0.73      0.64       493
        Jazz       0.60      0.46      0.52       542
       Opera       0.90



LogisticRegression Validation Accuracy: 0.6142
LogisticRegression Validation F1 Score: 0.6122
LogisticRegression Confusion Matrix:
[[260  22   2   1  73  24  16   0  45  64]
 [ 27 330  24   4  41   0  66   4   7   4]
 [ 12  21 295   1  12   0  19  90   1   1]
 [  0  19   3 476   0   0   3   3   0   0]
 [ 67  24  12   0 245   1  50   0  39  35]
 [ 34   0   0   0   4 311   1   0  87  56]
 [ 18  93  41   5 104  13 224   9   1  34]
 [  0   7  81   2   1   0   2 431   0   0]
 [ 34   1   0   1  38  82   1   0 315  21]
 [ 70  22   1   3  75  64  25   1  60 184]]
LogisticRegression Classification Report:
              precision    recall  f1-score   support

 Alternative       0.50      0.51      0.51       507
       Blues       0.61      0.65      0.63       507
   Classical       0.64      0.65      0.65       452
      Comedy       0.97      0.94      0.95       504
        Folk       0.41      0.52      0.46       473
     Hip-Hop       0.63      0.63      0.63       493
        Jazz     

In [86]:
##### for my histograms
##import matplotlib.pyplot as plt
#import seaborn as sns

# naming num features
#numerical_features = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 
                     # 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# setting up the fig/axes for the histograms
#plt.figure(figsize=(15, 10))  # adjusting the figure size for visual purposes and better dispaly

# lopping through each feature and plotting a histogram
#for i, feature in enumerate(numerical_features, 1):
   # plt.subplot(3, 4, i)  # setting subplot grid size
    #sns.histplot(music_df[feature], bins=30, kde=True, color='skyblue')  # plotting  the density curve too
  #  plt.title(f'Histogram of {feature}')
   # plt.xlabel(feature)
   # plt.ylabel('Frequency')

#plt.tight_layout()  # adjusting the subplots to fit better
#plt.show()



###### code for my plots
#import matplotlib.pyplot as plt
#import seaborn as sns

# naming the num features to analyse
#numerical_features = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 
                  #    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# calculating the correlation matrix
#corr_matrix = music_df[numerical_features].corr()
#
# plotting the correlations on a heatmap
#plt.figure(figsize=(10, 8))
#sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
#plt.title('Correlation Matrix of Features')
#plt.show()
