In [28]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [61]:
# Load the music dataset from the Resources folder into a Pandas DataFrame
music_csv_path = Path("Resources/music.csv")
musicData = pd.read_csv(music_csv_path)

# Preview the first 20 rows.
# NOTE(review): the preview shows `duration_in min/ms` holding values such as
# 234596.0 alongside 3.105783, which suggests the column mixes milliseconds and
# minutes — confirm and normalise before modelling.
musicData.head(20)

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,,0.122,0.569,107.033,173968.0,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4,10
5,The Stooges,Search and Destroy - Iggy Pop Mix,53.0,0.235,0.977,6.0,0.878,1,0.107,0.00353,0.00604,0.172,0.241,152.952,208133.0,4,6
6,Solomon Burke,None Of Us Are Free,48.0,0.674,0.658,5.0,-9.647,0,0.104,0.404,1e-06,0.0981,0.677,143.292,329387.0,4,2
7,Randy Travis,On the Other Hand,55.0,0.657,0.415,5.0,-9.915,1,0.025,0.175,6e-06,0.132,0.347,96.03,3.105783,4,4
8,Professional Murder Music,Slow,29.0,0.431,0.776,10.0,-5.403,1,0.0527,2.2e-05,0.0013,0.179,0.318,120.857,237867.0,4,8
9,Dudu Aharon,"◊ì◊ï◊ì◊ï, ◊ô◊ê◊ú◊ú◊î ◊ô◊ê◊ú◊ú◊î",14.0,0.716,0.885,1.0,-4.348,0,0.0333,0.0614,,0.253,0.833,128.043,164093.0,4,9


In [30]:
# Remove the two identifier columns; they are not used as model features
id_columns = ['Artist Name', 'Track Name']
musicDF = musicData.drop(id_columns, axis=1)

# Alternative left unused: keep the two columns and cast them to `str`.

musicDF

Unnamed: 0,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.017100,,0.0849,0.8990,134.071,234596.0,4,5
1,54.0,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.0,4,10
2,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.0,4,6
3,66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.021200,,0.1220,0.5690,107.033,173968.0,4,5
4,53.0,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.0,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991,35.0,0.166,0.109,7.0,-17.100,0,0.0413,0.993000,0.824000,0.0984,0.1770,171.587,193450.0,3,6
17992,27.0,0.638,0.223,11.0,-10.174,0,0.0329,0.858000,0.000016,0.0705,0.3350,73.016,257067.0,4,2
17993,34.0,0.558,0.981,4.0,-4.683,0,0.0712,0.000030,0.000136,0.6660,0.2620,105.000,216222.0,4,8
17994,29.0,0.215,0.805,6.0,-12.757,0,0.1340,0.001290,0.916000,0.2560,0.3550,131.363,219693.0,4,8


In [4]:
# Tally the missing values column by column
missing_mask = musicDF.isnull()
nan_counts = missing_mask.sum(axis=0)
nan_counts

Popularity             428
danceability             0
energy                   0
key                   2014
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      4377
liveness                 0
valence                  0
tempo                    0
duration_in min/ms       0
time_signature           0
Class                    0
dtype: int64

In [5]:
# Flag every row that has a missing value in any column, then count the flags
row_has_nan = musicDF.isnull().any(axis="columns")
rows_with_nan = row_has_nan.sum()
rows_with_nan

6183

In [6]:
# Option 1: Drop the columns that contain NaNs (Popularity, key, instrumentalness) for now, since instrumentalness in particular has many; try to figure out how to populate the NaNs later.
# musicDF = musicDF.drop(columns=['Popularity', 'key', 'instrumentalness'])

In [7]:
# Show (rows, columns) of the feature frame before any NaN handling is applied
musicDF.shape

(17996, 15)

In [8]:
# Option2: drop samples with NaN values
#musicDF = musicDF.dropna()

In [11]:
# Option 3: impute missing values with zero.
# NOTE(review): zero may be a meaningful value for some of the affected columns
# (Popularity, key, instrumentalness) — confirm that a 0 fill is appropriate.
musicDF = musicDF.fillna(0)

In [14]:
# Show (rows, columns) again: filling NaNs should leave the shape unchanged
musicDF.shape

(17996, 15)

### Split Data Into Features and our Label (genre)

In [12]:
# Split the frame into the target (Class = genre) and the feature matrix

# y: the genre code of every track
y = musicDF["Class"].to_numpy()

# X: every remaining column, as a plain NumPy array
X = musicDF.drop(columns="Class").to_numpy()

In [13]:
# Import train_test_split from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Split into train and test sets.
# random_state=1 makes the split repeatable; stratify=y keeps the genre
# proportions the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

#### Scale Data

In [15]:
# Import the StandardScaler used to standardise the features
from sklearn.preprocessing import StandardScaler

# Create the scaler and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits
xTrainScaled, xTestScaled = (X_scaler.transform(split) for split in (X_train, X_test))

### First Try Random Forest Classifier to Create Model

In [70]:
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Build a 500-tree forest with a fixed seed and fit it on the scaled training
# features and the training labels (fit returns the fitted estimator itself)
rfModel = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
).fit(xTrainScaled, y_train)

In [71]:
# Predict genre codes for the scaled test features with the fitted random forest
predictedValues = rfModel.predict(xTestScaled)

#### Determine Accuracy

In [72]:
# import the modules for the confusion matrix, accuracy score, and classification report
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [73]:
# Fraction of test tracks whose predicted genre matches the true genre,
# reported as a percentage with two decimals
accuracyScore = accuracy_score(y_true=y_test, y_pred=predictedValues)
print(f"Accuracy Score: {(accuracyScore*100):.2f}%")

Accuracy Score: 51.26%


### Try a Logistic Regression Model with the Original Data

In [20]:
# Import LogisticRegression from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the logistic regression model with the lbfgs solver and a fixed seed
classifierModel = LogisticRegression(
    random_state=1,
    solver='lbfgs',
)
classifierModel

In [21]:
# Fit the model using training data
# NOTE(review): this uses the unscaled X_train, whereas the random forest and KNN
# models are fitted on xTrainScaled — presumably intentional ("Original Data"); confirm.
classifierModel.fit(X_train, y_train)

In [22]:
# Score the model on the training data - use .score() function to get the accuracy
model_score_train = classifierModel.score(X_train, y_train)
print("Model Score (Accuracy) - Trained Data:", model_score_train)

Model Score (Accuracy) - Trained Data: 0.2968066977846929


In [23]:
# Score the same model on the held-out test data, for comparison with the training score
model_score_test = classifierModel.score(X_test, y_test)
print("Model Score (Accuracy) - Test Data:", model_score_test)

Model Score (Accuracy) - Test Data: 0.29628806401422536


### Try K Nearest Neighbor (KNN) Classifier

In [24]:
# Count the distinct genre codes in the Class column
unique_genres = musicDF['Class'].nunique()
unique_genres

11

In [25]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a KNN classifier on the scaled training features.
# NOTE(review): n_neighbors=11 matches the number of distinct genres (11), but k is a
# tuning parameter that is independent of the class count — consider choosing it by
# cross-validation.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(xTrainScaled, y_train)


In [26]:
# Predict genre codes for the scaled test features with KNN.
# Note: this reuses the name predictedValues, replacing the random-forest predictions.
predictedValues = knn.predict(xTestScaled)

In [27]:
# calculate the accuracy score
accuracyScore = accuracy_score(y_test, predictedValues)
print(f"Accuracy Score: {(accuracyScore*100):.2f}%")

Accuracy Score: 46.48%


## GENRE PREDICTOR

In [75]:
from ipywidgets import interact, Dropdown

# Fill in NaNs with 0 so the lookup frame is imputed the same way as the training frame
musicData = musicData.fillna(0)

# Select the fitted model to use: rfModel or knn; this could be made a dropdown as well
modelType = rfModel

# Columns that are not model inputs: the two identifiers plus the target
non_feature_columns = ['Artist Name', 'Track Name', 'Class']

# Readable names for the integer genre codes stored in 'Class'.
# Built once here instead of on every dropdown event.
genre_label = {
    0: 'Acoustic/Folk_0',
    1: 'Alt_Music_1',
    2: 'Blues_2',
    3: 'Bollywood_3',
    4: 'Country_4',
    5: 'HipHop_5',
    6: 'Indie Alt_6',
    7: 'Instrumental_7',
    8: 'Metal_8',
    9: 'Pop_9',
    10: 'Rock_10'
}

# One dropdown entry per row. The label shows track and artist; the value is the row
# position, so tracks that share a title are still looked up individually (a
# title-only value would always resolve to the first matching row).
dropdown_options = [
    (f"{track} - {artist}", row_position)
    for row_position, (track, artist) in enumerate(
        zip(musicData['Track Name'], musicData['Artist Name'])
    )
]
dropdown = Dropdown(options=dropdown_options)


def on_dropdown_change(change):
    """Predict and print the genre of the track selected in the dropdown.

    Parameters
    ----------
    change
        Change notification passed by ``dropdown.observe``. Its ``new`` attribute
        holds the selected option value, i.e. a row position in ``musicData``.

    Prints the predicted and the actual genre of that row; returns nothing.

    NOTE(review): the dropdown lists every row of ``musicData``, including rows the
    model was fitted on, so agreement shown here is not an accuracy estimate.
    """
    selected_row = musicData.iloc[change.new]
    sample_track_name = selected_row['Track Name']
    sample_actual_genre = int(selected_row['Class'])

    # Select the features by column name rather than by position, keep the frame's
    # column order (the order the scaler was fitted with), and hand scikit-learn a
    # single-row 2-D float array instead of an object array.
    sample_data = (
        selected_row.drop(labels=non_feature_columns)
        .to_numpy(dtype=float)
        .reshape(1, -1)
    )
    sample_data_scaled = scaler.transform(sample_data)  # same scaling as the training data

    # Make the prediction and convert both codes to genre names
    prediction = modelType.predict(sample_data_scaled)
    predicted_genre = genre_label[prediction[0]]
    actual_genre = genre_label[sample_actual_genre]

    print(f"Predicted genre for the sample '{sample_track_name}': {predicted_genre}")
    print(f"Actual genre for the sample '{sample_track_name}': {actual_genre}")


# Register the function to handle the dropdown value change event
dropdown.observe(on_dropdown_change, names='value')

# Display the dropdown
display(dropdown)

Dropdown(options=(("That's What I Like (feat. Gucci Mane) - Bruno Mars", "That's What I Like (feat. Gucci Mane…

Predicted genre for the sample 'Meri Dosti Mera Pyar': Bollywood_3
Actual genre for the sample 'Meri Dosti Mera Pyar': Bollywood_3
