## Plan
1. Obtain data from Kaggle and lyrics from Genius
2. Create Database and Class for songs
    - The database will store the data, which the class should retrieve when called.
3. Train Model on genre using audio features and lyrics
4. Load songs and genre into database
5. Create user interactions through Discord
    - Allow songs within database to get genre from there
    - Allow new songs to be run through model
    - Save new songs to database

In [1]:
# Import used libraries

import pandas as pd
from project_functions import *
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam 
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from collections import Counter
import sqlite3

In [2]:
# Read the two CSV exports, align their columns, and merge them into one
# de-duplicated table named `data`.
spotify_2000 = pd.read_csv('data/Spotify-2000.csv', index_col=0)
spotify_tracks = pd.read_csv('data/Spotify-Tracks.csv', index_col=0)

# Keep only the Spotify-2000 rows whose title does not occur in Spotify-Tracks,
# then discard the two columns that are not carried into the merged table.
# NOTE(review): the match is on 'Title' alone, so a song by a different artist
# that merely shares a title is dropped as well -- confirm this is intended.
title_already_present = spotify_2000['Title'].isin(spotify_tracks['Title'])
spotify_2000 = (
    spotify_2000
    .loc[~title_already_present]
    .drop(columns=['Year', 'Beats Per Minute (BPM)'])
)

# Restrict Spotify-Tracks to the remaining Spotify-2000 columns (same order)
# so the two frames line up when stacked.
spotify_tracks = spotify_tracks.loc[:, spotify_2000.columns]

# Spotify-Tracks rows come first, so keep='first' prefers them whenever the
# same (Title, Artist) pair appears more than once.
data = (
    pd.concat([spotify_tracks, spotify_2000], ignore_index=True)
    .drop_duplicates(subset=['Title', 'Artist'], keep='first')
)

In [3]:
# Map niche genre labels onto broader ones for training. filter_genres comes
# from project_functions; the lines below rely on it returning a frame that
# has both a 'Genre' and a 'Top Genre' column.
data = filter_genres(data)

# Save the relabelled table.
data.to_csv('data/All-Songs.csv')

# Save how many songs fall under each general genre and each original genre.
general_genre_tally = data['Genre'].value_counts()
general_genre_tally.to_csv('data/Genre-Counts.csv')

original_genre_tally = data['Top Genre'].value_counts()
original_genre_tally.to_csv('data/Original-Genres.csv')

### See file `project_functions.py` for this step!!
    - Songs filtered to only English songs
    - Lyrics retrieved for each song

In [4]:
# Connect to database and store data
# `data` is re-read from 'data/English-Songs.csv' here, replacing the frame
# built in the earlier cells (presumably the English-only output of
# project_functions.py -- confirm against that file).
data = pd.read_csv('data/English-Songs.csv', index_col=0)
# Open the SQLite database file and get a cursor for raw SQL statements.
# NOTE(review): `con` is not closed in this cell; confirm a later cell closes it.
con = sqlite3.connect('data/spotify.db')
cursor = con.cursor()
# Drop any existing 'songs' table first so the append below always writes into
# a fresh table instead of adding rows to one left over from a previous run.
cursor.execute('DROP TABLE IF EXISTS songs')
# One table row per DataFrame row; the DataFrame index is not written.
data.to_sql('songs', con, if_exists='append', index=False)
con.commit()

In [5]:
# Build one Song object per dataset row.
# Maps each Song keyword argument to the dataset column that supplies it.
song_field_columns = {
    'name': 'Title',
    'artist': 'Artist',
    'energy': 'Energy',
    'danceability': 'Danceability',
    'loudness': 'Loudness (dB)',
    'liveness': 'Liveness',
    'valence': 'Valence',
    'acousticness': 'Acousticness',
    'speechiness': 'Speechiness',
    'popularity': 'Popularity',
}

songs = [
    Song(**{field: row[column] for field, column in song_field_columns.items()})
    for _, row in data.iterrows()
]

In [6]:
# Build the model inputs and target. Every column except the identifying text
# fields, the two genre columns and the track length is used as a feature;
# the general 'Genre' label is the prediction target.
X = data.drop(columns=['Artist', 'Top Genre', 'Genre', 'Title', 'Length (Duration)'])
y = data['Genre']

# Genres that occur only once are folded into a shared 'Other' label: a class
# with a single member cannot be represented in both halves of a stratified split.
genre_counts = y.value_counts()
rare_genres = genre_counts[genre_counts <= 1].index
y = y.where(~y.isin(rare_genres), 'Other')

# Columns that were read as text are converted to float after removing commas
# (presumably thousands separators such as "1,234" -- confirm against the CSV).
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].str.replace(',', '').astype(float)

# Encode genre names as integer class ids; label_encoder keeps the mapping so
# predictions can be turned back into names with inverse_transform.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 80/20 split with a fixed seed, stratified so both parts keep the same genre mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The scaler is fitted on the training rows only and then applied to both
# parts, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: class counts before and after the split.
print("Original distribution:", Counter(y))
print("Training distribution:", Counter(y_train))
print("Testing distribution:", Counter(y_test))





Original distribution: Counter({3: 3549, 9: 2261, 8: 1943, 2: 1645, 7: 1589, 0: 1511, 1: 1129, 5: 554, 4: 209, 6: 176})
Training distribution: Counter({3: 2839, 9: 1809, 8: 1554, 2: 1316, 7: 1271, 0: 1209, 1: 903, 5: 443, 4: 167, 6: 141})
Testing distribution: Counter({3: 710, 9: 452, 8: 389, 2: 329, 7: 318, 0: 302, 1: 226, 5: 111, 4: 42, 6: 35})


In [7]:
# Candidate XGBoost settings: three values for each of three parameters
# (3 x 3 x 3 = 27 combinations), with both subsampling ratios pinned at 0.8.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 8],
    'learning_rate': [0.01, 0.2, 0.3],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
}

# Base classifier that the search tunes.
xgb = XGBClassifier(random_state=42)

# Exhaustive search over the grid, scored by 3-fold cross-validated accuracy
# on the training data and run on all available cores.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Score the selected model on the held-out test set.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best Accuracy: 0.624785444558874
Test Accuracy: 0.6108442004118051


In [109]:
# Seed TensorFlow's global random generator before any layer is created, so the
# network starts from the same random state on every Restart & Run All. This
# matches the fixed random_state=42 used for the data split and for XGBoost.
# NOTE(review): exact repeatability can still vary with hardware and TF version.
tf.random.set_seed(42)

# Define the model: a fully connected classifier with four hidden ReLU layers
# (256 -> 128 -> 64 -> 32), 30% dropout after each of the first three, and a
# softmax output with one unit per distinct label in y.
num_features = X_train.shape[1]
num_genres = len(np.unique(y))

model = Sequential([
    Dense(256, activation='relu', input_shape=(num_features,)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(num_genres, activation='softmax')
])

# Compile the model. The sparse categorical loss is used because y holds
# integer class ids (LabelEncoder output) rather than one-hot vectors.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()



Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_41 (Dense)            (None, 256)               2304      
                                                                 
 dropout_10 (Dropout)        (None, 256)               0         
                                                                 
 dense_42 (Dense)            (None, 128)               32896     
                                                                 
 dropout_11 (Dropout)        (None, 128)               0         
                                                                 
 dense_43 (Dense)            (None, 64)                8256      
                                                                 
 dropout_12 (Dropout)        (None, 64)                0         
                                                                 
 dense_44 (Dense)            (None, 32)              

In [110]:
# Stop training once validation loss has gone 5 epochs without improving, and
# roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 50 epochs in batches of 32, holding out 20% of the
# training rows for validation.
fit_options = dict(
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

model.fit(X_train, y_train, **fit_options)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50


<keras.callbacks.History at 0x36980a950>

In [112]:
# Score the trained network on the held-out test set. With one compiled metric,
# evaluate returns the loss followed by the accuracy.
# NOTE(review): `test_accuracy` overwrites the XGBoost score stored under the
# same name by the grid-search cell.
evaluation = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = evaluation
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.44
