In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import sys
import os

# Specify the directory containing your module
module_dir = '/Users/gabrielvictorgomesferreira/artificial_intelligence/isu_classes/modules'

# Add the directory to sys.path if not already present
if module_dir not in sys.path:
    sys.path.append(module_dir)
    
from help_functions import *

In [2]:
data_loc = "/Users/gabrielvictorgomesferreira/Library/Mobile Documents/com~apple~CloudDocs/Work/ISU Classes/Data/"
file_name = "spotify_songs.csv"
models_loc = "../models/"

In [3]:
songs_df = pd.read_csv(data_loc + file_name)
rows, columns = songs_df.shape
print(f"The dataset contains {rows:,} rows and {columns} columns")
songs_df.head()

The dataset contains 32,833 rows and 23 columns


Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [6]:
from sklearn.preprocessing import LabelEncoder

genre_encoder = LabelEncoder()
songs_df['genre_label'] = genre_encoder.fit_transform(songs_df['playlist_genre'])
songs_df.head(1)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre_label
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,2


#### Select Numerical Features

In [7]:
numeric_columns = songs_df.select_dtypes(include=['number']).columns
print(numeric_columns)
print(f"There is {len(numeric_columns)} numerical features.")

Index(['track_popularity', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'genre_label'],
      dtype='object')
There is 14 numerical features.


### Train Model

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer, f1_score, recall_score

# Predictor Features
X = songs_df[numeric_columns].drop(columns=['genre_label'])
y = songs_df['genre_label']

# Sore model results
results = []

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Initialize the model
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logit_model.predict(X_test)

# Evaluate model
lg_f1_score = f1_score(y_test, y_pred, average='weighted')
lg_recall_score = recall_score(y_test, y_pred, average='weighted')

# Cross validation using 5-fold cross-validation
lg_f1_cv_score = cross_val_score(logit_model, X_train, y_train, cv=5, scoring='f1_weighted').mean()
lg_recall_cv_score = cross_val_score(logit_model, X_train, y_train, cv=5, scoring='recall_weighted').mean()

results.append({"test_size": 0.2,
        "lg_f1_score": lg_f1_score,
        "lg_recall_score": lg_recall_score,
        "lg_f1_cv_score": lg_f1_cv_score,
        "lg_recall_cv_score": lg_recall_cv_score})

# Transform results into DF
results_df = pd.DataFrame(results)

# Display
results_df.head()

Unnamed: 0,test_size,lg_f1_score,lg_recall_score,lg_f1_cv_score,lg_recall_cv_score
0,0.2,0.462092,0.469012,0.460621,0.467334


### Export Model

In [12]:
import pickle

# Define the file path
model_path = models_loc + "baseline_model.pkl"

# Save the model
with open(model_path, 'wb') as file:
    pickle.dump(logit_model, file)

In [13]:
with open(model_path, 'rb') as file:
    loaded_model = pickle.load(file)