# Modeling- Part 2 (Classification)

In [1]:
import os

import pandas as pd
import numpy as np

import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, cross_val_score, RandomizedSearchCV, validation_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from bayes_opt import BayesianOptimization
import lightgbm as lgb

from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from sklearn.utils import resample, shuffle
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


from sklearn.metrics import confusion_matrix
random_state=42

In [2]:
# To suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
df= pd.read_csv('/Users/josephlim/Desktop/Data Science/Capstone Projects/Capstone project- Spotify/spotify_data_preprocessed_exp.csv')

In [4]:
df.shape

(587927, 12)

In [5]:
df.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.06,126903,0.645,0.445,-13.338,0.451,0.674,0.744,0.151,0.127,104.851,3
1,0.0,98200,0.695,0.263,-22.136,0.957,0.797,0.0,0.148,0.655,102.009,1
2,0.0,181640,0.434,0.177,-21.18,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,0.0,176907,0.321,0.0946,-27.961,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,0.0,163080,0.402,0.158,-16.9,0.039,0.989,0.13,0.311,0.196,103.22,4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587927 entries, 0 to 587926
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        587927 non-null  float64
 1   duration_ms       587927 non-null  int64  
 2   danceability      587927 non-null  float64
 3   energy            587927 non-null  float64
 4   loudness          587927 non-null  float64
 5   speechiness       587927 non-null  float64
 6   acousticness      587927 non-null  float64
 7   instrumentalness  587927 non-null  float64
 8   liveness          587927 non-null  float64
 9   valence           587927 non-null  float64
 10  tempo             587927 non-null  float64
 11  time_signature    587927 non-null  int64  
dtypes: float64(10), int64(2)
memory usage: 53.8 MB


## Classifications
We'll try to predict songs' popularities by categorizing popularities(into "high","mid","low"), and classifying songs into those categories.

In [7]:
pd.cut(df['popularity'], bins=3)

0         (-0.001, 0.333]
1         (-0.001, 0.333]
2         (-0.001, 0.333]
3         (-0.001, 0.333]
4         (-0.001, 0.333]
               ...       
587922    (-0.001, 0.333]
587923    (-0.001, 0.333]
587924    (-0.001, 0.333]
587925    (-0.001, 0.333]
587926    (-0.001, 0.333]
Name: popularity, Length: 587927, dtype: category
Categories (3, interval[float64, right]): [(-0.001, 0.333] < (0.333, 0.667] < (0.667, 1.0]]

In [8]:
labels=['low','medium', 'high']
df['popularity']= pd.cut(df['popularity'], bins=3, labels=labels, right=True)

In [9]:
df.popularity.value_counts()

low       363052
medium    213090
high       11785
Name: popularity, dtype: int64

The dataset is unbalanced. This is intuitive, because there aren't as many popular songs as there are non-popular songs (otherwise, there will be much more financial stability in music industry!). However,imbalance in dataset will tamper with the accuracy of our model. One way to counteract this is by upsampling songs with high popularity.  We will then perform K-Nearest Neighbor classification, because they are good at handling noisy data. 

### Up-sampling songs

In [10]:
df_high= df[df.popularity=='high']
df_mid= df[df.popularity=='medium']
df_low= df[df.popularity=='low']

In [11]:
# df_low_upsampled= resample(df_low, replace=True, n_samples= 362633, random_state=42)
df_mid_upsampled= resample(df_mid, replace=True, n_samples= 363052, random_state=42)
df_high_upsampled= resample(df_high, replace=True, n_samples= 363052, random_state=42)

# list_df_upsampled_tomid=[df_high_upsampled, df_mid, df_low_upsampled]
list_df_upsampled_tohigh=[df_high_upsampled, df_mid_upsampled, df_low]

# df_resampled= pd.concat(list_df_upsampled_tomid)
df_resampled= pd.concat(list_df_upsampled_tohigh)

In [12]:
df_resampled.popularity.value_counts()

low       363052
medium    363052
high      363052
Name: popularity, dtype: int64

In [13]:
X_re= df_resampled.drop('popularity', axis=1)
y_re= df_resampled['popularity']

In [14]:
X_train, X_test, y_train, y_test= train_test_split(X_re, y_re, random_state= 42, test_size=0.3)

### Baseline Model- Random Prediction 
<br> We will randomly pull samples to serve as a baseline model to compare against different models we train. 

In [15]:
y_test_rand= shuffle(y_test)

In [16]:
print(classification_report(y_test, y_test_rand))

              precision    recall  f1-score   support

        high       0.33      0.33      0.33    108764
         low       0.33      0.33      0.33    108706
      medium       0.34      0.34      0.34    109277

    accuracy                           0.33    326747
   macro avg       0.33      0.33      0.33    326747
weighted avg       0.33      0.33      0.33    326747



### Testing Different Classification Models

### K-Nearest Neighbors (KNN) Classifier

In [17]:
KNN= KNeighborsClassifier()

KNN.fit(X_train, y_train)

y_pred_classification= KNN.predict(X_test)
print(classification_report(y_test, y_pred_classification))

              precision    recall  f1-score   support

        high       0.88      1.00      0.94    108764
         low       0.66      0.54      0.60    108706
      medium       0.63      0.65      0.64    109277

    accuracy                           0.73    326747
   macro avg       0.72      0.73      0.72    326747
weighted avg       0.72      0.73      0.72    326747



Because KNN is not a tree-based algorithm, it requires standardization. We will put it as part of our function.

In [18]:
def fit_model(n_neighbors):
    StandardScaler()
    n_neighbors= round(n_neighbors)

    regressor= KNeighborsClassifier(n_neighbors= n_neighbors)

    return np.mean(cross_validate(regressor, X_train, y_train, scoring='accuracy', error_score= 'raise', cv=5)['test_score'])

In [None]:
KNN_BO = BayesianOptimization(fit_model,{
        'n_neighbors': (1,50)
    })

KNN_BO.maximize(n_iter=10, init_points=2, allow_duplicate_points=True)

|   iter    |  target   | n_neig... |
-------------------------------------


Passing acquisition function parameters or gaussian process parameters to maximize
is no longer supported, and will cause an error in future releases. Instead,
please use the "set_gp_params" method to set the gp params, and pass an instance
 of bayes_opt.util.UtilityFunction using the acquisition_function argument

  KNN_BO.maximize(n_iter=10, init_points=2, allow_duplicate_points=True)


| [0m1        [0m | [0m0.5566   [0m | [0m34.23    [0m |
| [0m2        [0m | [0m0.5209   [0m | [0m49.22    [0m |


In [None]:
print(KNN_BO.max)

In [None]:
n_neighbors= KNN_BO.max['params']['n_neighbors']

In [None]:
model = KNeighborsClassifier(n_neighbors= round(n_neighbors))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))

It's odd that our test performed better than our hyperparameter tuning. We will try n_neighbor with the next best accuracy: 1.565.

In [None]:
model = KNeighborsClassifier(n_neighbors= round(1.565))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))

WOW! This is a huge improvement (though it probably is overfitting). Let's try Random Forest Classification.

### Random Forest Classifier

In [None]:
RFC= RandomForestClassifier(random_state= random_state)

RFC.fit(X_train, y_train)
y_rfc_pred= RFC.predict(X_test)
print(classification_report(y_rfc_pred, y_test))

Clearly, Random Forest Classifier performs best for our case. Let's use this model. Before fully delving into this model though, let's check for feature importance to see which features are more relevant for our task.

#### Random Forest Classification- Bayesian Optimization

In [None]:
def fit_model(n_estimators, min_samples_split, max_depth, max_leaf_nodes):
    n_estimators= round(n_estimators)
    min_samples_split= round(min_samples_split) 
    max_depth= round(max_depth)
    max_leaf_nodes= round(max_leaf_nodes)

    regressor= RandomForestClassifier(n_estimators= n_estimators, 
                                 min_samples_split= min_samples_split,
                                 max_depth=max_depth,
                                 random_state=random_state)

    return np.mean(cross_validate(regressor, X_train, y_train, scoring='accuracy', error_score= 'raise', cv=5)['test_score'])

In [None]:
rf_BO = BayesianOptimization(fit_model,{
        'n_estimators': (1,1000),
        'min_samples_split':(1.5,100), 
        'max_depth': (1,10),
        'max_leaf_nodes': (2,10)
    })

rf_BO.maximize(n_iter=10, init_points=2)

In [None]:
print(rf_BO.max)

Let's set optimized parameteres into variables.

In [None]:
max_depth= rf_BO.max['params']['max_depth']
max_leaf_nodes= rf_BO.max['params']['max_leaf_nodes']
min_samples_split= rf_BO.max['params']['min_samples_split']
n_estimators= rf_BO.max['params']['n_estimators']

In [None]:
model = RandomForestClassifier(max_depth= round(max_depth),
    max_leaf_nodes= round(max_leaf_nodes),
    min_samples_split= round(min_samples_split),
    n_estimators=round(n_estimators))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))

Through hyperparameter tuning, we learned that random forest classification model can expect target score of 0.66. This score is reasonable, as the model is expected to perform better on training set. However, our current target score is still great. 

Clearly, KNN classifier performs better, with accuracy score of 0.82. Although we have a model performance of 0.82, it is possible that the model is overfitting. However, given different iterations of the model, you can expect it to reliably predict anywhere between 60 to 70% accuracy. We have come a long way from the baseline model of 0.33 accuracy.