In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.utils import resample
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


from sklearn.metrics import confusion_matrix

# One seed for the whole notebook; it is fed to NumPy's legacy global generator.
random_seed_number = 42
np.random.seed(seed=random_seed_number)

In [2]:
# Location of the cleaned dataset. The original absolute path remains the default, so
# the notebook behaves exactly as before; set the SPOTIFY_DATA_PATH environment
# variable to run it on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'SPOTIFY_DATA_PATH',
    '/Users/josephlim/Desktop/Data Science/Capstone Projects/Capstone project- Spotify/Data/Cleaned Data/US_1921_2021_normalized',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the frame that was just loaded.
df.head()

Unnamed: 0,popularity,duration_ms,year,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.94,126903,1922,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1.0,98200,1922,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1
2,1.0,181640,1922,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,1.0,176907,1922,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,1.0,163080,1922,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4


As we saw in the previous stage, narrowing down to the past ten years gave more insight into the data, so we'll focus on data from the past ten years.

In [4]:
# Keep only the ten most recent years present in the data.
# unique() returns years in order of first appearance, so sort them and slice from
# the end; this does not depend on row order or on how many distinct years exist.
years = np.sort(df['year'].unique())
last_ten = years[-10:]
df_10 = df.loc[df['year'].isin(last_ten)]

In [5]:
# (rows, columns) remaining after the year filter.
df_10.shape

(92194, 15)

We also won't need the `year` column moving forward, so we drop it.

In [6]:
# Same rows as df_10, minus the 'year' column.
df_noyr = df_10.drop(columns='year')

In [7]:
# Preview the frame now that 'year' has been dropped.
df_noyr.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
73438,0.14,266773,0.429,0.661,B,-7.227,major,0.0281,0.00239,0.000121,0.234,0.285,173.372,4
73439,0.33,201960,0.659,0.857,B,-5.85,major,0.0437,0.0045,2e-06,0.335,0.798,106.965,4
73440,0.16,216880,0.556,0.864,D#,-5.87,minor,0.0584,0.00958,0.0,0.209,0.4,105.143,4
73442,0.16,284200,0.949,0.661,F,-4.244,minor,0.0572,0.0302,0.0,0.0454,0.76,104.504,4
73443,0.63,201960,0.659,0.857,B,-5.85,major,0.0437,0.0045,2e-06,0.335,0.798,106.965,4


It's standard practice to encode categorical variables as dummy variables, using `pd.get_dummies()` or `OneHotEncoder`, before fitting a model. However, doing so for the different keys would introduce twelve more features, which is not very efficient. While the random forest model, which I plan to use later, can handle categorical variables, the linear regression model cannot. As such, we will encode the keys as the numbers 0-11, and minor and major as 0 and 1, respectively.

In [8]:
# Pitch classes in chromatic order; a key's position in this list is its numeric code.
keys = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'Ab', 'A', 'Bb', 'B']
numbers = list(range(12))
key_to_number = dict(zip(keys, numbers))
mode_to_number = {'minor': 0, 'major': 1}

# Combined lookup, defined as before in case other cells reference it.
key_dict = {**key_to_number, **mode_to_number}

# Encode each categorical column with its own mapping. A flat-dict DataFrame.replace
# would scan every column and rely on implicit object->int downcasting, which newer
# pandas versions deprecate. With Series.map, a label missing from the mapping
# becomes NaN instead of staying a string.
# NOTE(review): this rebinds `df`, which until now referred to the full loaded frame.
df = df_noyr.assign(
    key=df_noyr['key'].map(key_to_number),
    m_mode=df_noyr['m_mode'].map(mode_to_number),
)

In [9]:
# Check that 'key' and 'm_mode' now hold numeric codes.
df.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
73438,0.14,266773,0.429,0.661,11,-7.227,1,0.0281,0.00239,0.000121,0.234,0.285,173.372,4
73439,0.33,201960,0.659,0.857,11,-5.85,1,0.0437,0.0045,2e-06,0.335,0.798,106.965,4
73440,0.16,216880,0.556,0.864,3,-5.87,0,0.0584,0.00958,0.0,0.209,0.4,105.143,4
73442,0.16,284200,0.949,0.661,5,-4.244,0,0.0572,0.0302,0.0,0.0454,0.76,104.504,4
73443,0.63,201960,0.659,0.857,11,-5.85,1,0.0437,0.0045,2e-06,0.335,0.798,106.965,4


We have a lot of features to work with, and having too many features can lead to overfitting. Let's try to reduce the number of features.

#### Correlation
Let's first try to remove the features least correlated with popularity.

In [10]:
# Absolute correlation of every column with popularity, strongest first.
df_corr = (
    df.corr()
      .loc[:, 'popularity']
      .abs()
      .sort_values(ascending=False)
)
df_corr

popularity          1.000000
instrumentalness    0.187363
loudness            0.128224
danceability        0.108979
liveness            0.074906
duration_ms         0.046728
speechiness         0.023304
acousticness        0.020443
time_signature      0.019853
valence             0.013380
tempo               0.012789
energy              0.001682
key                 0.000797
m_mode              0.000708
Name: popularity, dtype: float64

Energy, key, and musical mode are the three features least correlated with popularity. Let's drop them.

In [11]:
# The three columns with the weakest absolute correlation to popularity.
least_corr = ['key', 'm_mode', 'energy']
df = df.drop(columns=least_corr)

# Candidate predictors: every remaining column except the target.
features = df.columns.drop('popularity')

#### Checking for Multi-Collinearity
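We will use the Variance Inflation Factor (VIF) and tolerance to evaluate multi-collinearity among our features. For each feature, tolerance is $1 - R^2$, where $R^2$ comes from regressing that feature on all the other features, and $\text{VIF} = 1 / \text{tolerance}$. A VIF above 4 or a tolerance below 0.25 indicates possible multi-collinearity.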

In [12]:
def calculate_vif(df, features):
    """Compute the Variance Inflation Factor (VIF) and tolerance of each feature.

    Each feature is regressed on all the other listed features with an ordinary
    linear regression. With R^2 the score of that fit on the same data,
    ``tolerance = 1 - R^2`` and ``VIF = 1 / tolerance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns.
    features : sequence of column labels
        Columns of ``df`` to evaluate. It is iterated more than once, so it must
        be a re-iterable collection (list, Index, ...), not a generator.

    Returns
    -------
    pandas.DataFrame
        One row per feature, with columns ``'VIF'`` and ``'Tolerance'``.
        A feature that is perfectly explained by the others gets a tolerance of
        0.0 and a VIF of ``inf``. A feature with no other features to regress on
        gets a tolerance and VIF of 1.0.
    """
    vif, tolerance = {}, {}
    for feature in features:
        others = [f for f in features if f != feature]

        if others:
            # Regress the current feature on every other feature.
            predictors, target = df[others], df[feature]
            r2 = LinearRegression().fit(predictors, target).score(predictors, target)
        else:
            # Nothing to regress on: no variance is explained, and a fit on
            # zero columns would raise.
            r2 = 0.0

        tol = 1 - r2
        if tol <= 0:
            # Perfect collinearity (R^2 reached 1, possibly overshooting through
            # rounding): report it explicitly instead of dividing by zero or
            # returning a negative VIF.
            tolerance[feature] = 0.0
            vif[feature] = float('inf')
        else:
            tolerance[feature] = tol
            vif[feature] = 1 / tol

    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})

In [13]:
# VIF and tolerance for every candidate predictor.
calculate_vif(df, features)

Unnamed: 0,VIF,Tolerance
duration_ms,1.024239,0.976335
danceability,1.53197,0.652754
loudness,1.630868,0.613171
speechiness,1.069225,0.935257
acousticness,1.416809,0.705811
instrumentalness,1.180669,0.846977
liveness,1.053575,0.94915
valence,1.461079,0.684426
tempo,1.075929,0.92943
time_signature,1.051632,0.950903


On the bright side, every VIF is well below 4, so there is no sign of multi-collinearity. On the downside, that means we can't remove any features on that basis. Let's export our data for use in the modeling stage. We'll split it into training and testing sets before modeling.

In [15]:
# Save the reduced frame (remaining features plus popularity) as a CSV at a path relative
# to the working directory; index=False keeps the row index out of the file.
df.to_csv('spotify_data_preprocessed.csv',index=False)