In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact directory layout exists. Consider a configurable data dir.
CLEANED_CSV_PATH = '/Users/josephlim/Desktop/Data Science/Capstone Projects/Capstone project- Spotify/Data/Cleaned Data/US_1921_2021_normalized.csv'
df = pd.read_csv(CLEANED_CSV_PATH)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,popularity,duration_ms,year,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.94,126903,1922,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1.0,98200,1922,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1
2,1.0,181640,1922,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,1.0,176907,1922,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,1.0,163080,1922,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4


We won't need the `year` column moving forward, so we drop it.

In [4]:
# Copy of the loaded frame without the `year` column; `df` itself is left untouched.
df_noyr = df.drop(columns=['year'])

In [5]:
# Confirm that `year` is gone.
df_noyr.head(n=5)

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.94,126903,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1.0,98200,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1
2,1.0,181640,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,1.0,176907,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,1.0,163080,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4


It's usually standard to encode categorical variables into dummy variables, using pd.get_dummies() or OneHotEncoder, before fitting a model on them. However, doing so for the different keys would introduce twelve more features, which is not very efficient. While the random forest model, which I plan to use later, can handle categorical variables, the linear regression model cannot. As such, we will encode keys into the numbers 0-11, and minor and major into 0 and 1, respectively. Note that this integer encoding imposes an artificial ordering on the keys, which a linear model will treat as a numeric magnitude.

In [6]:
# Musical keys in chromatic order; a key's position in this list is its integer code
# (C -> 0, ..., B -> 11).
# NOTE(review): the labels mix sharps ('C#', 'D#', 'F#') and flats ('Ab', 'Bb').
# `replace` leaves any value that is not listed here unchanged, so confirm the CSV
# uses exactly these spellings.
keys = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'Ab', 'A', 'Bb', 'B']
key_dict = {name: code for code, name in enumerate(keys)}
mode_dict = {'minor': 0, 'major': 1}

# Scope each mapping to its own column instead of applying one merged dict to the
# whole frame, so a matching string in any other column can never be rewritten.
# `infer_objects()` makes sure the encoded columns end up with a numeric dtype even
# when `replace` itself does not downcast the former string columns.
df = (
    df_noyr
    .replace({'key': key_dict, 'm_mode': mode_dict})
    .infer_objects()
)

In [7]:
# Check that `key` and `m_mode` now hold integer codes.
df.head(n=5)

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.94,126903,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1.0,98200,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,1.0,181640,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,1.0,176907,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,1.0,163080,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


We have a lot of features to work with. Having too many features can lead to overfitting. Let's try to reduce the number of features.

#### Correlation
Let's first try to remove features least correlated to popularity.

In [8]:
# Correlation of every column with popularity, taken from the full correlation
# matrix, then ranked by absolute value (strongest first).
popularity_corr = df.corr()['popularity']
df_corr = popularity_corr.abs().sort_values(ascending=False)
df_corr

popularity          1.000000
acousticness        0.370882
loudness            0.327028
energy              0.302315
instrumentalness    0.236487
danceability        0.187000
time_signature      0.086759
tempo               0.071364
liveness            0.048740
speechiness         0.047357
m_mode              0.033655
duration_ms         0.027681
key                 0.015299
valence             0.004643
Name: popularity, dtype: float64

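Key and musical mode are among the features least correlated with popularity, so let's drop them. Note, however, that according to the table above valence (0.005) and duration_ms (0.028) are even less correlated with popularity than musical mode (0.034). Energy is also dropped below even though it is the third most correlated feature (0.302), so its removal should be justified on other grounds (e.g. overlap with loudness) or reconsidered.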

In [9]:
# NOTE(review): in the correlation ranking above, energy (0.302) is the third MOST
# correlated feature, while valence (0.005) and duration_ms (0.028) rank below
# m_mode (0.034). Confirm that this list is the intended set of columns to drop.
least_corr = ['key', 'm_mode', 'energy']
df = df.drop(columns=least_corr)

# Every remaining column except the target is a candidate predictor.
features = df.columns.drop('popularity')

#### Checking for Multi-Collinearity
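We will use the Variance Inflation Factor (VIF) and tolerance to evaluate multicollinearity among our features. Each feature $j$ is regressed on all the other features, giving $R_j^2$; its tolerance is $1 - R_j^2$ and $\mathrm{VIF}_j = 1 / (1 - R_j^2)$. A VIF above 4 (equivalently, a tolerance below 0.25) indicates possible multicollinearity.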

In [10]:
def calculate_vif(df, features):
    """Compute the Variance Inflation Factor (VIF) and tolerance of each feature.

    Every feature is regressed on all the other entries of ``features`` with
    ``LinearRegression`` and scored on the same data. With ``r2`` the score of
    that fit, ``tolerance = 1 - r2`` and ``VIF = 1 / tolerance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features``.
    features : iterable of str
        Column labels to evaluate. Any iterable is accepted; it is
        materialised once because it has to be traversed repeatedly.

    Returns
    -------
    pandas.DataFrame
        One row per feature, with the columns ``'VIF'`` and ``'Tolerance'``.
        A feature that the others explain perfectly (``r2 == 1``) gets a
        tolerance of 0 and a VIF of ``inf``. A feature with no other
        predictors to be regressed on gets a tolerance and a VIF of 1.
    """
    features = list(features)
    vif, tolerance = {}, {}

    for feature in features:
        # Regress this feature on all of the remaining ones.
        predictors = [name for name in features if name != feature]

        if predictors:
            X, y = df[predictors], df[feature]
            r2 = LinearRegression().fit(X, y).score(X, y)
        else:
            # Nothing to regress on: no variance is explained by other features.
            r2 = 0.0

        tol = 1.0 - r2
        if tol < 0:
            # Guard against a score that overshoots 1 by floating-point error.
            tol = 0.0

        tolerance[feature] = tol
        # Perfect collinearity: report an infinite VIF instead of dividing by zero.
        vif[feature] = float("inf") if tol == 0 else 1.0 / tol

    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})

In [11]:
# VIF / tolerance table for the predictors that survived the correlation filter.
calculate_vif(df=df, features=features)

Unnamed: 0,VIF,Tolerance
duration_ms,1.055708,0.947232
danceability,1.655435,0.604071
loudness,1.62874,0.613971
speechiness,1.226436,0.815371
acousticness,1.446321,0.69141
instrumentalness,1.175262,0.850874
liveness,1.092801,0.91508
valence,1.50145,0.666023
tempo,1.094314,0.913814
time_signature,1.066448,0.937692


In [12]:
# Persist the reduced frame (without the index) to the current working directory.
PREPROCESSED_CSV_PATH = 'spotify_data_preprocessed.csv'
df.to_csv(PREPROCESSED_CSV_PATH, index=False)