The goal of this notebook is to start talking about feature selection for models where the features and the target are both numerical.

The original dataset can be found here: https://www.kaggle.com/datasets/paakhim10/taylor-swift-the-myth-the-legend?select=taylorswift-Features.csv

In [20]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle

df = pd.read_csv("taylorswift-Features.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,album_id,album_name,id,track_name,danceability,swiftiness,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,1o59UpKw81iHR0HPiSkJR0,1989 (Taylor's Version) [Deluxe],4WUepByoeqcedHoYhSNHRt,Welcome To New York (Taylor's Version),0.757,100,0.61,7,-4.84,1,0.0327,0.00942,3.7e-05,0.367,0.685,116.998
1,1,1o59UpKw81iHR0HPiSkJR0,1989 (Taylor's Version) [Deluxe],0108kcWLnn2HlH2kedi1gn,Blank Space (Taylor's Version),0.733,100,0.733,0,-5.376,1,0.067,0.0885,0.0,0.168,0.701,96.057
2,2,1o59UpKw81iHR0HPiSkJR0,1989 (Taylor's Version) [Deluxe],3Vpk1hfMAQme8VJ0SNRSkd,Style (Taylor's Version),0.511,100,0.822,11,-4.785,0,0.0397,0.000421,0.0197,0.0899,0.305,94.868
3,3,1o59UpKw81iHR0HPiSkJR0,1989 (Taylor's Version) [Deluxe],1OcSfkeCg9hRC2sFKB4IMJ,Out Of The Woods (Taylor's Version),0.545,100,0.885,0,-5.968,1,0.0447,0.000537,5.6e-05,0.385,0.206,92.021
4,4,1o59UpKw81iHR0HPiSkJR0,1989 (Taylor's Version) [Deluxe],2k0ZEeAqzvYMcx9Qt5aClQ,All You Had To Do Was Stay (Taylor's Version),0.588,100,0.721,0,-5.579,1,0.0317,0.000656,0.0,0.131,0.52,96.997


In [21]:
# lets delete some columns we won't care about
df = df.drop(["Unnamed: 0", "album_id", "album_name", "id", "track_name"], axis=1)

df.head()

Unnamed: 0,danceability,swiftiness,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.757,100,0.61,7,-4.84,1,0.0327,0.00942,3.7e-05,0.367,0.685,116.998
1,0.733,100,0.733,0,-5.376,1,0.067,0.0885,0.0,0.168,0.701,96.057
2,0.511,100,0.822,11,-4.785,0,0.0397,0.000421,0.0197,0.0899,0.305,94.868
3,0.545,100,0.885,0,-5.968,1,0.0447,0.000537,5.6e-05,0.385,0.206,92.021
4,0.588,100,0.721,0,-5.579,1,0.0317,0.000656,0.0,0.131,0.52,96.997


In [22]:
df.shape

(246, 12)

Next we clean the data. We need to do the following:

- check for missing values and handle them
- encode any categorical data. There are technically some categories (mode, key), but they are already encoded! So we're good there
- remove outliers- lets assume we want to keep all the data points since we dont have a ton
- split the data into training and testing
- scale the features

In [23]:
# check for missing data
df.isna().sum()

danceability        0
swiftiness          0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
dtype: int64

No missing data! So we can move forward with splitting our data into a training and testing set.

In [24]:
# split the target from the features

yDF = pd.DataFrame(df["danceability"])

yDF.head()

Unnamed: 0,danceability
0,0.757
1,0.733
2,0.511
3,0.545
4,0.588


In [25]:
xDF = df.drop(columns="danceability")

xDF.head()

Unnamed: 0,swiftiness,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,100,0.61,7,-4.84,1,0.0327,0.00942,3.7e-05,0.367,0.685,116.998
1,100,0.733,0,-5.376,1,0.067,0.0885,0.0,0.168,0.701,96.057
2,100,0.822,11,-4.785,0,0.0397,0.000421,0.0197,0.0899,0.305,94.868
3,100,0.885,0,-5.968,1,0.0447,0.000537,5.6e-05,0.385,0.206,92.021
4,100,0.721,0,-5.579,1,0.0317,0.000656,0.0,0.131,0.52,96.997


In [26]:
# split training and testing set
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(xDF, 
                                                yDF,
                                                test_size = 0.3,
                                                shuffle=True)

xTrain.head()

Unnamed: 0,swiftiness,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
208,100,0.852,7,-4.775,1,0.0373,0.00233,0.0,0.554,0.442,120.969
154,100,0.316,5,-10.345,1,0.519,0.298,2e-06,0.0812,0.541,92.875
87,100,0.909,7,-3.669,1,0.0628,0.0222,0.0,0.333,0.541,100.023
46,100,0.509,7,-8.747,1,0.19,0.205,0.0,0.262,0.113,108.978
199,100,0.61,2,-7.283,1,0.0732,0.122,1e-06,0.13,0.374,74.957


In [27]:
yTrain.head()

Unnamed: 0,danceability
208,0.496
154,0.664
87,0.578
46,0.741
199,0.574


In [28]:
# Standardization
from sklearn.preprocessing import StandardScaler

xScaler = StandardScaler()
yScaler = StandardScaler()

xColNames = xTrain.columns.values.tolist()

xTrain[xColNames] = xScaler.fit_transform(xTrain[xColNames])

yColNames = yTrain.columns.values.tolist()

yTrain[yColNames] = yScaler.fit_transform(yTrain[yColNames])

# save the scalers for later predictions
with open('xScaler.pkl', 'wb') as f:
    pickle.dump(xScaler, f)

with open('yScaler.pkl', 'wb') as f:
    pickle.dump(yScaler, f)

xTrain.head()

Unnamed: 0,swiftiness,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
208,0.0,1.543455,0.659964,1.016736,0.285939,-0.334038,-0.975388,-0.13344,2.279222,0.252775,-0.031875
154,0.0,-1.325785,0.059362,-0.991303,0.285939,8.832269,-0.099646,-0.133331,-0.541904,0.774193,-0.949798
87,0.0,1.84858,0.659964,1.415459,0.285939,0.151204,-0.916535,-0.13344,0.960548,0.774193,-0.71625
46,0.0,-0.292644,0.659964,-0.415209,0.285939,2.571702,-0.375102,-0.13344,0.536902,-1.480018,-0.42366
199,0.0,0.248015,-0.841542,0.112578,0.285939,0.349106,-0.620938,-0.133381,-0.250722,-0.105371,-1.535237


In [29]:
xTest[xColNames] = xScaler.transform(xTest[xColNames])
yTest[yColNames] = yScaler.transform(yTest[yColNames])

xTest.head()

Unnamed: 0,swiftiness,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
151,0.0,-1.497083,0.059362,-1.904834,0.285939,0.518465,1.730803,-0.132634,-0.484026,-0.347646,0.919014
22,0.0,0.381841,-0.841542,1.322448,0.285939,-0.524329,-0.963896,-0.13344,-0.328291,-1.006002,0.657334
131,0.0,-1.497083,0.059362,-1.904834,0.285939,0.518465,1.730803,-0.132634,-0.484026,-0.347646,0.919014
69,0.0,-0.158818,-0.841542,0.591695,0.285939,-0.514814,-0.86411,-0.13344,-0.443451,-1.258811,-0.911407
44,0.0,-0.951071,-0.841542,-0.979406,0.285939,0.092214,0.270591,-0.132876,-0.27459,-0.021101,-0.062098


In [30]:
from sklearn.feature_selection import RFE
from sklearn import tree

model = tree.DecisionTreeRegressor()
selector = RFE(model, n_features_to_select=6, step=1)
selector = selector.fit(xTrain, yTrain)
selector.support_

array([False,  True, False, False, False,  True, False,  True,  True,
        True,  True])

In [31]:
columnsToRemove = []
for col in xTrain.columns:
    if col not in xTrain.columns[selector.get_support()]:
        columnsToRemove.append(col)

xTrain = xTrain.drop(columnsToRemove, axis=1)
xTest = xTest.drop(columnsToRemove, axis=1)

xTrain.head()

Unnamed: 0,energy,speechiness,instrumentalness,liveness,valence,tempo
208,1.543455,-0.334038,-0.13344,2.279222,0.252775,-0.031875
154,-1.325785,8.832269,-0.133331,-0.541904,0.774193,-0.949798
87,1.84858,0.151204,-0.13344,0.960548,0.774193,-0.71625
46,-0.292644,2.571702,-0.13344,0.536902,-1.480018,-0.42366
199,0.248015,0.349106,-0.133381,-0.250722,-0.105371,-1.535237


Now lets do some predictions!

In [32]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(xTrain, yTrain)
preds = model.predict(xTest)

print(r2_score(yTest, preds))
print(mean_absolute_error(yTest, preds))

0.2621974726854496
0.7266129958773057


In [33]:
from sklearn import svm

svmModel = svm.SVR().fit(xTrain, yTrain)

preds = svmModel.predict(xTest)

print(r2_score(yTest, preds))
print(mean_absolute_error(yTest, preds))

0.3903648479592837
0.6168704886209015


  y = column_or_1d(y, warn=True)


In [34]:
# bias check for svm
preds = svmModel.predict(xTrain)

print(r2_score(yTrain, preds))
print(mean_absolute_error(yTrain, preds))

0.5001444596602362
0.4993021613254676


In [35]:
from sklearn import tree

dtModel = tree.DecisionTreeRegressor().fit(xTrain, yTrain)

preds = dtModel.predict(xTest)

print(r2_score(yTest, preds))
print(mean_absolute_error(yTest, preds))

0.11206149908172203
0.7469709476855237


In [36]:
# bias check for tree
preds = dtModel.predict(xTrain)

print(r2_score(yTrain, preds))
print(mean_absolute_error(yTrain, preds))

1.0
0.0


In [37]:
from sklearn.neural_network import MLPRegressor

nnModel = MLPRegressor().fit(xTrain, yTrain)

preds = nnModel.predict(xTest)

print(r2_score(yTest, preds))
print(mean_absolute_error(yTest, preds))

  y = column_or_1d(y, warn=True)


0.490021873804086
0.589972057393258




In [38]:
# bias check for nn
preds = nnModel.predict(xTrain)

print(r2_score(yTrain, preds))
print(mean_absolute_error(yTrain, preds))

0.5704246581304977
0.5066641053134067
