In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

random_seed_number=42
np.random.seed(random_seed_number)

In [2]:
df= pd.read_csv('/Users/josephlim/Desktop/Data Science/Capstone Projects/Capstone project- Spotify/Data/Cleaned Data/US_1921_2021_normalized')

In [3]:
df.head()

Unnamed: 0,popularity,duration_ms,year,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.94,126903,1922,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1.0,98200,1922,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1
2,1.0,181640,1922,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,1.0,176907,1922,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,1.0,163080,1922,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586672 entries, 0 to 586671
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        586672 non-null  float64
 1   duration_ms       586672 non-null  int64  
 2   year              586672 non-null  int64  
 3   danceability      586672 non-null  float64
 4   energy            586672 non-null  float64
 5   key               586672 non-null  object 
 6   loudness          586672 non-null  float64
 7   m_mode            586672 non-null  object 
 8   speechiness       586672 non-null  float64
 9   acousticness      586672 non-null  float64
 10  instrumentalness  586672 non-null  float64
 11  liveness          586672 non-null  float64
 12  valence           586672 non-null  float64
 13  tempo             586672 non-null  float64
 14  time_signature    586672 non-null  int64  
dtypes: float64(10), int64(3), object(2)
memory usage: 67.1+ MB


As we saw in the previous stage, narrowing down on the past ten years gave more insight in the data. We'll focus on data from the past ten years.

In [5]:
years= df['year'].unique()
last_ten= years[90:]
df_10= df.loc[df['year'].isin(last_ten)]

We also won't need years moving forward.

In [6]:
df_noyr= df.drop('year', axis=1)

In [7]:
df_noyr.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,m_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0.94,126903,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1.0,98200,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1
2,1.0,181640,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,1.0,176907,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,1.0,163080,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4


Let's create dummy variables for categorical variables.

In [8]:
dfo=df_noyr.select_dtypes(include=['object'])
cat_col= dfo.columns.to_list()
num_col=[x for x in df_noyr.columns.to_list() if x not in cat_col]
cat_keys= ['K','M']
prefix_dict= dict(zip(cat_col, cat_keys))
df=pd.concat([df_noyr.drop(dfo, axis=1), pd.get_dummies(dfo, prefix= prefix_dict)], axis=1)

In [9]:
df

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,K_C,K_C#,K_D,K_D#,K_E,K_F,K_F#,K_G,M_major,M_minor
0,0.94,126903,0.645,0.4450,-13.338,0.4510,0.674,0.744000,0.1510,0.1270,...,1,0,0,0,0,0,0,0,1,0
1,1.00,98200,0.695,0.2630,-22.136,0.9570,0.797,0.000000,0.1480,0.6550,...,1,0,0,0,0,0,0,0,1,0
2,1.00,181640,0.434,0.1770,-21.180,0.0512,0.994,0.021800,0.2120,0.4570,...,0,1,0,0,0,0,0,0,1,0
3,1.00,176907,0.321,0.0946,-27.961,0.0504,0.995,0.918000,0.1040,0.3970,...,0,0,0,0,0,0,0,1,1,0
4,1.00,163080,0.402,0.1580,-16.900,0.0390,0.989,0.130000,0.3110,0.1960,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,0.50,258267,0.560,0.5180,-7.471,0.0292,0.785,0.000000,0.0648,0.2110,...,1,0,0,0,0,0,0,0,0,1
586668,0.28,153293,0.765,0.6630,-5.223,0.0652,0.141,0.000297,0.0924,0.6860,...,1,0,0,0,0,0,0,0,1,0
586669,0.30,187601,0.535,0.3140,-12.823,0.0408,0.895,0.000150,0.0874,0.0663,...,0,0,0,0,0,0,0,1,0,1
586670,0.42,142003,0.696,0.6150,-6.212,0.0345,0.206,0.000003,0.3050,0.4380,...,0,0,0,0,0,0,0,0,1,0


Let's separate numerical values from categorical values and standardize them. We don't need to standardize categorical values again after turning them into dummy variables.

In [10]:
df_num= df[num_col]

In [11]:
scaler=StandardScaler()
scaled_df= scaler.fit_transform(df_num)
scaled_df= pd.DataFrame(scaled_df, columns= num_col)

In [12]:
df_s=pd.concat([df.drop(scaled_df, axis=1), scaled_df], axis=1)
df_s.columns=df.columns.to_list()

In [13]:
df_s.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,K_C,K_C#,K_D,K_D#,K_E,K_F,K_F#,K_G,M_major,M_minor
0,0,0,0,0,1,0,0,0,0,0,...,0.490096,-0.385182,-0.615393,1.924128,0.642528,2.362779,-0.341434,-1.650527,-0.457392,-1.845842
1,0,0,0,0,1,0,0,0,0,0,...,0.791115,-1.107625,-2.34411,4.736917,0.995129,-0.42512,-0.35771,0.3986,-0.552876,-6.072724
2,0,0,0,0,0,1,0,0,0,0,...,-0.780204,-1.449,-2.156266,-0.298309,1.559864,-0.343432,-0.010498,-0.369823,0.401596,2.38104
3,0,0,0,0,0,0,0,0,0,0,...,-1.460507,-1.776084,-3.488663,-0.302756,1.56273,3.014787,-0.596418,-0.602678,1.730782,-1.845842
4,0,0,0,0,0,0,0,1,0,0,...,-0.972856,-1.52442,-1.315289,-0.366127,1.54553,0.062013,0.526596,-1.382743,-0.51219,0.267599


Let's split and save the data to use in modeling stage.

In [14]:
X= df_s.drop('popularity', axis=1)
y= df_s['popularity']

In [15]:
X_train, X_test, y_train, y_test= train_test_split(X, y,test_size=0.3, random_state=12)

In [16]:
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)