In [2]:
#importing necessary libraries
import sklearn as sk
import pandas as pd
import numpy as np
import re

In [3]:
#reading the cleaned dataset
df=pd.read_csv(r'data_without_outliers.csv')

In [4]:
df.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Weird Al"" Yankovic",0.176934,0.664788,218165.2576,0.685136,0.162139,-9.877909,0.082392,132.495091,0.754409,33.681818,9,1,132,"['antiviral pop', 'comedy rock', 'comic', 'par..."
1,$pyda,0.000122,0.514,331240.0,0.899,0.367,-5.115,0.0602,174.028,0.266,58.0,7,1,1,[]
2,'Be More Chill' Ensemble,0.764,0.497,164400.0,0.474,0.453,-7.279,0.044,133.934,0.493,52.0,3,1,1,[]
3,'In The Heights' Original Broadway Company,0.636588,0.541,301908.2941,0.402282,0.193824,-9.405882,0.150006,114.952529,0.407765,46.588235,7,1,17,"['broadway', 'show tunes']"
4,'Little Women' Original Broadway Cast,0.902,0.345,275360.0,0.325,0.0721,-7.955,0.0354,90.03,0.168,42.0,1,1,2,[]


# Description of the columns

1) acousticness- confidence measure whether a track is acoustic or not -independant
2) danciblity- Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm        stability, beat strength, and overall regularity- dependant on tempo,loudness,energy,key,mode
3) duration_ms- duration of the song in miliseconds - independant
4) energy-represents a perceptual measure of intensity and activity - dependant on the bpm and loudness 
5) livliness-Detects the presence of an audience in the recording -independant
6) instrumentalness-Predicts whether a track contains no vocals -depends on speechiness
7) loudness-Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks -independant
8) speechiness-	Speechiness detects the presence of spoken words in a track-independant
9) tempo-The overall estimated tempo of a track in beats per minute -dependant on energy,danciblity
10) valance-describing the musical positiveness conveyed by a track.-independant
11) poularity-the recent popularity of the song in the USA- dependant on all the factors
12) key-The estimated overall key of the track. Integers map to pitches using standard-independant
13) mode-tells weather a song is in the major/minor scale-2 categories variable
14) count-The number of tracks from the original dataset, produced by the given artist (non necessay for any analysis)
15) genre-the genre of the song depends on all the above factors -dependant on all factors

Using visualization, it was found that instrumentalness had many missing values and since filling them will not add any value, it was removed
Popularity also had many missing values to be replaced hence such rows were dropped

It is noticed that there are no NaN values as such but they are represented by a '0'
Hence we will replace 0 with NaN

In [4]:
df=df.replace('[]',np.NaN)
df.loc[:,'danceability']=df.loc[:,'danceability'].replace(0.0,np.NaN)
df.loc[:,'energy']=df.loc[:,'energy'].replace(0.0,np.NaN)
df.loc[:,'tempo']=df.loc[:,'tempo'].replace(0,np.NaN)
df.loc[:,'valence']=df.loc[:,'valence'].replace(0.0,np.NaN)
df.loc[:,'acousticness']=df.loc[:,'acousticness'].replace(0.0,np.NaN)
df.loc[:,'liveness']=df.loc[:,'liveness'].replace(0.0,np.NaN)

In [5]:
df.isnull().sum()

artists            0
acousticness       8
danceability      45
duration_ms        0
energy             2
liveness           6
loudness           0
speechiness        0
tempo             45
valence           46
popularity         0
key                0
mode               0
count              0
genres          3661
dtype: int64

# Filling of Null values

1) There are only 8 outliers for accousticness and it is univariate therefore relaced using mean

In [6]:
#acousticness
df.loc[:,'acousticness']=df.loc[:,'acousticness'].replace(np.nan,df['acousticness'].mean())


2) There are 6 nan values for liveness and it is independant of any other factors. The data is heavily right skewed hence replacement is using mode

In [7]:
mode=df['liveness'].mode()
mode.values

array([0.103])

In [8]:
#2)liveliness
#it is observed that there is only 1 mode which is 0.103
mode=0.103
df.loc[:,'liveness']=df.loc[:,'liveness'].replace(np.nan,mode)

3) The remaining missing numeric values are under danceability,energy,tempo,valence which are dependant variables.
Hence we perform multivariate imputing

Since the number of labeled data is high compared to the number of missing values, we will use a supervised training algorithm called MICE
or Multiple Impution by Chained Equation.
Further details can be found under: https://jhu.pure.elsevier.com/en/publications/multiple-imputation-by-chained-equations-what-is-it-and-how-does--5

In [9]:
#getting columns that will be used to fill in the missing values
cols=['acousticness',
'danceability',
 'energy',
 'loudness',
 'speechiness',
 'tempo',
 'valence',
 'popularity',
 'key',
 'mode']


In [10]:
#importing the python library for mice
import sys
from impyute.imputation.cs import mice
sys.setrecursionlimit(100000) #Increase the recursion limit of the OS

# start the KNN training
imputed_training=mice(df[cols].values)
imputed_training

array([[1.76934091e-01, 6.64787879e-01, 6.85136364e-01, ...,
        3.36818182e+01, 9.00000000e+00, 1.00000000e+00],
       [1.22000000e-04, 5.14000000e-01, 8.99000000e-01, ...,
        5.80000000e+01, 7.00000000e+00, 1.00000000e+00],
       [7.64000000e-01, 4.97000000e-01, 4.74000000e-01, ...,
        5.20000000e+01, 3.00000000e+00, 1.00000000e+00],
       ...,
       [8.64000000e-01, 6.28000000e-01, 1.72000000e-01, ...,
        4.10000000e+01, 1.10000000e+01, 1.00000000e+00],
       [9.79000000e-01, 2.41000000e-01, 6.28000000e-02, ...,
        6.00000000e+01, 2.00000000e+00, 0.00000000e+00],
       [6.12000000e-01, 3.74000000e-01, 8.20000000e-02, ...,
        5.50000000e+01, 1.00000000e+00, 0.00000000e+00]])

In [11]:
df2=pd.DataFrame(imputed_training)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.176934,0.664788,0.685136,-9.877909,0.082392,132.495091,0.754409,33.681818,9.0,1.0
1,0.000122,0.514,0.899,-5.115,0.0602,174.028,0.266,58.0,7.0,1.0
2,0.764,0.497,0.474,-7.279,0.044,133.934,0.493,52.0,3.0,1.0
3,0.636588,0.541,0.402282,-9.405882,0.150006,114.952529,0.407765,46.588235,7.0,1.0
4,0.902,0.345,0.325,-7.955,0.0354,90.03,0.168,42.0,1.0,1.0


In [12]:
#relacing the filled values back into the original dataset
df['danceability']=df2[1]
df['energy']=df2[2]
df['tempo']=df2[5]
df['valence']=df2[6]

In [13]:
df.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Weird Al"" Yankovic",0.176934,0.664788,218165.2576,0.685136,0.162139,-9.877909,0.082392,132.495091,0.754409,33.681818,9,1,132,"['antiviral pop', 'comedy rock', 'comic', 'par..."
1,$pyda,0.000122,0.514,331240.0,0.899,0.367,-5.115,0.0602,174.028,0.266,58.0,7,1,1,
2,'Be More Chill' Ensemble,0.764,0.497,164400.0,0.474,0.453,-7.279,0.044,133.934,0.493,52.0,3,1,1,
3,'In The Heights' Original Broadway Company,0.636588,0.541,301908.2941,0.402282,0.193824,-9.405882,0.150006,114.952529,0.407765,46.588235,7,1,17,"['broadway', 'show tunes']"
4,'Little Women' Original Broadway Cast,0.902,0.345,275360.0,0.325,0.0721,-7.955,0.0354,90.03,0.168,42.0,1,1,2,


In [14]:
#testing purposes
df.iloc[2247]

artists         Brown Noise for Babies
acousticness                  0.431509
danceability                  0.508955
duration_ms                      73887
energy                        2.01e-05
liveness                         0.103
loudness                       -15.776
speechiness                          0
tempo                          103.321
valence                       0.180863
popularity                          60
key                                  5
mode                                 1
count                                1
genres                 ['white noise']
Name: 2247, dtype: object

4) Genres is a column containing categorical values

In [15]:
#to find the total number of unique genres
genre=df['genres']
genres_cnt=[]
lst=[]
for g in genre:
    g=re.sub(r'[^\w]', ' ', str(g))
    lst=g.split()
    genres_cnt.append(g)
print(len((list(set(genres_cnt)))))

8284


There are 3661 missing values in genre and 8284 unique genres in the dataset. It is impossible to predict the missing values. Instead of dropping such rows, NaN will be replaced by "others" as this column will not be used for any statistical analysis

In [16]:
df=df.fillna("[others]")

In [17]:
df.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Weird Al"" Yankovic",0.176934,0.664788,218165.2576,0.685136,0.162139,-9.877909,0.082392,132.495091,0.754409,33.681818,9,1,132,"['antiviral pop', 'comedy rock', 'comic', 'par..."
1,$pyda,0.000122,0.514,331240.0,0.899,0.367,-5.115,0.0602,174.028,0.266,58.0,7,1,1,[others]
2,'Be More Chill' Ensemble,0.764,0.497,164400.0,0.474,0.453,-7.279,0.044,133.934,0.493,52.0,3,1,1,[others]
3,'In The Heights' Original Broadway Company,0.636588,0.541,301908.2941,0.402282,0.193824,-9.405882,0.150006,114.952529,0.407765,46.588235,7,1,17,"['broadway', 'show tunes']"
4,'Little Women' Original Broadway Cast,0.902,0.345,275360.0,0.325,0.0721,-7.955,0.0354,90.03,0.168,42.0,1,1,2,[others]


In [18]:
df.isnull().sum()

artists         0
acousticness    0
danceability    0
duration_ms     0
energy          0
liveness        0
loudness        0
speechiness     0
tempo           0
valence         0
popularity      0
key             0
mode            0
count           0
genres          0
dtype: int64

All missing values have been filled