In [13]:
# Library import
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [14]:
# Read the raw music-genre CSV; the file lives in the ./rawdata folder
# next to this notebook.
raw_csv_path = './rawdata/music_genre.csv'
df = pd.read_csv(raw_csv_path)
# Preview both ends of the frame: the first seven rows, then the last rows.
for preview in (df.head(7), df.tail()):
    print(preview)

   instance_id           artist_name            track_name  popularity  \
0      32894.0              Röyksopp  Röyksopp's Night Out        27.0   
1      46652.0  Thievery Corporation      The Shining Path        31.0   
2      30097.0        Dillon Francis             Hurricane        28.0   
3      62177.0              Dubloadz                 Nitro        34.0   
4      24907.0           What So Not      Divide & Conquer        32.0   
5      89064.0            Axel Boman                 Hello        47.0   
6      43760.0        Jordan Comolli                 Clash        46.0   

   acousticness  danceability  duration_ms  energy  instrumentalness key  \
0       0.00468         0.652         -1.0   0.941          0.792000  A#   
1       0.01270         0.622     218293.0   0.890          0.950000   D   
2       0.00306         0.620     215613.0   0.755          0.011800  G#   
3       0.02540         0.774     166875.0   0.700          0.002530  C#   
4       0.00465         0.6

In [15]:
# Tally how many missing (NaN) entries each column holds.
missing_per_column = df.isna().sum()
print(missing_per_column)

instance_id         5
artist_name         5
track_name          5
popularity          5
acousticness        5
danceability        5
duration_ms         5
energy              5
instrumentalness    5
key                 5
liveness            5
loudness            5
mode                5
speechiness         5
tempo               5
obtained_date       5
valence             5
music_genre         5
dtype: int64


In [16]:
# Treat every 0 in the frame as missing, then drop any row that has at least
# one missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 and
# raises AttributeError there.
# NOTE(review): the replacement applies to every column, so a row is dropped
# if *any* of its values equals 0 -- confirm that no column has 0 as a
# legitimate value.
# NOTE(review): the preview printed after this step still shows
# duration_ms == -1.0, which looks like a placeholder that this step does not
# catch -- confirm whether it should also be treated as missing.
df = df.replace(0, np.nan).dropna()
# Sanity check: every per-column missing count should now be 0.
print(df.isnull().sum())

instance_id         0
artist_name         0
track_name          0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
obtained_date       0
valence             0
music_genre         0
dtype: int64


In [17]:
# Show the first seven rows and then the last rows of the current frame.
for preview in (df.head(7), df.tail()):
    print(preview)

   instance_id           artist_name            track_name  popularity  \
0      32894.0              Röyksopp  Röyksopp's Night Out        27.0   
1      46652.0  Thievery Corporation      The Shining Path        31.0   
2      30097.0        Dillon Francis             Hurricane        28.0   
3      62177.0              Dubloadz                 Nitro        34.0   
4      24907.0           What So Not      Divide & Conquer        32.0   
5      89064.0            Axel Boman                 Hello        47.0   
6      43760.0        Jordan Comolli                 Clash        46.0   

   acousticness  danceability  duration_ms  energy  instrumentalness key  \
0       0.00468         0.652         -1.0   0.941          0.792000  A#   
1       0.01270         0.622     218293.0   0.890          0.950000   D   
2       0.00306         0.620     215613.0   0.755          0.011800  G#   
3       0.02540         0.774     166875.0   0.700          0.002530  C#   
4       0.00465         0.6

In [18]:
# Min-max scale `loudness`: cast the column to float, then map it linearly so
# the column minimum becomes 0 and the column maximum becomes 1.
loudness_values = df['loudness'].astype(float)
loudness_min = loudness_values.min()
loudness_max = loudness_values.max()
df['loudness'] = loudness_values
df['normalized_loudness'] = (loudness_values - loudness_min) / (loudness_max - loudness_min)

In [19]:
# Min-max scale `duration_ms`: cast the column to float, then map it linearly
# so the column minimum becomes 0 and the column maximum becomes 1.
# NOTE(review): the printed previews show a duration_ms of -1.0; if that is a
# placeholder rather than a real duration it becomes the minimum used here --
# confirm.
duration_values = df['duration_ms'].astype(float)
duration_min = duration_values.min()
duration_max = duration_values.max()
df['duration_ms'] = duration_values
df['normalized_duration_ms'] = (duration_values - duration_min) / (duration_max - duration_min)

In [20]:
# Min-max scale `popularity`: cast the column to float, then map it linearly
# so the column minimum becomes 0 and the column maximum becomes 1.
popularity_values = df['popularity'].astype(float)
popularity_min = popularity_values.min()
popularity_max = popularity_values.max()
df['popularity'] = popularity_values
df['normalized_popularity'] = (popularity_values - popularity_min) / (popularity_max - popularity_min)

In [21]:
# Show the top and bottom rows of the frame, now including the normalized_*
# columns, one preview after the other.
print(df.head(), df.tail(), sep='\n')

   instance_id           artist_name            track_name  popularity  \
0      32894.0              Röyksopp  Röyksopp's Night Out        27.0   
1      46652.0  Thievery Corporation      The Shining Path        31.0   
2      30097.0        Dillon Francis             Hurricane        28.0   
3      62177.0              Dubloadz                 Nitro        34.0   
4      24907.0           What So Not      Divide & Conquer        32.0   

   acousticness  danceability  duration_ms  energy  instrumentalness key  ...  \
0       0.00468         0.652         -1.0   0.941           0.79200  A#  ...   
1       0.01270         0.622     218293.0   0.890           0.95000   D  ...   
2       0.00306         0.620     215613.0   0.755           0.01180  G#  ...   
3       0.02540         0.774     166875.0   0.700           0.00253  C#  ...   
4       0.00465         0.638     222369.0   0.587           0.90900  F#  ...   

   loudness   mode speechiness               tempo obtained_date val

In [22]:
# Persist the cleaned and normalized frame as a CSV in the working directory.
# pandas defaults are used, so the row index is written as the first column.
output_csv_path = 'input.csv'
df.to_csv(output_csv_path)