# Table of contents
1. [Imports](#imports)
2. [Preprocessing](#preproc)

## 1. Imports

In [2]:
import copy
import numba
import matplotlib.pyplot as plt
plt.style.use('classic')
import numpy as np
import pandas as pd
import seaborn as sns 
import statistics
%matplotlib inline 
from os import getcwd
import plotly.express as px
import matplotlib.cm as cm
import pylab as pl

from scipy import sparse
from scipy import stats
from scipy.sparse import csr_matrix
from scipy import spatial

import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics.pairwise import cosine_similarity

from surprise import accuracy
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms.knns import KNNBasic

getcwd()

'C:\\Users\\tessa\\UVT Thesis'

## 2. Preprocessing

#### Song data

(The Spotify Audio Features Hit Predictor Dataset, 1960 - 2019)

In [3]:
# data source
# https://doi.org/10.4121/uuid:d77e74b0-66bc-47ac-8b25-5796d3084478
# https://data.4tu.nl/articles/dataset/The_Spotify_Audio_Features_Hit_Predictor_Dataset_1960-2019_/12716381/1

In [4]:
data00 = pd.read_csv('dataset-of-00s.csv', low_memory=False)
data10 = pd.read_csv('dataset-of-10s.csv', low_memory=False)
data60 = pd.read_csv('dataset-of-60s.csv', low_memory=False)
data70 = pd.read_csv('dataset-of-70s.csv', low_memory=False)
data80 = pd.read_csv('dataset-of-80s.csv', low_memory=False)
data90 = pd.read_csv('dataset-of-90s.csv', low_memory=False)

In [5]:
data = data00.merge(data10, how='outer')
data = data.merge(data60, how='outer')
data = data.merge(data70, how='outer')
data = data.merge(data80, how='outer')
song_data = data.merge(data90, how='outer')
data.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4,-7.27,1,0.0289,0.368,0.0,0.159,0.532,133.061,196707,4,30.88059,13,1
1,On The Hotline,Pretty Ricky,spotify:track:1zyqZONW985Cs4osz9wlsu,0.704,0.854,10,-5.477,0,0.183,0.0185,0.0,0.148,0.688,92.988,242587,4,41.51106,10,1
2,Clouds Of Dementia,Candlemass,spotify:track:6cHZf7RbxXCKwEkgAZT4mY,0.162,0.836,9,-3.009,1,0.0473,0.000111,0.00457,0.174,0.3,86.964,338893,4,65.32887,13,0
3,"Heavy Metal, Raise Hell!",Zwartketterij,spotify:track:2IjBPp2vMeX7LggzRN3iSX,0.188,0.994,4,-3.745,1,0.166,7e-06,0.0784,0.192,0.333,148.44,255667,4,58.59528,9,0
4,I Got A Feelin',Billy Currington,spotify:track:1tF370eYXUcWwkIvaq3IGz,0.63,0.764,2,-4.353,1,0.0275,0.363,0.0,0.125,0.631,112.098,193760,4,22.62384,10,1


In [6]:
song_data.isna().values.any()

False

In [7]:
song_data = song_data.rename(columns={'track': 'title'})

In [8]:
song_data['title'].value_counts()

Falling                       17
Angel                         13
Crazy                         12
You                           12
Hold On                       12
                              ..
Here, There And Everywhere     1
Kill This Love                 1
If I Have To Go Away           1
Gucci Gang                     1
Ni**as in Paris                1
Name: title, Length: 35860, dtype: int64

In [9]:
song_data = song_data.drop_duplicates(subset=['title'], keep='first', inplace=False, ignore_index=False)

In [10]:
song_data.head()

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4,-7.27,1,0.0289,0.368,0.0,0.159,0.532,133.061,196707,4,30.88059,13,1
1,On The Hotline,Pretty Ricky,spotify:track:1zyqZONW985Cs4osz9wlsu,0.704,0.854,10,-5.477,0,0.183,0.0185,0.0,0.148,0.688,92.988,242587,4,41.51106,10,1
2,Clouds Of Dementia,Candlemass,spotify:track:6cHZf7RbxXCKwEkgAZT4mY,0.162,0.836,9,-3.009,1,0.0473,0.000111,0.00457,0.174,0.3,86.964,338893,4,65.32887,13,0
3,"Heavy Metal, Raise Hell!",Zwartketterij,spotify:track:2IjBPp2vMeX7LggzRN3iSX,0.188,0.994,4,-3.745,1,0.166,7e-06,0.0784,0.192,0.333,148.44,255667,4,58.59528,9,0
4,I Got A Feelin',Billy Currington,spotify:track:1tF370eYXUcWwkIvaq3IGz,0.63,0.764,2,-4.353,1,0.0275,0.363,0.0,0.125,0.631,112.098,193760,4,22.62384,10,1


In [11]:
song_data.shape

(35860, 19)

In [12]:
for col in song_data.columns:
    print(col)

title
artist
uri
danceability
energy
key
loudness
mode
speechiness
acousticness
instrumentalness
liveness
valence
tempo
duration_ms
time_signature
chorus_hit
sections
target


In [13]:
song_data.dtypes

title                object
artist               object
uri                  object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
time_signature        int64
chorus_hit          float64
sections              int64
target                int64
dtype: object

####  User data
(Million song dataset)

In [14]:
# Data source
# https://www.kaggle.com/anuragbanerjee/million-song-data-set-subset
# Million Songs Dataset Source: http://labrosa.ee.columbia.edu/millionsong/ Paper: http://ismir2011.ismir.net/papers/OS6-1.pdf 
# The current notebook uses a subset of the above data containing 10,000 songs obtained from: 
# https://github.com/turi-code/tutorials/blob/master/notebooks/recsys_rank_10K_song.ipynb

In [15]:
data2 = pd.read_fwf('10000.txt', header=None)
data2.head()

Unnamed: 0,0,1,2
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [16]:
data2.columns = ['user_id', 'song_id', 'play_count']

In [17]:
data3 = pd.read_csv('song_data.csv', header=0)
data3.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [18]:
user_data = data2.merge(data3, how='left')
user_data.head(n=11)

Unnamed: 0,user_id,song_id,play_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODXRTY12AB0180F3B,1,Paper Gangsta,The Fame Monster,Lady GaGa,2008
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFGUAY12AB017B0A8,1,Stacked Actors,There Is Nothing Left To Lose,Foo Fighters,1999
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,1,Sehr kosmisch,Musik von Harmonia,Harmonia,0


In [19]:
user_data.isna().values.any()

False

In [20]:
user_data.shape

(2086946, 7)

In [21]:
user_data['user_id'].value_counts()
# 76,353 unique id's and 2,086,946 observations, meaning that people on average listened to 27.3 songs

6d625c6557df84b60d90426c0116138b617b9449    724
fbee1c8ce1a346fa07d2ef648cec81117438b91f    687
4e11f45d732f4861772b2906f81a7d384552ad12    576
6a9cf03dfb2fc82f5b3b043c9c3fdbab997fd54d    545
1aa4fd215aadb160965110ed8a829745cde319eb    545
                                           ... 
a53f9a7139f5ad4896865b281cd011187af74918      1
4261658283ebc09092b4718dc10bb22fec4d80ff      1
9acbb256ba089356dc6ed3b31820badfc579da5d      1
38f8b6f0abab153083b15a0512b18f617df571bf      1
b923c4fc244d671f96ede79a60b10da1583ddab2      1
Name: user_id, Length: 76353, dtype: int64

In [22]:
user_data['song_id'].value_counts()
# 10,000 unique songs

SOFRQTD12A81C233C0    8277
SOWCKVR12A8C142411    7952
SOAUWYT12A81C206F1    7032
SOAXGDH12A8C13F8A1    6949
SOBONKR12A58A7A7E0    6412
                      ... 
SOLIGVL12AB017DBAE      51
SOWNLZF12A58A79811      51
SOBPGWB12A6D4F7EF3      50
SOYYBJJ12AB017E9FD      48
SOGSPGJ12A8C134FAA      48
Name: song_id, Length: 10000, dtype: int64

In [23]:
for col in user_data.columns:
    print(col)

user_id
song_id
play_count
title
release
artist_name
year


In [24]:
user_data.dtypes

user_id        object
song_id        object
play_count      int64
title          object
release        object
artist_name    object
year            int64
dtype: object

In [25]:
user_data = user_data.rename(columns={'release': 'album'})

#### Merge data

In [26]:
data_outer = pd.merge(song_data, user_data, how='outer')

#### Take the relative play count

In [27]:
data_outer.head()

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,time_signature,chorus_hit,sections,target,user_id,song_id,play_count,album,artist_name,year
0,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,4.0,30.88059,13.0,1.0,0a00498b9d607844a8826184ae7278097d1c008a,SOICKYJ12A6310E910,2.0,Urban Hymns,The Verve,1997.0
1,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,4.0,30.88059,13.0,1.0,7ec12bb04cc91eeb52f5bef8833aa7d51ee9310a,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0
2,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,4.0,30.88059,13.0,1.0,e07a79f2d3e0db17991f6eb8d5a3314e22795748,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0
3,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,4.0,30.88059,13.0,1.0,1a4f2d9fdfd834e1a8b0ecc3559e67971d6e78f1,SOICKYJ12A6310E910,3.0,Urban Hymns,The Verve,1997.0
4,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,4.0,30.88059,13.0,1.0,6e240eea9c3992fc27373129c30a1ccb05810497,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0


In [28]:
data_outer2 = data_outer.copy('deep')

In [29]:
data_outer2['tot_play_count'] = data_outer['play_count'].groupby(data_outer['user_id']).transform('sum')

In [30]:
grouped = data_outer.groupby(data_outer['user_id'])
data_outer2['number_of_obs'] = grouped['user_id'].transform('count')

In [31]:
data_outer2['mean_play_count'] = (data_outer2['tot_play_count']/data_outer2['number_of_obs'])

In [32]:
grouped = data_outer.groupby(data_outer['user_id'])
data_outer2['rel_rating'] = data_outer2['play_count'] / data_outer2['mean_play_count']

In [33]:
data_outer2['rel_rating'].max()

34.548913043478265

In [34]:
data_outer2.head()

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,user_id,song_id,play_count,album,artist_name,year,tot_play_count,number_of_obs,mean_play_count,rel_rating
0,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,0a00498b9d607844a8826184ae7278097d1c008a,SOICKYJ12A6310E910,2.0,Urban Hymns,The Verve,1997.0,90.0,29.0,3.103448,0.644444
1,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,7ec12bb04cc91eeb52f5bef8833aa7d51ee9310a,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0,11.0,9.0,1.222222,0.818182
2,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,e07a79f2d3e0db17991f6eb8d5a3314e22795748,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0,70.0,32.0,2.1875,0.457143
3,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,1a4f2d9fdfd834e1a8b0ecc3559e67971d6e78f1,SOICKYJ12A6310E910,3.0,Urban Hymns,The Verve,1997.0,123.0,59.0,2.084746,1.439024
4,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,6e240eea9c3992fc27373129c30a1ccb05810497,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0,91.0,32.0,2.84375,0.351648


In [35]:
#data_outer2['rel_play_count'] = data_outer2['mean_play_count'])/data_outer2['play_count']

In [36]:
#data_outer2['rel_play_count'] = data_outer2['rel_play_count'].round(3)

In [37]:
#grouped = data_outer.groupby(data_outer['user_id'])
#data_outer2['rel_play_count'] = grouped['user_id'].transform()

In [38]:
#df = data_outer['play_count'].value_counts(normalize=False).loc[lambda x:x>100]
#df.head(n=10)

In [39]:
#df = data_outer['play_count'].value_counts(normalize=True).cumsum()
#df.head(n=15)

In [40]:
# gebruik alleen scores tot 90%, dus tot een play count van 6. de rest ook op 6 zetten

#data_outer.loc[(data_outer['play_count'] > 6)] = 6

In [42]:
data_outer = data_outer2.copy(deep=True)

#### Split data into train and test

In [43]:
data_outer['user_id'].isna().sum()

#NA"s eruit halen

33736

In [44]:
data_outer = data_outer.dropna(subset=['user_id'])

In [45]:
data_outer['user_id'].value_counts()

6d625c6557df84b60d90426c0116138b617b9449    724
fbee1c8ce1a346fa07d2ef648cec81117438b91f    687
4e11f45d732f4861772b2906f81a7d384552ad12    576
6a9cf03dfb2fc82f5b3b043c9c3fdbab997fd54d    545
1aa4fd215aadb160965110ed8a829745cde319eb    545
                                           ... 
717b179b1a7224f74a0f58e333282f04ff960753      1
92d43bc6122091ddd779a2938d5b59b4b8114df1      1
836071687850be292b31b2af3c4b6b7ca8b52cbd      1
130e5c1007c9b7abe988ec9fd12790cf4d58bdf2      1
ba3815d62bf0a48e02393d7aa1207fa2b2b121d4      1
Name: user_id, Length: 76353, dtype: int64

In [46]:
from sklearn.model_selection import GroupShuffleSplit

In [47]:
data_outer.head()

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,user_id,song_id,play_count,album,artist_name,year,tot_play_count,number_of_obs,mean_play_count,rel_rating
0,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,0a00498b9d607844a8826184ae7278097d1c008a,SOICKYJ12A6310E910,2.0,Urban Hymns,The Verve,1997.0,90.0,29.0,3.103448,0.644444
1,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,7ec12bb04cc91eeb52f5bef8833aa7d51ee9310a,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0,11.0,9.0,1.222222,0.818182
2,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,e07a79f2d3e0db17991f6eb8d5a3314e22795748,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0,70.0,32.0,2.1875,0.457143
3,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,1a4f2d9fdfd834e1a8b0ecc3559e67971d6e78f1,SOICKYJ12A6310E910,3.0,Urban Hymns,The Verve,1997.0,123.0,59.0,2.084746,1.439024
4,Lucky Man,Montgomery Gentry,spotify:track:4GiXBCUF7H6YfNQsnBRIzl,0.578,0.471,4.0,-7.27,1.0,0.0289,0.368,...,6e240eea9c3992fc27373129c30a1ccb05810497,SOICKYJ12A6310E910,1.0,Urban Hymns,The Verve,1997.0,91.0,32.0,2.84375,0.351648


In [48]:
train_ind, test_ind = next(GroupShuffleSplit(test_size=0.25, n_splits=2, random_state=42).split(data_outer, groups=data_outer['user_id']))

print(train_ind)
print('\n')
print(test_ind)

[      0       3       4 ... 2086943 2086944 2086945]


[      1       2       6 ... 2086933 2086934 2086942]


In [49]:
traindata_outer = data_outer[data_outer.index.isin(train_ind)]

In [50]:
testdata_outer = data_outer[data_outer.index.isin(test_ind)]

In [51]:
# make a deep copy
data_inner = data_outer.copy(deep=True)
testdata_inner = testdata_outer.copy(deep=True)

In [52]:
# delete all na's for data_inner
data_inner = data_inner.dropna()
testdata_inner = testdata_inner.dropna()

In [53]:
print(data_outer.shape)
print(data_inner.shape)
print(testdata_outer.shape)
print(testdata_inner.shape)

(2086946, 29)
(651026, 29)
(515370, 29)
(163252, 29)


#### Normalize data

In [54]:
numeric_values = data_inner.select_dtypes(np.number)
numeric_values2 =(numeric_values-numeric_values.min())/(numeric_values.max()-numeric_values.min())
numeric_values2 = pd.DataFrame(numeric_values2)
numeric_values2.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,chorus_hit,sections,target,play_count,year,tot_play_count,number_of_obs,mean_play_count,rel_rating
684753,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.0,0.993532,0.016466,0.035961,0.014254,0.013905
684754,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.0,0.993532,0.065275,0.098202,0.026905,0.009268
684755,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.0,0.993532,0.022346,0.082988,0.003365,0.023698
684756,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.05102,0.993532,0.027051,0.045643,0.022262,0.066782
684757,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.040816,0.993532,0.009997,0.006916,0.062006,0.025694


In [55]:
numeric_values3 = data_outer.select_dtypes(np.number)
numeric_values4 =(numeric_values3-numeric_values3.min())/(numeric_values3.max()-numeric_values3.min())
numeric_values4 = pd.DataFrame(numeric_values4)
numeric_values4.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,chorus_hit,sections,target,play_count,year,tot_play_count,number_of_obs,mean_play_count,rel_rating
2120677,,,,,,,,,,,...,,,,,0.010204,0.999502,0.098206,0.037344,0.140658,0.004298
2120678,,,,,,,,,,,...,,,,,0.010204,0.999502,0.042635,0.062241,0.027889,0.017706
2120679,,,,,,,,,,,...,,,,,0.0,0.999502,0.020876,0.069156,0.005282,0.01997
2120680,,,,,,,,,,,...,,,,,0.010204,0.999502,0.059982,0.040111,0.074834,0.007933
2120681,,,,,,,,,,,...,,,,,0.0,0.999502,0.101441,0.048409,0.11047,0.00247


In [56]:
numeric_values5 = testdata_outer.select_dtypes(np.number)
numeric_values6 =(numeric_values5-numeric_values5.min())/(numeric_values5.max()-numeric_values5.min())
numeric_values6 = pd.DataFrame(numeric_values6)
numeric_values6.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,chorus_hit,sections,target,play_count,year,tot_play_count,number_of_obs,mean_play_count,rel_rating
2086907,,,,,,,,,,,...,,,,,0.010204,0.98408,0.194355,0.561549,0.008038,0.039495
2086910,,,,,,,,,,,...,,,,,0.0,0.98408,0.008527,0.037344,0.000916,0.02981
2086933,,,,,,,,,,,...,,,,,0.0,0.98408,0.028815,0.089903,0.006414,0.021093
2086934,,,,,,,,,,,...,,,,,0.0,0.98408,0.062629,0.195021,0.006505,0.020991
2086942,,,,,,,,,,,...,,,,,0.061224,0.98408,0.069391,0.062241,0.053267,0.043713


In [57]:
numeric_values7 = testdata_inner.select_dtypes(np.number)
numeric_values8 =(numeric_values7-numeric_values7.min())/(numeric_values7.max()-numeric_values7.min())
numeric_values8 = pd.DataFrame(numeric_values8)
numeric_values8.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,chorus_hit,sections,target,play_count,year,tot_play_count,number_of_obs,mean_play_count,rel_rating
684739,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.020408,0.993532,0.042046,0.109267,0.010263,0.058786
684744,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.0,0.993532,0.124669,0.237898,0.018687,0.013764
684749,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.0,0.993532,0.119377,0.235131,0.017705,0.014232
684754,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.0,0.993532,0.065275,0.098202,0.026905,0.010755
684756,0.622292,0.848359,0.545455,0.807874,0.0,0.444053,0.055332,0.0,0.375193,0.679282,...,0.75,0.123925,0.37931,1.0,0.05102,0.993532,0.027051,0.045643,0.022262,0.077618


In [58]:
# replace these columns in the original df

data_inner['danceability'] = numeric_values2['danceability']
data_inner['energy'] = numeric_values2['energy']
data_inner['key'] = numeric_values2['key']
data_inner['loudness'] = numeric_values2['loudness']
data_inner['mode'] = numeric_values2['mode']
data_inner['speechiness'] = numeric_values2['speechiness']
data_inner['acousticness'] = numeric_values2['acousticness']
data_inner['instrumentalness'] = numeric_values2['instrumentalness']
data_inner['liveness'] = numeric_values2['liveness']
data_inner['valence'] = numeric_values2['valence']
data_inner['tempo'] = numeric_values2['tempo']
data_inner['duration_ms'] = numeric_values2['duration_ms']
data_inner['time_signature'] = numeric_values2['time_signature']
data_inner['chorus_hit'] = numeric_values2['chorus_hit']
data_inner['sections'] = numeric_values2['sections']

data_outer['danceability'] = numeric_values4['danceability']
data_outer['energy'] = numeric_values4['energy']
data_outer['key'] = numeric_values4['key']
data_outer['loudness'] = numeric_values4['loudness']
data_outer['mode'] = numeric_values4['mode']
data_outer['speechiness'] = numeric_values4['speechiness']
data_outer['acousticness'] = numeric_values4['acousticness']
data_outer['instrumentalness'] = numeric_values4['instrumentalness']
data_outer['liveness'] = numeric_values4['liveness']
data_outer['valence'] = numeric_values4['valence']
data_outer['tempo'] = numeric_values4['tempo']
data_outer['duration_ms'] = numeric_values4['duration_ms']
data_outer['time_signature'] = numeric_values4['time_signature']
data_outer['chorus_hit'] = numeric_values4['chorus_hit']
data_outer['sections'] = numeric_values4['sections']

testdata_outer['danceability'] = numeric_values6['danceability']
testdata_outer['energy'] = numeric_values6['energy']
testdata_outer['key'] = numeric_values6['key']
testdata_outer['loudness'] = numeric_values6['loudness']
testdata_outer['mode'] = numeric_values6['mode']
testdata_outer['speechiness'] = numeric_values6['speechiness']
testdata_outer['acousticness'] = numeric_values6['acousticness']
testdata_outer['instrumentalness'] = numeric_values6['instrumentalness']
testdata_outer['liveness'] = numeric_values6['liveness']
testdata_outer['valence'] = numeric_values6['valence']
testdata_outer['tempo'] = numeric_values6['tempo']
testdata_outer['duration_ms'] = numeric_values6['duration_ms']
testdata_outer['time_signature'] = numeric_values6['time_signature']
testdata_outer['chorus_hit'] = numeric_values6['chorus_hit']
testdata_outer['sections'] = numeric_values6['sections']

testdata_inner['danceability'] = numeric_values8['danceability']
testdata_inner['energy'] = numeric_values8['energy']
testdata_inner['key'] = numeric_values8['key']
testdata_inner['loudness'] = numeric_values8['loudness']
testdata_inner['mode'] = numeric_values8['mode']
testdata_inner['speechiness'] = numeric_values8['speechiness']
testdata_inner['acousticness'] = numeric_values8['acousticness']
testdata_inner['instrumentalness'] = numeric_values8['instrumentalness']
testdata_inner['liveness'] = numeric_values8['liveness']
testdata_inner['valence'] = numeric_values8['valence']
testdata_inner['tempo'] = numeric_values8['tempo']
testdata_inner['duration_ms'] = numeric_values8['duration_ms']
testdata_inner['time_signature'] = numeric_values8['time_signature']
testdata_inner['chorus_hit'] = numeric_values8['chorus_hit']
testdata_inner['sections'] = numeric_values8['sections']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata_outer['danceability'] = numeric_values6['danceability']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata_outer['energy'] = numeric_values6['energy']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata_outer['key'] = numeric_values6['key']
A value is trying to be set on a copy of

In [59]:
## export train and test set

In [60]:
data_outer.to_csv('data_outer_preprocessed.csv', header=True, index=False)
data_inner.to_csv('data_inner_preprocessed.csv', header=True, index=False)
testdata_outer.to_csv('testdata_outer.csv', header=True, index=False)
testdata_inner.to_csv('testdata_inner.csv', header=True, index=False)