In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.impute import SimpleImputer

___

# Data Cleaning and Wrangling
___

In [2]:
# Location of the raw training CSV, reloaded here for comparison and reference.
# NOTE(review): this is an absolute, machine-specific path and is unlikely to
# resolve on another machine.
train_path = "/Users/sa14/Desktop/Music Recommendation Algorithm with Unsupervised Learning/data/music_rec_trainset.csv"

# Read the raw training CSV into a dataframe
raw_df = pd.read_csv(filepath_or_buffer=train_path)

In [3]:
# Collect every column whose label matches "Unnamed", then drop them from raw_df in place
unnamed_cols = raw_df.filter(regex="Unnamed").columns
raw_df.drop(columns=unnamed_cols, inplace=True)

# Preview the first rows to confirm the columns were removed
raw_df.head()

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,...,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,sadness,feelings,topic,age
0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,0.000598,...,0.263751,0.000598,0.039288,0.000598,0.000598,0.000598,0.380299,0.117175,sadness,1.0
1,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,0.001284,...,0.001284,0.001284,0.118034,0.001284,0.212681,0.051124,0.001284,0.001284,world/life,1.0
2,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,0.00277,...,0.250668,0.00277,0.323794,0.00277,0.00277,0.00277,0.00277,0.225422,music,1.0
3,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,0.001548,...,0.001548,0.001548,0.001548,0.12925,0.001548,0.001548,0.225889,0.001548,romantic,1.0
4,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,0.00135,...,0.00135,0.00135,0.00135,0.00135,0.00135,0.029755,0.0688,0.00135,romantic,1.0


In [4]:
# NOTE(review): the original note states that this dataset (recommend_test_set.csv)
# "has already been trained with info from the raw dataset" — presumably it was
# derived from the raw data; confirm how it was produced.

# Location of the music recommendation test CSV.
# NOTE(review): this is an absolute, machine-specific path and is unlikely to
# resolve on another machine.
test_path = "/Users/sa14/Desktop/Music Recommendation Algorithm with Unsupervised Learning/data/recommend_test_set.csv"

# Read the test CSV into a dataframe
test_df = pd.read_csv(filepath_or_buffer=test_path)

In [5]:
# Preview the first 5 rows of test_df
test_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,topic,age
0,76885,godsmack,immune,1998,rock,come world society futher place home land deat...,74,0.000907,0.348191,0.375448,...,0.000907,0.019389,0.000907,0.000907,0.000907,0.000907,0.000907,0.018854,world/life,0.314286
1,65394,dennis brown,second chance,1993,reggae,maybe maybe treat good feel second best girl s...,43,0.001224,0.029943,0.001224,...,0.001224,0.001224,0.001224,0.001224,0.001224,0.056842,0.001224,0.062092,night/time,0.385714
2,10980,the black crowes,sister luck,1990,pop,worry sick eye hurt rest head life outside gir...,54,0.00112,0.48249,0.00112,...,0.00112,0.00112,0.00112,0.078222,0.00112,0.051132,0.031571,0.202862,violence,0.428571
3,842,jerry lee lewis,your cheating heart,1960,pop,cheat heart weep sleep sleep come night cheat ...,25,0.20474,0.002506,0.002506,...,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.474607,0.002506,sadness,0.857143
4,2764,paul anka,eso beso,1966,pop,beso kiss beso kiss know samba bossanova close...,97,0.00117,0.00117,0.00117,...,0.00117,0.00117,0.00117,0.314626,0.00117,0.053731,0.00117,0.00117,romantic,0.771429


In [14]:
# List the column labels of the raw dataframe.
# NOTE(review): this cell's execution count (14) is out of sequence with the
# neighbouring cells (5 and 6), so it was run out of order; re-run the notebook
# top to bottom to confirm the displayed output is current.
raw_df.columns

Index(['artist_name', 'track_name', 'release_date', 'genre', 'lyrics', 'len',
       'dating', 'violence', 'world/life', 'night/time', 'shake the audience',
       'family/gospel', 'romantic', 'communication', 'obscene', 'music',
       'movement/places', 'light/visual perceptions', 'family/spiritual',
       'sadness', 'feelings', 'topic', 'age'],
      dtype='object')

In [6]:
# Collect every column whose label matches "Unnamed", then drop them from test_df in place
unnamed_cols = test_df.filter(regex="Unnamed").columns
test_df.drop(columns=unnamed_cols, inplace=True)

# Preview the first rows to confirm the columns were removed
test_df.head()

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,...,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,topic,age
0,godsmack,immune,1998,rock,come world society futher place home land deat...,74,0.000907,0.348191,0.375448,0.000907,...,0.000907,0.019389,0.000907,0.000907,0.000907,0.000907,0.000907,0.018854,world/life,0.314286
1,dennis brown,second chance,1993,reggae,maybe maybe treat good feel second best girl s...,43,0.001224,0.029943,0.001224,0.306688,...,0.001224,0.001224,0.001224,0.001224,0.001224,0.056842,0.001224,0.062092,night/time,0.385714
2,the black crowes,sister luck,1990,pop,worry sick eye hurt rest head life outside gir...,54,0.00112,0.48249,0.00112,0.00112,...,0.00112,0.00112,0.00112,0.078222,0.00112,0.051132,0.031571,0.202862,violence,0.428571
3,jerry lee lewis,your cheating heart,1960,pop,cheat heart weep sleep sleep come night cheat ...,25,0.20474,0.002506,0.002506,0.129818,...,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.474607,0.002506,sadness,0.857143
4,paul anka,eso beso,1966,pop,beso kiss beso kiss know samba bossanova close...,97,0.00117,0.00117,0.00117,0.050289,...,0.00117,0.00117,0.00117,0.314626,0.00117,0.053731,0.00117,0.00117,romantic,0.771429


**The `lyrics` column will be dropped from most dataframes.**

In [7]:
# New dataframe: test_df without the "lyrics" column (test_df itself is left unchanged)
clean_test = test_df.drop(columns="lyrics")

In [8]:
# Preview the first 5 rows of clean_test
clean_test.head(n=5)

Unnamed: 0,artist_name,track_name,release_date,genre,len,dating,violence,world/life,night/time,shake the audience,...,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,topic,age
0,godsmack,immune,1998,rock,74,0.000907,0.348191,0.375448,0.000907,0.225414,...,0.000907,0.019389,0.000907,0.000907,0.000907,0.000907,0.000907,0.018854,world/life,0.314286
1,dennis brown,second chance,1993,reggae,43,0.001224,0.029943,0.001224,0.306688,0.001224,...,0.001224,0.001224,0.001224,0.001224,0.001224,0.056842,0.001224,0.062092,night/time,0.385714
2,the black crowes,sister luck,1990,pop,54,0.00112,0.48249,0.00112,0.00112,0.00112,...,0.00112,0.00112,0.00112,0.078222,0.00112,0.051132,0.031571,0.202862,violence,0.428571
3,jerry lee lewis,your cheating heart,1960,pop,25,0.20474,0.002506,0.002506,0.129818,0.002506,...,0.002506,0.002506,0.002506,0.002506,0.002506,0.002506,0.474607,0.002506,sadness,0.857143
4,paul anka,eso beso,1966,pop,97,0.00117,0.00117,0.00117,0.050289,0.00117,...,0.00117,0.00117,0.00117,0.314626,0.00117,0.053731,0.00117,0.00117,romantic,0.771429
