In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import pickle


In [2]:
music_df=pd.read_csv('music_genre.csv')
music_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50005 entries, 0 to 50004
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  float64
 1   artist_name       50000 non-null  object 
 2   track_name        50000 non-null  object 
 3   popularity        50000 non-null  float64
 4   acousticness      50000 non-null  float64
 5   danceability      50000 non-null  float64
 6   duration_ms       50000 non-null  float64
 7   energy            50000 non-null  float64
 8   instrumentalness  50000 non-null  float64
 9   key               50000 non-null  object 
 10  liveness          50000 non-null  float64
 11  loudness          50000 non-null  float64
 12  mode              50000 non-null  object 
 13  speechiness       50000 non-null  float64
 14  tempo             50000 non-null  object 
 15  obtained_date     50000 non-null  object 
 16  valence           50000 non-null  float6

In [3]:
music_df=music_df.dropna()

In [4]:
music_df=music_df[music_df['tempo']!='?']

In [5]:
music_df=music_df[music_df['artist_name']!='empty_field']

In [6]:
music_df=music_df[music_df['duration_ms']>0]

In [7]:
music_df

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.950000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.011800,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.002530,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909000,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
6,43760.0,Jordan Comolli,Clash,46.0,0.02890,0.572,214408.0,0.803,0.000008,B,0.106,-4.294,Major,0.3510,149.995,4-Apr,0.230,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49999,28408.0,Night Lovell,Barbie Doll,56.0,0.13300,0.849,237667.0,0.660,0.000008,C,0.296,-7.195,Major,0.0516,99.988,4-Apr,0.629,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.000000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.000000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.000000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


In [8]:
data=music_df.copy().drop(['artist_name','track_name','instance_id','obtained_date','popularity','duration_ms'],axis=1).reset_index(drop=True)

In [9]:
songs=music_df.copy()[['instance_id','artist_name','track_name','music_genre']].reset_index(drop=True)

In [10]:
# encoding data
le=LabelEncoder()
data['key']=le.fit_transform(data['key'])

In [11]:
data['mode']=le.fit_transform(data['mode'])

In [12]:
encoder=OneHotEncoder(sparse=False)
music_genre=encoder.fit_transform(data['music_genre'].values.reshape(-1,1))
music_genre_names=encoder.categories_



In [13]:
music_genre_names

[array(['Alternative', 'Anime', 'Blues', 'Classical', 'Country',
        'Electronic', 'Hip-Hop', 'Jazz', 'Rap', 'Rock'], dtype=object)]

In [14]:
music_genre_df=pd.DataFrame(columns=music_genre_names,data=music_genre)

In [15]:
music_genre_df

Unnamed: 0,Alternative,Anime,Blues,Classical,Country,Electronic,Hip-Hop,Jazz,Rap,Rock
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
38551,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
38552,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
38553,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
38554,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
data=data.drop('music_genre',axis=1)

In [17]:
# data=pd.concat([data,music_genre_df],axis=1)

In [18]:
data.columns=data.columns.astype(str)

In [19]:
data

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,0.01270,0.622,0.890,0.950000,5,0.124,-7.043,1,0.0300,115.00200000000001,0.531
1,0.00306,0.620,0.755,0.011800,11,0.534,-4.617,0,0.0345,127.994,0.333
2,0.02540,0.774,0.700,0.002530,4,0.157,-4.498,0,0.2390,128.014,0.270
3,0.00465,0.638,0.587,0.909000,9,0.157,-6.266,0,0.0413,145.036,0.323
4,0.02890,0.572,0.803,0.000008,2,0.106,-4.294,0,0.3510,149.995,0.230
...,...,...,...,...,...,...,...,...,...,...,...
38551,0.13300,0.849,0.660,0.000008,3,0.296,-7.195,0,0.0516,99.988,0.629
38552,0.15700,0.709,0.362,0.000000,2,0.109,-9.814,0,0.0550,122.04299999999999,0.113
38553,0.00597,0.693,0.763,0.000000,5,0.143,-5.443,0,0.1460,131.079,0.395
38554,0.08310,0.782,0.472,0.000000,10,0.106,-5.016,1,0.0441,75.88600000000001,0.354


In [20]:
songs

Unnamed: 0,instance_id,artist_name,track_name,music_genre
0,46652.0,Thievery Corporation,The Shining Path,Electronic
1,30097.0,Dillon Francis,Hurricane,Electronic
2,62177.0,Dubloadz,Nitro,Electronic
3,24907.0,What So Not,Divide & Conquer,Electronic
4,43760.0,Jordan Comolli,Clash,Electronic
...,...,...,...,...
38551,28408.0,Night Lovell,Barbie Doll,Hip-Hop
38552,43557.0,Roy Woods,Drama (feat. Drake),Hip-Hop
38553,39767.0,Berner,Lovin' Me (feat. Smiggz),Hip-Hop
38554,57944.0,The-Dream,Shawty Is Da Shit,Hip-Hop


In [21]:
scaler=StandardScaler()
data_scaled=scaler.fit_transform(data)

In [22]:
#compute similarities between songs
cosine_sim =cosine_similarity(data_scaled,data_scaled)

In [23]:
cosine_sim.shape

(38556, 38556)

In [24]:
cosine_sim

array([[ 1.        , -0.13201759, -0.18630127, ..., -0.1218423 ,
         0.17063677,  0.16621528],
       [-0.13201759,  1.        ,  0.15245171, ...,  0.29518563,
         0.1694994 ,  0.18837832],
       [-0.18630127,  0.15245171,  1.        , ...,  0.8955456 ,
         0.06446   , -0.02481693],
       ...,
       [-0.1218423 ,  0.29518563,  0.8955456 , ...,  1.        ,
         0.05075142, -0.00240172],
       [ 0.17063677,  0.1694994 ,  0.06446   , ...,  0.05075142,
         1.        ,  0.60196254],
       [ 0.16621528,  0.18837832, -0.02481693, ..., -0.00240172,
         0.60196254,  1.        ]])

In [25]:
songs.loc[songs['artist_name']=='Muse']

Unnamed: 0,instance_id,artist_name,track_name,music_genre
27047,77204.0,Muse,Reapers,Rock
27208,46478.0,Muse,Resistance,Rock
27264,26976.0,Muse,Assassin,Rock
27367,91516.0,Muse,Starlight,Rock
27378,73134.0,Muse,Mercy,Rock
27409,23265.0,Muse,Break it to Me,Rock
27470,83980.0,Muse,Dig Down,Rock
27868,38164.0,Muse,The Dark Side,Rock
28085,38748.0,Muse,Time Is Running Out,Rock
28362,39182.0,Muse,Uprising,Rock


In [26]:
id=38748.0

In [27]:
# getting the song's index number
song_idx=songs.loc[songs['instance_id']==id].index

In [28]:
song_idx=song_idx[0]

In [29]:
#converting matrix to dataframe
cosine_sim=pd.DataFrame(cosine_sim)

In [30]:
cosine_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38546,38547,38548,38549,38550,38551,38552,38553,38554,38555
0,1.000000,-0.132018,-0.186301,0.548871,-0.195336,0.802922,0.124553,0.161348,-0.188246,0.534830,...,-0.107469,-0.070811,0.526200,-0.270931,-0.156745,-0.075698,-0.326080,-0.121842,0.170637,0.166215
1,-0.132018,1.000000,0.152452,0.274762,-0.076521,-0.051534,0.199228,-0.304069,-0.237029,-0.108402,...,-0.583438,0.294008,-0.204799,-0.140347,0.163761,0.277984,-0.105020,0.295186,0.169499,0.188378
2,-0.186301,0.152452,1.000000,0.058563,0.845930,-0.174189,0.469525,0.464504,-0.420006,0.134901,...,0.236544,0.705754,0.088363,0.740380,0.184590,0.384330,0.455085,0.895546,0.064460,-0.024817
3,0.548871,0.274762,0.058563,1.000000,-0.048436,0.574265,-0.004968,-0.240628,-0.314824,0.390983,...,-0.507616,-0.047159,0.582007,-0.229088,-0.266881,-0.127651,0.025017,0.142499,0.007170,-0.171756
4,-0.195336,-0.076521,0.845930,-0.048436,1.000000,-0.334800,0.621966,0.608655,-0.565455,0.310996,...,0.247786,0.306201,-0.150921,0.820999,0.384759,-0.023131,0.248590,0.724297,-0.298152,-0.324172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38551,-0.075698,0.277984,0.384330,-0.127651,-0.023131,0.016713,-0.328587,-0.091004,-0.174134,-0.372772,...,-0.030852,0.895711,0.407569,0.054652,-0.174565,1.000000,0.287624,0.464689,0.184167,0.413350
38552,-0.326080,-0.105020,0.455085,0.025017,0.248590,-0.387473,-0.252220,0.325287,0.068686,0.155623,...,0.377281,0.413492,0.238625,0.057819,-0.518882,0.287624,1.000000,0.375004,0.090060,-0.289460
38553,-0.121842,0.295186,0.895546,0.142499,0.724297,-0.096262,0.427632,0.351473,-0.591627,0.140166,...,-0.085507,0.670922,0.065604,0.525494,0.024267,0.464689,0.375004,1.000000,0.050751,-0.002402
38554,0.170637,0.169499,0.064460,0.007170,-0.298152,0.403032,0.154098,0.007547,0.590930,-0.150464,...,0.290181,0.244424,0.049444,-0.077944,-0.244078,0.184167,0.090060,0.050751,1.000000,0.601963


In [31]:
# iterating through the similarity scores to find closest to the song selected using index
rec=cosine_sim[song_idx]
score=rec.sort_values(ascending=False)
similar=songs.iloc[score.index]

In [32]:
type=songs.at[song_idx,'music_genre']

In [33]:
best_10=similar.loc[similar['music_genre']==type]

In [34]:
best_10=best_10[0:9].reset_index(drop=True)

In [35]:
best_10

Unnamed: 0,instance_id,artist_name,track_name,music_genre
0,38748.0,Muse,Time Is Running Out,Rock
1,39009.0,Def Leppard,Foolin',Rock
2,76148.0,Neon Indian,Polish Girl,Rock
3,48970.0,Nine Inch Nails,The Hand That Feeds,Rock
4,52042.0,PVRIS,You and I,Rock
5,53885.0,Red Hot Chili Peppers,Don't Forget Me,Rock
6,49960.0,Panic! At The Disco,House Of Memories,Rock
7,58951.0,Night Riots,Nothing Personal,Rock
8,32479.0,Dio,Rainbow in the Dark,Rock


In [37]:
# # saving data and songs as csv
# data.to_csv('song_data.csv',index=False)
# songs.to_csv('songlist.csv',index=False)


In [37]:
# checking results
music_df.loc[music_df['instance_id']==38748.0]

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
36474,38748.0,Muse,Time Is Running Out,68.0,0.00242,0.585,237040.0,0.842,0.00686,A,0.0866,-5.883,Minor,0.0556,118.211,4-Apr,0.428,Rock


In [38]:
music_df.loc[music_df['instance_id']==39009.0]

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
35966,39009.0,Def Leppard,Foolin',54.0,0.000337,0.608,274118.0,0.787,0.00782,A,0.111,-6.703,Minor,0.0277,113.796,4-Apr,0.474,Rock
