In [1]:
import pandas
import math
import sys
import numpy as np

In [2]:
 '''
    constants
'''
TOP_PLAYERS_COUNT = 3

In [3]:
# Read the dataset from music.csv (in the working directory) into a DataFrame.
with open("music.csv", 'r') as csv_handle:
    music = pandas.read_csv(csv_handle)

In [4]:
# Show the column names available in the loaded dataset.
print(music.columns.values)

['player' 'movies' 'songs' 'theme_songs' 'albums' 'national_awards'
 'other_awards' 'sung' 'stage_performance']


In [5]:
# Reference player: the first row whose "player" value is exactly "Ilayaraaja".
is_reference_player = music["player"] == "Ilayaraaja"
selected_player = music.loc[is_reference_player].iloc[0]

In [6]:
# Inspect the raw (un-normalized) statistics of the reference player.
print(selected_player)

player               Ilayaraaja
movies                     4000
songs                     15000
theme_songs                 500
albums                        8
national_awards              65
other_awards                250
sung                       1500
stage_performance          1500
Name: 0, dtype: object


In [7]:
# Numeric feature columns used for the distance computation
# (every dataset column except the "player" name).
distance_columns = (
    "movies songs theme_songs albums "
    "national_awards other_awards sung stage_performance"
).split()

In [8]:
# Display the list to confirm which feature columns were selected.
distance_columns

['movies',
 'songs',
 'theme_songs',
 'albums',
 'national_awards',
 'other_awards',
 'sung',
 'stage_performance']

In [9]:
def euclidean_distance(row, reference=None, columns=None):
    """
    Euclidean distance between ``row`` and a reference record.

    Parameters
    ----------
    row : mapping-like
        Record indexable by column name (for example a pandas Series or a dict).
    reference : mapping-like, optional
        Record to measure against. When omitted, the notebook-level
        ``selected_player`` is used, as in the original one-argument form.
    columns : iterable of str, optional
        Keys that take part in the distance. When omitted, the notebook-level
        ``distance_columns`` is used.

    Returns
    -------
    float
        Square root of the summed squared differences over ``columns``.
    """
    # Resolve the defaults at call time so the function follows the current
    # notebook globals, exactly like the original implementation did.
    if reference is None:
        reference = selected_player
    if columns is None:
        columns = distance_columns
    return math.sqrt(sum((row[k] - reference[k]) ** 2 for k in columns))

In [10]:
# Keep just the numeric feature columns; the "player" name column is left out.
music_numeric = music.loc[:, distance_columns]

In [11]:
# Inspect the numeric feature table before normalization.
print(music_numeric)

   movies  songs  theme_songs  albums  national_awards  other_awards  sung  \
0    4000  15000          500       8               65           250  1500   
1     600   6000          420       6               45           120   600   
2     400   7000          120       2               24            14   700   
3     120   6000          110       1               12            12   500   
4      90   1200           60       1               11            35   140   

   stage_performance  
0               1500  
1                700  
2                600  
3                400  
4                120  


In [12]:
# Standardize every column: subtract its mean and divide by its standard
# deviation, so columns with large raw values (e.g. songs) do not dominate
# the distance.
column_means = music_numeric.mean()
column_stds = music_numeric.std()
music_normalized = (music_numeric - column_means) / column_stds

In [13]:
# Inspect the standardized feature table.
print(music_normalized)

     movies     songs  theme_songs    albums  national_awards  other_awards  \
0  1.774595  1.594937     1.275416  1.370989         1.445512      1.611764   
1 -0.265169 -0.208384     0.879938  0.747812         0.585088      0.332586   
2 -0.385155 -0.008015    -0.603104 -0.498542        -0.318357     -0.710436   
3 -0.553136 -0.208384    -0.652538 -0.810130        -0.834611     -0.730116   
4 -0.571134 -1.170155    -0.899712 -0.810130        -0.877632     -0.503799   

       sung  stage_performance  
0  1.621666           1.616444  
1 -0.175747           0.069608  
2  0.023966          -0.123747  
3 -0.375460          -0.510456  
4 -1.094425          -1.051849  


In [14]:
from scipy.spatial import distance

In [15]:
# Replace any NaN in the standardized table with 0 so the distance
# computation never sees missing values (e.g. a column whose standard
# deviation is 0 would have produced NaN during normalization).
music_normalized = music_normalized.fillna(0)

In [16]:
# Standardized features of Ilayaraaja. The boolean mask is built on `music`
# and applied to `music_normalized` (both frames share the same row index);
# the result is a one-row DataFrame.
ilayaraaja_normalized = music_normalized.loc[music["player"] == "Ilayaraaja"]

In [17]:
# Display the standardized feature row of the reference player.
ilayaraaja_normalized

Unnamed: 0,movies,songs,theme_songs,albums,national_awards,other_awards,sung,stage_performance
0,1.774595,1.594937,1.275416,1.370989,1.445512,1.611764,1.621666,1.616444


In [18]:
# Find the distance between ilayaraaja and everyone else.
# `ilayaraaja_normalized` is a one-row DataFrame (2-D), while scipy's
# `distance.euclidean` is documented for 1-D vectors and newer SciPy releases
# reject 2-D input. Take the single row as a 1-D Series once, up front.
ilayaraaja_vector = ilayaraaja_normalized.iloc[0]
euclidean_distances = music_normalized.apply(
    lambda row: distance.euclidean(row, ilayaraaja_vector),
    axis=1,
)

In [19]:
# One distance per player, indexed by the player's row label in `music`.
print(euclidean_distances)

0    0.000000
1    3.994669
2    5.323788
3    6.028281
4    6.856106
dtype: float64


In [20]:
# Pair every distance with the row label it belongs to, so the label can
# still be used to look the player up in `music` after the frame is sorted.
distance_frame = pandas.DataFrame(
    {"dist": euclidean_distances, "idx": euclidean_distances.index}
)
print(distance_frame)

       dist  idx
0  0.000000    0
1  3.994669    1
2  5.323788    2
3  6.028281    3
4  6.856106    4


In [21]:
# Peek at the rows at positions 1 and 2 (the slice end is exclusive).
print(distance_frame[1:3])

       dist  idx
1  3.994669    1
2  5.323788    2


In [22]:
# Order players from most similar (smallest distance) to least similar.
distance_frame = distance_frame.sort_values(by="dist")

In [23]:
# Find the player most similar to Ilayaraaja. After sorting by distance,
# position 0 is Ilayaraaja himself (distance 0), so position 1 holds the
# closest other player. `idx` is that player's row label in `music`; it is
# converted with int() wherever it is used as a label below.
second_smallest = distance_frame.iloc[1]["idx"]

In [25]:
# Report the row label of the closest player as a plain integer.
closest_index = int(second_smallest)
print('closest player index: ' + str(closest_index))

closest player index: 1


In [30]:
# Look up the closest player's name in the original dataset by row label.
most_similar_to_ilayaraaja = music.loc[int(second_smallest), "player"]

In [31]:
# Name of the player closest to Ilayaraaja.
print(most_similar_to_ilayaraaja)

A. R. Rahman


In [32]:
# Closest player together with the (unrounded) distance at sorted position 1.
closest_distance = distance_frame.iloc[1]["dist"]
print('most similar : ' + str(most_similar_to_ilayaraaja) + ' - distance : ' + str(closest_distance))

most similar : A. R. Rahman - distance : 3.9946691469873015


In [33]:
# Number of players in the distance table (includes Ilayaraaja himself).
print(len(distance_frame))

5


In [36]:
print('Top '+str(TOP_PLAYERS_COUNT)+' Similar players to Ilayaraaja')
# `distance_frame` is sorted by ascending distance, and position 0 is
# Ilayaraaja himself (distance 0), so ranks start at position 1.
# Clamp to the number of *other* players so a TOP_PLAYERS_COUNT larger than
# the dataset cannot index past the end of the frame.
ranks_available = min(TOP_PLAYERS_COUNT, len(distance_frame) - 1)
for i in range(1, ranks_available + 1):
    ranked_row = distance_frame.iloc[i]
    # `idx` is the row label of this player in `music`; cast it to int
    # before using it as a label.
    most_similar_to_ilayaraaja = music.loc[int(ranked_row["idx"])]["player"]

    print('most similar '+str(i)+' : '+str(most_similar_to_ilayaraaja) + ' - distance : '+"{0:.2f}".format(ranked_row["dist"]))

Top 3 Similar players to Ilayaraaja
most similar 1 : A. R. Rahman - distance : 3.99
most similar 2 : Yuvan Shankar Raja - distance : 5.32
most similar 3 : Bharadwaj - distance : 6.03


In [38]:
# This cell lists the players FARTHEST from Ilayaraaja, so the heading says
# "Least" (the per-row label already says "most opposite").
print('\nLeast '+str(TOP_PLAYERS_COUNT)+' Similar Players to Ilayaraaja')
# Clamp the count: without it, a TOP_PLAYERS_COUNT larger than the frame makes
# the position below go negative and silently wrap around to the other end.
farthest_count = min(TOP_PLAYERS_COUNT, len(distance_frame))
for i in range(farthest_count):
    # Walk the distance-sorted frame backwards from its last (farthest) row.
    least_i = len(distance_frame) - 1 - i
    farthest_row = distance_frame.iloc[least_i]

    # `idx` is the row label of this player in `music`.
    least_similar_player = music.loc[int(farthest_row["idx"])]["player"]

    # Do not call this variable `distance`: that name is the imported
    # scipy.spatial.distance module, and overwriting it breaks any re-run of
    # the distance computation cell.
    distance_text = "{0:.2f}".format(farthest_row["dist"])

    print('most opposite '+str(i)+' : '+str(least_similar_player) + ' - distance : '+distance_text)


Most 3 Similar Players to Ilayaraaja
most opposite 0 : Harris Jayaraj - distance : 6.86
most opposite 1 : Bharadwaj - distance : 6.03
most opposite 2 : Yuvan Shankar Raja - distance : 5.32


In [39]:
print('\nAll players Sorted')
# Distance at which the score below reaches 0.
# NOTE(review): the origin of 18.9714833602 is not shown in this notebook —
# confirm how it was derived before reusing it with another dataset.
ZERO_SCORE_DISTANCE = 18.9714833602
for i in range(len(distance_frame)):
    ranked_row = distance_frame.iloc[i]
    # `idx` is the row label of this player in `music`.
    close_to_ilayaraaja = music.loc[int(ranked_row["idx"])]["player"]

    current_distance = ranked_row["dist"]
    # Linear score: 100 at distance 0, decreasing to 0 at ZERO_SCORE_DISTANCE,
    # and floored at 0 for anything farther away. The printed label calls it
    # "Percentile", but it is a rescaled distance, not a statistical percentile.
    percentile = 100 - (100 / ZERO_SCORE_DISTANCE) * current_distance

    if percentile < 0:
        percentile = 0

    # Do not call this variable `distance`: that name is the imported
    # scipy.spatial.distance module, and overwriting it breaks any re-run of
    # the distance computation cell.
    percentile_text = "{0:.2f}".format(percentile)

    print('similar '+str(i)+' : '+str(close_to_ilayaraaja) + ' - distance : '+'{0:.2f}'.format(current_distance) + ", Percentile : "+ percentile_text)


All players Sorted
similar 0 : Ilayaraaja - distance : 0.00, Percentile : 100.00
similar 1 : A. R. Rahman - distance : 3.99, Percentile : 78.94
similar 2 : Yuvan Shankar Raja - distance : 5.32, Percentile : 71.94
similar 3 : Bharadwaj - distance : 6.03, Percentile : 68.22
similar 4 : Harris Jayaraj - distance : 6.86, Percentile : 63.86
