In [1]:
import csv
import ast
import statistics
import math
from collections import defaultdict
from load_dataset_module import all

In [2]:
# Load the CSV file and keep only the needed columns, converting each value to its proper type
def load_dataset_module(data_path):
    """Read the CSV at ``data_path`` and return its rows as typed dictionaries.

    Only the columns listed in ``needed_columns`` are kept; every other
    column is dropped. Values are converted as follows:

    * ``artists``    -> parsed with ``ast.literal_eval`` (the cell holds a
      Python-literal list such as ``"['A', 'B']"``)
    * ``popularity`` -> ``int``
    * ``id``/``name`` -> ``str``
    * anything else  -> ``float``

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        A list with one dict per CSV row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, SyntaxError: If a cell cannot be converted to its type.
    """
    needed_columns = {'acousticness', 'artists', 'danceability', 'energy', 'id', 'liveness', 'loudness', 'name', 'popularity', 'speechiness', 'tempo', 'valence'}
    text_columns = ('id', 'name')
    data = []
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are parsed correctly.
    with open(data_path, mode='r', errors='ignore', encoding='utf-8', newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            typed_data = {}
            for k, v in row.items():
                if k not in needed_columns:
                    continue
                if k == 'artists':
                    # literal_eval only accepts Python literals, never arbitrary code
                    typed_data[k] = ast.literal_eval(v)
                elif k == 'popularity':
                    typed_data[k] = int(v)
                elif k in text_columns:
                    typed_data[k] = str(v)
                else:
                    typed_data[k] = float(v)
            data.append(typed_data)
    return data

In [3]:
# Parse ./data.csv and show the resulting list of typed row dicts as the cell output
load_dataset_module('./data.csv')

[{'acousticness': 0.995,
  'artists': ['Carl Woitschach'],
  'danceability': 0.708,
  'energy': 0.195,
  'id': '6KbQ3uYMLKb5jDxLF7wYDD',
  'liveness': 0.151,
  'loudness': -12.428,
  'name': 'Singende Bataillone 1. Teil',
  'popularity': 0,
  'speechiness': 0.0506,
  'tempo': 118.469,
  'valence': 0.779},
 {'acousticness': 0.994,
  'artists': ['Robert Schumann', 'Vladimir Horowitz'],
  'danceability': 0.379,
  'energy': 0.0135,
  'id': '6KuQTIu1KoTTkLXKrwlLPV',
  'liveness': 0.0763,
  'loudness': -28.454,
  'name': 'Fantasiestücke, Op. 111: Più tosto lento',
  'popularity': 0,
  'speechiness': 0.0462,
  'tempo': 83.97200000000002,
  'valence': 0.0767},
 {'acousticness': 0.604,
  'artists': ['Seweryn Goszczyński'],
  'danceability': 0.7490000000000001,
  'energy': 0.22,
  'id': '6L63VW0PibdM1HDSBoqnoM',
  'liveness': 0.119,
  'loudness': -19.924,
  'name': 'Chapter 1.18 - Zamek kaniowski',
  'popularity': 0,
  'speechiness': 0.929,
  'tempo': 107.177,
  'valence': 0.88},
 {'acousticness

In [4]:
# Creating artist_music dict
def get_artist_music(data):
    """Aggregate track rows into one summary record per artist.

    A track whose ``artists`` list names several artists contributes to each
    of them. For every artist the result holds the artist name, the names of
    all their tracks, and the median of each numeric feature over those
    tracks.

    Args:
        data: Iterable of row dicts. Each row must provide ``artists`` (a
            list of names), ``name`` and every key in ``feature_columns``.

    Returns:
        A dict mapping consecutive ids (starting at 1, in order of each
        artist's first appearance) to
        ``{'artist', 'music_names', <feature medians>...}``.

    Raises:
        KeyError: If a row lacks one of the required keys.
    """
    # An ordered tuple rather than a set: the key order of every record is
    # then the same on every run instead of depending on string hashing.
    feature_columns = (
        'acousticness', 'danceability', 'energy', 'liveness', 'loudness',
        'popularity', 'speechiness', 'tempo', 'valence',
    )

    # Group the tracks under each credited artist, preserving input order.
    tracks_by_artist = defaultdict(list)
    for row in data:
        for artist in row['artists']:
            tracks_by_artist[artist].append(row)

    # Summarise each artist with the median of every feature.
    artist_music = {}
    for artist_id, (artist, tracks) in enumerate(tracks_by_artist.items(), start=1):
        artist_music_data = {
            'artist': artist,
            'music_names': [music['name'] for music in tracks],
        }
        for f in feature_columns:
            artist_music_data[f] = statistics.median([music[f] for music in tracks])
        artist_music[artist_id] = artist_music_data
    return artist_music

# Creating music_features dict
def get_music_features(data):
    """Index every track row by a consecutive id starting at 1.

    Each stored record is a copy of the row restricted to the keys listed
    in ``wanted``; keys outside that set are dropped.

    Args:
        data: Iterable of row dicts.

    Returns:
        A dict mapping ``1..len(data)`` to the filtered row dicts, in input
        order.
    """
    wanted = {'id', 'artists','name', 'acousticness',  'energy','danceability', 'liveness', 'loudness', 'popularity', 'speechiness', 'tempo', 'valence'}
    return {
        position: {key: value for key, value in track.items() if key in wanted}
        for position, track in enumerate(data, start=1)
    }

In [5]:
# Build the id -> track-feature mapping from the parsed CSV and display it as the cell output
get_music_features(load_dataset_module('./data.csv'))

{1: {'acousticness': 0.995,
  'artists': ['Carl Woitschach'],
  'danceability': 0.708,
  'energy': 0.195,
  'id': '6KbQ3uYMLKb5jDxLF7wYDD',
  'liveness': 0.151,
  'loudness': -12.428,
  'name': 'Singende Bataillone 1. Teil',
  'popularity': 0,
  'speechiness': 0.0506,
  'tempo': 118.469,
  'valence': 0.779},
 2: {'acousticness': 0.994,
  'artists': ['Robert Schumann', 'Vladimir Horowitz'],
  'danceability': 0.379,
  'energy': 0.0135,
  'id': '6KuQTIu1KoTTkLXKrwlLPV',
  'liveness': 0.0763,
  'loudness': -28.454,
  'name': 'Fantasiestücke, Op. 111: Più tosto lento',
  'popularity': 0,
  'speechiness': 0.0462,
  'tempo': 83.97200000000002,
  'valence': 0.0767},
 3: {'acousticness': 0.604,
  'artists': ['Seweryn Goszczyński'],
  'danceability': 0.7490000000000001,
  'energy': 0.22,
  'id': '6L63VW0PibdM1HDSBoqnoM',
  'liveness': 0.119,
  'loudness': -19.924,
  'name': 'Chapter 1.18 - Zamek kaniowski',
  'popularity': 0,
  'speechiness': 0.929,
  'tempo': 107.177,
  'valence': 0.88},
 4: {'

In [6]:
# Build the id -> per-artist summary mapping from the parsed CSV and display it as the cell output
get_artist_music(load_dataset_module('./data.csv'))

{1: {'artist': 'Carl Woitschach',
  'music_names': ['Singende Bataillone 1. Teil',
   'Per aspera ad astra',
   'Singende Bataillone 2. Teil',
   'Lore, Lore, Lore',
   'Reserve hat Ruh 2. Teil',
   'Taxis Marsch',
   'Preussens Gloria',
   'Unsere Garde',
   'Ruhmesmärsche der alten Armee 2. Teil',
   'Ruhmesmärsche der alten Armee 1. Teil',
   'Mein Vaterland 2. Teil',
   'Manöverliebe 2. Teil',
   'Alle mit uns 2. Teil',
   'Mein Vaterland 1. Teil',
   'Manöverleben 1. Teil',
   'Deutscher Weckruf',
   'Reserve hat Ruh 1. Teil',
   'Alle mit uns 1. Teil',
   'Marsch der Schweizer Nationalgarde',
   'Unsere Marine',
   'Kaiser Friedrich Marsch'],
  'loudness': -8.73,
  'speechiness': 0.0474,
  'valence': 0.779,
  'energy': 0.438,
  'acousticness': 0.988,
  'liveness': 0.23,
  'danceability': 0.632,
  'popularity': 0,
  'tempo': 117.6},
 2: {'artist': 'Robert Schumann',
  'music_names': ['Fantasiestücke, Op. 111: Più tosto lento',
   'Nachtstücke, Op. 23: No. 4 in F',
   'Humoreske, O

In [7]:
# Similarity and distance measures, each computed over the numeric (int/float) values of two records
def cosine_similarity(data, id1, id2):
    """Cosine similarity between the numeric values of two records.

    Only values whose type is exactly ``int`` or ``float`` take part; they
    are paired in the order the two dicts yield them.

    The vector norms are used at full precision. Rounding them before the
    division skews results for small-magnitude vectors and can turn a tiny
    but non-zero norm into a division by zero. Only the final score is
    rounded.

    Args:
        data: Mapping of id -> record dict.
        id1: Id of the first record.
        id2: Id of the second record.

    Returns:
        The cosine similarity rounded to 3 decimals, or ``0.0`` when either
        vector has zero magnitude (the measure is undefined there).

    Raises:
        KeyError: If ``id1`` or ``id2`` is not in ``data``.
    """
    def norm(x):
        return math.sqrt(sum(a*a for a in x))
    def compute(v1, v2):
        numerator = sum(a*b for a,b in zip(v1,v2))
        denominator = norm(v1)*norm(v2)
        if denominator == 0:
            # Zero-magnitude (or empty) vector: no direction to compare.
            return 0.0
        return round(numerator/denominator,3)
    d1 = [v for v in data[id1].values() if type(v) in [int,float]]
    d2 = [v for v in data[id2].values() if type(v) in [int,float]]
    return compute(d1, d2)

def euclidean_similarity(data, id1, id2):
    """Euclidean distance between the numeric values of two records.

    Values whose type is exactly ``int`` or ``float`` are taken from each
    record in dict order and paired positionally; the result is the square
    root of the summed squared differences.

    Args:
        data: Mapping of id -> record dict.
        id1: Id of the first record.
        id2: Id of the second record.

    Returns:
        The distance as a float (0.0 for identical vectors).
    """
    numeric_types = (int, float)
    first = [value for value in data[id1].values() if type(value) in numeric_types]
    second = [value for value in data[id2].values() if type(value) in numeric_types]
    squared_gaps = [pow(p - q, 2) for p, q in zip(first, second)]
    return math.sqrt(sum(squared_gaps))

def jaccard_similarity(data, id1, id2):
    """Jaccard index of the sets of numeric values held by two records.

    The values whose type is exactly ``int`` or ``float`` are collected
    from each record as a set; the score is the size of the intersection
    divided by the size of the union.

    Args:
        data: Mapping of id -> record dict.
        id1: Id of the first record.
        id2: Id of the second record.

    Returns:
        A float between 0.0 and 1.0.
    """
    numeric_types = (int, float)
    left = {value for value in data[id1].values() if type(value) in numeric_types}
    right = {value for value in data[id2].values() if type(value) in numeric_types}
    shared = left & right
    combined = left | right
    return len(shared) / float(len(combined))

def manhattan_similarity(data, id1, id2):
    """Manhattan (L1) distance between the numeric values of two records.

    Values whose type is exactly ``int`` or ``float`` are taken from each
    record in dict order and paired positionally; the absolute differences
    of the pairs are added up.

    Args:
        data: Mapping of id -> record dict.
        id1: Id of the first record.
        id2: Id of the second record.

    Returns:
        The summed absolute difference (0 when there is nothing to compare).
    """
    numeric_types = (int, float)
    first = [value for value in data[id1].values() if type(value) in numeric_types]
    second = [value for value in data[id2].values() if type(value) in numeric_types]
    total = 0
    for p, q in zip(first, second):
        total += abs(p - q)
    return total

def pearson_similarity(data, id1, id2):
    """Pearson correlation coefficient of the numeric values of two records.

    Values whose type is exactly ``int`` or ``float`` are taken from each
    record in dict order and paired positionally. The coefficient is the
    sum of the products of the paired deviations from each mean, divided by
    the product of the square roots of each vector's summed squared
    deviations.

    Args:
        data: Mapping of id -> record dict.
        id1: Id of the first record.
        id2: Id of the second record.

    Returns:
        The correlation as a float.
    """
    numeric_types = (int, float)
    xs = [value for value in data[id1].values() if type(value) in numeric_types]
    ys = [value for value in data[id2].values() if type(value) in numeric_types]
    mean_x = statistics.mean(xs)
    mean_y = statistics.mean(ys)
    co_deviation = sum([(x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)])
    spread_x = math.sqrt(sum([pow(x - mean_x, 2) for x in xs]))
    spread_y = math.sqrt(sum([pow(y - mean_y, 2) for y in ys]))
    return co_deviation / (spread_x * spread_y)

In [8]:
def compute_similarity(similarity_func, data, id1, id2):
    """Apply ``similarity_func`` to two records of ``data`` and return its result.

    Args:
        similarity_func: Callable taking ``(data, id1, id2)``.
        data: Mapping of id -> record dict handed through to the callable.
        id1: Id of the first record.
        id2: Id of the second record.

    Returns:
        Whatever ``similarity_func`` returns.
    """
    score = similarity_func(data, id1, id2)
    return score

In [9]:
# Load the dataset once and derive both lookup tables from it
data_path = './data.csv'
data = load_dataset_module(data_path)
artist_music = get_artist_music(data)
music_features = get_music_features(data)

# Both tables are keyed 1..N, so the number of entries is also the highest valid id
max_artist_music_id = len(artist_music)
max_music_features_id = len(music_features)

# Menu number -> similarity function (numbered from 1 in listing order)
similarity_map = dict(enumerate(
    (
        cosine_similarity,
        euclidean_similarity,
        jaccard_similarity,
        manhattan_similarity,
        pearson_similarity,
    ),
    start=1,
))

# Menu number -> dataset the similarity is computed on
data_map = {1: artist_music, 2: music_features}

In [11]:
def _prompt_int(prompt, is_valid, error_message):
    """Ask repeatedly until the user enters an integer accepted by ``is_valid``.

    Args:
        prompt: Text shown by ``input``.
        is_valid: Callable taking the parsed int and returning True when it
            is acceptable.
        error_message: Text printed after a rejected or non-numeric entry.

    Returns:
        The first accepted integer.
    """
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            # Non-numeric text is treated like any other invalid choice
            # instead of aborting the whole session.
            print(error_message)
            continue
        if is_valid(value):
            return value
        print(error_message)


# Banner and menu of available options
print("Welcome To Similarity Finder")
print("============================")
print("You can find similarity within artist or music.")
print("\nBasic Info")
print("----------")
print(f"Artist index start from 1 and ends at {max_artist_music_id}")
print(f"Music index start from 1 and ends at {max_music_features_id}")
print("\nAvailable Similarity Functions")
print("------------------------------")
for k,v in similarity_map.items():
    print(f"{k}.{v.__name__}")
print("\nAvailable Data")
print("----------------")
print("1.artist_music")
print("2.music_features")

# Keep answering queries until the user declines to continue
inp = 'y'
while inp not in ['n','no']:
    sim_fun = _prompt_int(
        "\nEnter similarity function number:",
        lambda value: value in similarity_map,
        "Invalid input! Please enter a value from 1-5",
    )
    sim_data = _prompt_int(
        "Enter data number:",
        lambda value: value in data_map,
        "Invalid input! Please enter a value 1 or 2",
    )
    id_max = max_artist_music_id if sim_data == 1 else max_music_features_id
    # Both bounds must hold (`and`): with `or` every integer passed the check
    # and an out-of-range id later failed with a KeyError.
    id_data1 = _prompt_int(
        "Enter first id:",
        lambda value: 1 <= value <= id_max,
        f"Invalid input! Please enter a value from 1 and {id_max}",
    )
    id_data2 = _prompt_int(
        "Enter second id:",
        lambda value: 1 <= value <= id_max,
        f"Invalid input! Please enter a value from 1 and {id_max}",
    )

    print(f"\n\nSimilarity: {compute_similarity(similarity_map[sim_fun],data_map[sim_data],id_data1,id_data2)}")
    # Normalise the answer so "N", "No" or " n " also end the session
    inp = input("\nDo you want to continue(y/n):").strip().lower()

Welcome To Similarity Finder
You can find similarity within artist or music.

Basic Info
----------
Artist index start from 1 and ends at 27622
Music index start from 1 and ends at 169909

Available Similarity Functions
------------------------------
1.cosine_similarity
2.euclidean_similarity
3.jaccard_similarity
4.manhattan_similarity
5.pearson_similarity

Available Data
----------------
1.artist_music
2.music_features

Enter similarity function number:1
Enter data number:1
Enter first id:5
Enter second id:6


Similarity: 0.986

Do you want to continue(y/n):y

Enter similarity function number:5
Enter data number:3
Invalid input! Please enter a value 1 or 2
Enter data number:2
Enter first id:4
Enter second id:5


Similarity: 0.9920912362889084

Do you want to continue(y/n):n
