# **Importing Necessary Libraries**

In [1]:
import numpy as np

In [2]:
from utils.utils import load_json, write_json
from utils.similarity_score import cosine, match, manhattan_distance, inverse_manhattan_distance

In [3]:
# Song records used throughout the notebook: an indexable sequence of dicts that
# later cells read through keys such as 'id', 'artist' and the per-feature names.
songs = load_json('data/vector_data.json')

In [4]:
# User records; the scoring loop further down reads each entry's 'UserID' and 'SongIDs'.
users = load_json('data/selected_users.json')

# **Creating the Similarity Function**

In [86]:
# Feature name -> comparison function, looked up by `similarity` for every feature
# two songs share. Entries are grouped by the function they use; the helpers come
# from utils.similarity_score and their return ranges are not visible in this notebook.
function_map = {
    # Compared with `match`.
    **dict.fromkeys(
        (
            'in_movie', 'artist', 'album_type', 'album_name',
            'album_artist', 'explicit', 'key', 'mode',
        ),
        match,
    ),
    # Compared with `cosine`.
    **dict.fromkeys(
        (
            'lyrics', 'language', 'topics', 'sentiment',
            'genre', 'num_sections', 'num_segments', 'artist_vector',
        ),
        cosine,
    ),
    # Compared with `inverse_manhattan_distance`; these are the features that
    # also have an entry in `scale`.
    **dict.fromkeys(
        ('popularity', 'age', 'duration', 'loudness', 'tempo'),
        inverse_manhattan_distance,
    ),
    # Compared with `manhattan_distance`.
    # NOTE(review): presumably these values already lie in [0, 1] - confirm.
    **dict.fromkeys(
        (
            'listener_play_ratio', 'acousticness', 'danceability', 'energy',
            'instrumentalness', 'liveness', 'speechiness', 'valence',
        ),
        manhattan_distance,
    ),
}

In [111]:
# Value passed as the `scale=` keyword to the comparison function for the features
# listed here; features without an entry are compared with no scale argument.
# NOTE(review): the units behind these constants are not visible in this notebook.
scale = dict(
    tempo=100,
    loudness=10,
    duration=1e5,
    age=25,
    popularity=100,
)

In [184]:
# Relative weight of each feature in the weighted-average similarity score.
# A weight of 0 keeps the feature in the computation but removes its influence.
feature_weights = dict(
    # Text-derived and genre features.
    lyrics=0.5,
    language=0.8,
    topics=0.6,
    sentiment=0.8,
    genre=0.7,
    # Artist and album identity.
    artist=0,
    artist_vector=0.7,
    album_type=0.1,
    album_name=0.1,
    album_artist=0.2,
    # Track-level metadata.
    in_movie=0.1,
    popularity=0.2,
    age=0.4,
    explicit=0.1,
    duration=0.3,
    listener_play_ratio=0.4,
    # Audio descriptors.
    acousticness=0.5,
    danceability=0.5,
    energy=0.5,
    instrumentalness=0.2,
    key=0.2,
    liveness=0.2,
    loudness=0.5,
    mode=0.1,
    speechiness=0.2,
    tempo=0.6,
    valence=0.5,
    # Structure counts.
    num_sections=0.2,
    num_segments=0.2,
)


In [197]:
def _is_missing_value(value):
    """Return True for values the similarity score treats as absent.

    A value counts as absent when it is a list that is empty or holds only
    zeros, or when its string form is exactly ``'nan'``.
    """
    if isinstance(value, list) and value == [0] * len(value):
        return True
    return str(value) == 'nan'


def similarity(song1, song2, verbose=False):
    """Return the weighted-average similarity of two song feature dicts.

    Every key present in both songs (except ``'id'`` and ``'title'``) is
    compared with the function registered for it in ``function_map``. Each
    result is multiplied by the feature's weight from ``feature_weights``,
    and the sum of those weighted scores is divided by the sum of the
    weights that remain in play.

    A feature is dropped from both the numerator and the denominator when
    either song's value for it is an empty/all-zero list or has the string
    form ``'nan'``.

    Args:
        song1: Mapping of feature name to value for the first song.
        song2: Mapping of feature name to value for the second song.
        verbose: When true, print each feature's weighted score.

    Returns:
        The weighted mean of the per-feature scores, or ``0.0`` when no
        weight remains (nothing comparable, or every remaining weight is 0).

    Raises:
        KeyError: If a shared feature has no entry in ``feature_weights``
            or ``function_map``.
    """
    shared = set(song1.keys()).intersection(song2.keys())
    # discard() instead of remove(): a song without 'id'/'title' must not raise.
    shared.discard('id')
    shared.discard('title')

    weights = {name: feature_weights[name] for name in shared}

    # When either song has album_type == 0, album identity is ignored: the
    # album_artist weight is handed to artist_vector and both album weights are
    # zeroed. Only keys that are actually shared are touched, so a song pair
    # lacking one of them no longer raises KeyError.
    # NOTE(review): 0 is presumably the encoding for "no real album" - confirm.
    if 'album_type' in shared and (song1['album_type'] == 0 or song2['album_type'] == 0):
        if 'artist_vector' in weights and 'album_artist' in weights:
            weights['artist_vector'] += weights['album_artist']
        for album_key in ('album_name', 'album_artist'):
            if album_key in weights:
                weights[album_key] = 0

    scores = []
    # Sorted so the verbose listing and the float summation order are the same
    # on every run (plain set order varies with string hashing).
    for feature in sorted(shared):
        compare = function_map[feature]
        value1, value2 = song1[feature], song2[feature]

        # Absent data must not count for or against the pair: drop its weight too.
        if _is_missing_value(value1) or _is_missing_value(value2):
            del weights[feature]
            continue

        if feature in scale:
            score = compare(value1, value2, scale=scale[feature])
        else:
            score = compare(value1, value2)

        score = score * weights[feature]

        if verbose:
            print(f'{feature}: {score}')

        scores.append(score)

    total_weight = sum(weights.values())
    if total_weight == 0:
        # Nothing comparable: report no similarity instead of dividing by zero.
        return 0.0

    return sum(scores) / total_weight

In [173]:
# Print the position in `songs` of every track whose 'artist' is exactly 'Alan Walker'.
for position, track in enumerate(songs):
    if track['artist'] == 'Alan Walker':
        print(position)

1966
3207
3920
5157
5551
5951
6829
7085
8964
10784
11750


In [199]:
# Spot-check of the score for two rows of `songs`; verbose=True prints every
# per-feature weighted score before the final value is displayed.
# NOTE(review): 1966 and 3920 are positional indices taken from the artist search
# output, so they are only meaningful for the exact data file that search ran on.
similarity(songs[1966], songs[3920], verbose=True)

popularity: 0.12658227848101264
album_name: 0
genre: 0.6876339407468565
in_movie: 0.1
album_type: 0.1
artist: 0
explicit: 0.1
age: 0.37037037037037035
duration: 0.19867549668874174
artist_vector: 0.7999999999999999
album_artist: 0.0
listener_play_ratio: 0.375904763545557


0.9223118870427544

In [175]:
# Show the raw value of one feature for two rows of `songs`.
feat = 'listener_play_ratio'

# NOTE(review): this inspects rows 1966 and 3207, while the verbose similarity
# call in this notebook compares 1966 with 3920 - confirm which pair is intended.
songs[1966][feat], songs[3207][feat]

(0.1170976564065723, 0.249634562286828)

# **Using the User Data**

In [125]:
def find_song(id):
    """Return the first entry of ``songs`` whose ``'id'`` equals ``id``, else ``None``.

    The lookup is a linear scan over the module-level ``songs`` sequence.
    """
    return next((candidate for candidate in songs if candidate['id'] == id), None)

In [187]:
# For every user, treat the last entry of 'SongIDs' as the "main" song, compare it
# with each earlier entry, and record the mean similarity in `res`.
# Users are skipped when the main song is unknown or nothing could be compared.
res = []

for user in users:
    # An empty 'SongIDs' list has no last element; skip instead of raising IndexError.
    if not user['SongIDs']:
        continue

    main = find_song(user['SongIDs'][-1])
    userid = user['UserID']
    songids = user['SongIDs'][:-1]

    if main is None:
        continue

    arr = []
    # `song_id`, not `id`: a loop variable named `id` would rebind the builtin
    # for every later cell of the notebook.
    for song_id in songids:
        song = find_song(song_id)

        # IDs with no matching song record are ignored.
        if song is None:
            continue

        sim = similarity(main, song)
        arr.append(sim)

    if len(arr) == 0:
        continue

    avg = sum(arr) / len(arr)

    res.append({'userid': userid, 'avg_similarity': avg})

    print(f'User {userid} avg similarity: {avg}')

User 730 avg similarity: 0.7132425812568542
User 1038 avg similarity: 0.6470503102281183
User 1132 avg similarity: 0.691883500231268
User 1829 avg similarity: 0.7284584504826815
User 1854 avg similarity: 0.6485068402373406
User 1855 avg similarity: 0.9018039142872585
User 1856 avg similarity: 0.9018039142872585
User 2386 avg similarity: 0.7792750434982222
User 2459 avg similarity: 0.6143761374051876
User 2460 avg similarity: 0.6143761374051876
User 2916 avg similarity: 0.7893666283244785
User 3132 avg similarity: 0.6557286519939356
User 3133 avg similarity: 0.7651862893201199
User 3861 avg similarity: 0.7623465979595334
User 4045 avg similarity: 0.7255826317085843
User 4434 avg similarity: 0.7470098038331124
User 4614 avg similarity: 0.875958115050451
User 4783 avg similarity: 0.6603664189908174
User 5106 avg similarity: 0.6549745223154377
User 5178 avg similarity: 0.7423590753416767
User 5285 avg similarity: 0.8128944111628454
User 5501 avg similarity: 0.7135286276353233
User 5550 avg

In [188]:
# Number of users for which an average similarity was recorded.
len(res)

1571

In [193]:
# Print every per-user record and tally those whose average similarity exceeds 0.7.
count = 0
for record in res:
    print(record)
    count += 1 if record['avg_similarity'] > 0.7 else 0

# Displayed as the cell result.
count

{'userid': '730', 'avg_similarity': 0.7132425812568542}
{'userid': '1038', 'avg_similarity': 0.6470503102281183}
{'userid': '1132', 'avg_similarity': 0.691883500231268}
{'userid': '1829', 'avg_similarity': 0.7284584504826815}
{'userid': '1854', 'avg_similarity': 0.6485068402373406}
{'userid': '1855', 'avg_similarity': 0.9018039142872585}
{'userid': '1856', 'avg_similarity': 0.9018039142872585}
{'userid': '2386', 'avg_similarity': 0.7792750434982222}
{'userid': '2459', 'avg_similarity': 0.6143761374051876}
{'userid': '2460', 'avg_similarity': 0.6143761374051876}
{'userid': '2916', 'avg_similarity': 0.7893666283244785}
{'userid': '3132', 'avg_similarity': 0.6557286519939356}
{'userid': '3133', 'avg_similarity': 0.7651862893201199}
{'userid': '3861', 'avg_similarity': 0.7623465979595334}
{'userid': '4045', 'avg_similarity': 0.7255826317085843}
{'userid': '4434', 'avg_similarity': 0.7470098038331124}
{'userid': '4614', 'avg_similarity': 0.875958115050451}
{'userid': '4783', 'avg_similarity

834

In [194]:
# Share of scored users, in percent, whose average similarity is above the 0.7
# threshold used when computing `count`.
count / len(res) * 100

53.087205601527685