In [4]:
import numpy as np
import pandas as pd

In [5]:
from typing import List, Dict


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
# Load the song catalogue; each row is keyed by the song's `encode_id`.
song_data = pd.read_csv("song_data.csv", index_col="encode_id")
song_data.head()

Unnamed: 0_level_0,title,artists_name,album,genres_id
encode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZZAIEEWO,!!E,['Jasper Riedijk'],Black Flag (Single),"['IWZ9Z08O', 'IWZ9Z09A']"
ZUUB8EZC,#'s,['Yung Mal'],,"['IWZ9Z08O', 'IWZ9Z09B']"
ZUZF6F9C,#'s (Remix),"['Yung Mal', 'Stunna 4 Vegas']",#'s (Remix) (Single),"['IWZ9Z08O', 'IWZ9Z09B']"
ZWAC98UW,#A,['Fabrizio Moro'],Figli di nessuno,"['IWZ9Z08O', 'IWZ9Z097']"
ZW77F8E7,#BabyBaby,['MONSTAR'],#BabyBaby (Single),"['IWZ9Z08I', 'IWZ9Z088', 'IWZ97FCD']"


In [8]:
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [9]:
def preprocess(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [48]:
# Number of permutation functions used for every MinHash signature
# (passed as `num_perm` when hashing and when building the forest).
permutations: int = 128

# How many recommendations a query should return.
num_recommendations: int = 1


In [49]:
def get_forest(data, perms):
    """Build a MinHash LSH Forest over the ``genres_id`` column of ``data``.

    Every row's ``genres_id`` string is hashed into its own MinHash and
    stored in the forest under the row's *positional* index, so a key
    returned by ``forest.query`` can be passed directly to ``data.iloc``.

    Rows whose ``genres_id`` is not a string (for example NaN from an empty
    CSV field) are left out of the forest instead of raising
    ``AttributeError`` on ``.encode``; the keys of all other rows are
    unaffected because they come from ``enumerate`` over the full column.

    NOTE(review): the whole ``genres_id`` string is fed to the MinHash as a
    single element, so two rows only hash alike when their strings are
    identical. Queries must hash their genres the same way to match.

    Args:
        data: Table with a ``genres_id`` column (a pandas DataFrame in this
            notebook).
        perms: Number of permutations, used as ``num_perm`` for both the
            MinHash objects and the forest.

    Returns:
        The populated and indexed ``MinHashLSHForest``.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long build.
    start_time = time.perf_counter()

    forest = MinHashLSHForest(num_perm=perms)
    for position, genre in enumerate(data['genres_id']):
        if not isinstance(genre, str):
            # Missing genre information: nothing to hash for this row.
            continue
        m = MinHash(num_perm=perms)
        m.update(genre.encode('utf-8'))
        forest.add(position, m)

    # Added keys only become searchable once the forest has been indexed.
    forest.index()

    print('It tooks %s seconds to build forest.' % (time.perf_counter() - start_time))
    return forest


In [120]:
def predict(text, database, perms, num_results, forest):
    """Look up songs whose genres hash close to those of the song titled ``text``.

    The ``genres_id`` value of every row whose ``title`` equals ``text`` is
    folded into one MinHash, which is then used to query ``forest``. The
    keys returned by the forest are treated as positional row indices into
    ``database``.

    Args:
        text: Exact song title to look up in ``database['title']``.
        database: DataFrame with ``title`` and ``genres_id`` columns; must be
            the same table (same row order) the forest was built from.
        perms: Number of MinHash permutations; must match the forest's.
        num_results: Maximum number of neighbours to request from the forest.
        forest: Indexed ``MinHashLSHForest`` keyed by row position.

    Returns:
        A Series indexed like ``database`` whose values are the matched
        rows' ``title`` concatenated with their ``genres_id`` string, or
        ``None`` when ``text`` matches no row with genre data or the forest
        returns no neighbours.
    """
    start_time = time.perf_counter()

    # Missing genre values cannot be hashed, so drop them up front.
    genres = database.loc[database['title'] == text, 'genres_id'].dropna()
    if genres.empty:
        # Unknown title (or no genre data): querying with an empty MinHash
        # would return meaningless neighbours, so report "no result".
        return None

    m = MinHash(num_perm=perms)
    for genre in genres:
        m.update(genre.encode('utf-8'))
    idx_array = np.array(forest.query(m, num_results))

    if len(idx_array) == 0:
        return None  # the forest found no neighbours for this query

    matches = database.iloc[idx_array]
    result = matches['title'] + matches['genres_id']
    print('It took %s seconds to query forest.' % (time.perf_counter() - start_time))
    return result


In [51]:
# Build the LSH forest once over the full catalogue; the query cell reuses it.
forest = get_forest(data=song_data, perms=permutations)

It tooks 149.24301433563232 seconds to build forest.


In [121]:
# Query the forest for songs whose genres match the chosen title.
title = "Feliz Navidad"
num_recommendations = 5
result = predict(
    text=title,
    database=song_data,
    perms=permutations,
    num_results=num_recommendations,
    forest=forest,
)

print("\n Top Recommendations(s) is(are)\n", result)


It took 0.01199793815612793 seconds to query forest.

 Top Recommendations(s) is(are)
 encode_id
ZW6IZ6UU    Chuyện Đời Công Nhân['IWZ9Z08I', 'IWZ9Z088', '...
ZW6Z0C8I    Bướm Trắng['IWZ9Z08I', 'IWZ9Z088', 'IWZ9Z08B',...
ZW78E6AU    Apologize['IWZ9Z08I', 'IWZ9Z088', 'IWZ9Z08B', ...
ZW6WCWUW    Buông Tay Lặng Im Accapella['IWZ9Z08I', 'IWZ9Z...
ZW6IO7D7    Chôn Sâu Nỗi Nhớ['IWZ9Z08I', 'IWZ9Z088', 'IWZ9...
dtype: object
