In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix


class Recommender:
    """Nearest-neighbour song recommender with fuzzy title lookup.

    A ``NearestNeighbors`` model is fitted on ``data`` at construction time.
    ``decode_id_song`` maps a song title to the row of ``data`` that holds
    that song; it is used both to resolve a free-text query to a row and to
    translate neighbour rows back into titles.

    Parameters:
        metric: distance metric forwarded to ``NearestNeighbors``.
        algorithm: search algorithm forwarded to ``NearestNeighbors``.
        k: default ``n_neighbors`` of the model (queries made through
            ``make_recommendation`` pass their own neighbour count).
        data: feature matrix, one row per song; must support row indexing.
        decode_id_song: dict mapping song title -> row index in ``data``.
    """

    # Lowest fuzz.ratio score at which a title is accepted as a match.
    MIN_MATCH_RATIO = 60

    def __init__(self, metric, algorithm, k, data, decode_id_song):
        self.metric = metric
        self.algorithm = algorithm
        self.k = k
        self.data = data
        self.decode_id_song = decode_id_song
        self.model = self._recommender().fit(data)

    def make_recommendation(self, new_song, n_recommendations):
        """Return titles of songs similar to ``new_song``.

        The list is ordered from the most distant of the returned neighbours
        to the closest one. It is empty when no title matches ``new_song``
        closely enough or when ``n_recommendations`` is 0.
        """
        recommended = self._recommend(new_song=new_song,
                                      n_recommendations=n_recommendations)
        print("... Done")
        return recommended

    def _recommender(self):
        """Build the (unfitted) nearest-neighbour model from the settings."""
        return NearestNeighbors(metric=self.metric,
                                algorithm=self.algorithm,
                                n_neighbors=self.k,
                                n_jobs=-1)

    def _recommend(self, new_song, n_recommendations):
        """Resolve the neighbours of ``new_song`` to a list of song titles."""
        # (row, distance) pairs of the neighbouring songs
        recommendation_ids = self._get_recommendations(
            new_song=new_song, n_recommendations=n_recommendations)
        # row -> title lookup
        recommendations_map = self._map_indeces_to_song_title(
            recommendation_ids)
        return [recommendations_map[idx] for idx, _ in recommendation_ids]

    def _get_recommendations(self, new_song, n_recommendations):
        """Return ``(row, distance)`` pairs for the neighbours of ``new_song``.

        Returns an empty list when the title cannot be matched, instead of
        indexing the feature matrix with ``None``.
        """
        recom_song_id = self._fuzzy_matching(song=new_song)
        if recom_song_id is None:
            return []
        print(f"Starting the recommendation process for {new_song} ...")
        # One extra neighbour is requested because the closest hit is
        # dropped again by _rank_neighbors (expected to be the song itself).
        distances, indices = self.model.kneighbors(
            self.data[recom_song_id], n_neighbors=n_recommendations + 1)
        return self._rank_neighbors(indices, distances)

    @staticmethod
    def _rank_neighbors(indices, distances):
        """Pair rows with distances, drop the closest, order farthest first.

        ``ravel`` (rather than ``squeeze``) keeps a one-element result a
        sequence, so a single-neighbour query yields ``[]`` rather than
        failing inside ``zip``.
        """
        pairs = zip(indices.ravel().tolist(), distances.ravel().tolist())
        return sorted(pairs, key=lambda pair: pair[1])[:0:-1]

    def _map_indeces_to_song_title(self, recommendation_ids):
        """Return the reverse of ``decode_id_song`` (row index -> title).

        ``recommendation_ids`` is accepted for interface compatibility but is
        not used: the full reverse mapping is always returned.
        """
        return {
            song_id: song_title
            for song_title, song_id in self.decode_id_song.items()
        }

    def _fuzzy_matching(self, song):
        """Return the row index of the title most similar to ``song``.

        Titles are compared case-insensitively with ``fuzz.ratio``; only
        scores of at least ``MIN_MATCH_RATIO`` qualify. Prints a message and
        returns ``None`` when nothing qualifies.
        """
        query = song.lower()
        found = False
        best_id = None
        best_ratio = self.MIN_MATCH_RATIO
        for title, idx in self.decode_id_song.items():
            ratio = fuzz.ratio(title.lower(), query)
            # ">=" so that, among equally good titles, the one appearing
            # last in the mapping wins (same tie-break as sorting ascending
            # and reversing).
            if ratio >= best_ratio:
                found = True
                best_id, best_ratio = idx, ratio
        if not found:
            print(
                f"The recommendation system could not find a match for {song}")
            return
        return best_id


# Listening triplets: tab-separated, no header row.
song_info = pd.read_csv('../input/million-song-data-set-subset/10000.txt',
                        sep='\t',
                        header=None)
song_info.columns = ['user_id', 'song_id', 'listen_count']

# Song metadata, reduced to one row per song_id.
song_actual = pd.read_csv('../input/million-song-data-set-subset/song_data.csv')
song_actual.drop_duplicates(['song_id'], inplace=True)

# Attach the metadata to every listening row (left join keeps all triplets),
# write the result to the working directory and load it back from there.
songs = pd.merge(song_info, song_actual, on="song_id", how="left")
songs.to_csv('./songs.csv', index=False)
df_songs = pd.read_csv('./songs.csv')

# song_id -> title lookup table. An earlier, commented-out attempt looped
# over `songs.items()`, which iterates columns rather than rows, so the
# mapping was never actually built.
dict1 = dict(zip(df_songs['song_id'], df_songs['title']))
#print(songs.head())

# Distinct titles, artists and users in the merged data.
unique_songs = df_songs['title'].unique().shape[0]
unique_artists = df_songs['artist_name'].unique().shape[0]
unique_users = df_songs['user_id'].unique().shape[0]

# ---- Ten most popular songs ------------------------------------------------
# Popularity = number of listening rows per title; ties are broken by title.
ten_pop_songs = (
    df_songs.groupby('title')['listen_count']
    .count()
    .reset_index()
    .sort_values(by=['listen_count', 'title'], ascending=[False, True])
)
# Share of all listening rows, in percent, rounded to two decimals.
ten_pop_songs['percentage'] = (
    ten_pop_songs['listen_count'] / ten_pop_songs['listen_count'].sum() * 100
).round(2)

ten_pop_songs = ten_pop_songs.head(10)
#print(ten_pop_songs)
labels = ten_pop_songs['title'].to_list()
counts = ten_pop_songs['listen_count'].to_list()

# Optional plot:
#plt.figure()
#sns.barplot(x=counts, y=labels, palette='Set3')
#sns.despine(left=True, bottom=True)

# ---- Ten most popular artists ----------------------------------------------
ten_pop_artists = (
    df_songs.groupby('artist_name')['listen_count']
    .count()
    .reset_index()
    .sort_values(by=['listen_count', 'artist_name'], ascending=[False, True])
)

ten_pop_artists = ten_pop_artists.head(10)
#print(ten_pop_artists)

# `labels` / `counts` are reused and now describe the artists.
#plt.figure()
labels = ten_pop_artists['artist_name'].to_list()
counts = ten_pop_artists['listen_count'].to_list()
#sns.barplot(x=counts, y=labels, palette='Set2')
#sns.despine(left=True, bottom=True)

# ---- Distribution of listen_count values -----------------------------------
# One row per distinct listen_count value with the number of occurrences.
listen_counts = pd.DataFrame(
    df_songs.groupby('listen_count').size(),
    columns=['count'],
)

#plt.figure(figsize=(20, 5))
#sns.boxplot(x='listen_count', data=df_songs)
#sns.despine()

# Only the listen_count values that occur more than 50 times.
listen_counts_temp = listen_counts.loc[listen_counts['count'] > 50].reset_index()

#plt.figure(figsize=(16, 8))
#sns.barplot(x='listen_count',
#            y='count',
#            palette='Set3',
#            data=listen_counts_temp)
#plt.gca().spines['top'].set_visible(False)
#plt.gca().spines['right'].set_visible(False)
#plt.show()

# ---- Number of song rows per user ------------------------------------------
song_user = df_songs.groupby('user_id').song_id.count()

#plt.figure(figsize=(16, 8))
#sns.distplot(song_user.values, color='orange')
#plt.gca().spines['top'].set_visible(False)
#plt.gca().spines['right'].set_visible(False)
#plt.show()

def _index_titles_by_row(song_ids, titles):
    """Map every song title to its row number without losing any row.

    The plain ``{title: row}`` mapping keeps only the last row of a title
    shared by several songs, so the earlier rows could not be translated
    back to a title. Those rows get an extra key of the form
    ``"<title> (<song_id>)"``; all plain-title entries stay as they were.
    """
    title_to_row = {title: row for row, title in enumerate(titles)}
    for row, (song_id, title) in enumerate(zip(song_ids, titles)):
        if title_to_row[title] != row:
            title_to_row[f"{title} ({song_id})"] = row
    return title_to_row


# Size of a full user x song-title grid.
values_matrix = unique_users * unique_songs

# Prepare the data: keep only users with more than 16 song rows
# (the variable names say "ten", the threshold in use is 16).
song_ten_id = song_user[song_user > 16].index.to_list()

df_song_id_more_ten = df_songs[df_songs['user_id'].isin(
    song_ten_id)].reset_index(drop=True)

# Pivot into a song x user table of listen counts; missing pairs become 0.
df_songs_features = df_song_id_more_ten.pivot(index='song_id',
                                              columns='user_id',
                                              values='listen_count').fillna(0)

# `df` is the observed table, `df1` an independent copy that receives the
# predicted counts; the sparse matrix feeds the Recommender.
df = df_songs_features
df1 = df.copy()
mat_songs_features = csr_matrix(df_songs_features.values)

# One (song_id, title) row per song.
df_unique_songs = df_songs.drop_duplicates(subset=['song_id']).reset_index(
    drop=True)[['song_id', 'title']]

# title -> row of the feature matrix, in the row order of df_songs_features.
decode_id_song = _index_titles_by_row(
    df_songs_features.index.tolist(),
    df_unique_songs.set_index('song_id').loc[
        df_songs_features.index].title.tolist())

# Fit the cosine / brute-force neighbour model on the sparse song features.
model = Recommender(
    data=mat_songs_features,
    decode_id_song=decode_id_song,
    metric='cosine',
    algorithm='brute',
    k=20,
)

# Ask for ten songs similar to a (loosely spelled) title and show them.
song = 'shy Boy'
new_recommendations = model.make_recommendation(song, 10)
print(f"The recommendations for {song} are:")
print(f"{new_recommendations}")


def recommend_songs(user, num_recommended_songs):
    """Print the songs ``user`` listened to and the top predicted new songs.

    Reads the module-level tables ``df`` (observed listen counts, songs x
    users), ``df1`` (same layout, holding predicted counts) and
    ``df_unique_songs`` (song_id / title pairs).

    Parameters:
        user: column label of the user in ``df`` / ``df1``.
        num_recommended_songs: how many recommendations to print.
    """
    user_counts = df[user]

    # Songs the user already listened to.
    for song_id in user_counts[user_counts > 0].index.tolist():
        print(song_id)

    print('\n')

    # Predicted counts of the songs the user has not listened to, taken
    # positionally from df1 with a single mask instead of searching the
    # index once per song.
    user_column = df1.columns.tolist().index(user)
    not_listened = (user_counts == 0).to_numpy()
    predicted = df1.iloc[:, user_column].to_numpy()[not_listened]
    candidates = list(zip(df.index[not_listened].tolist(), predicted))

    # Highest prediction first; sorted() is stable, so ties keep table order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    # song_id -> title; an id without a known title is printed as-is.
    song_titles = dict(
        zip(df_unique_songs['song_id'], df_unique_songs['title']))

    print('The list of the Recommended songs \n')
    for rank, (song_id, predicted_count) in enumerate(
            ranked[:num_recommended_songs], start=1):
        print('{}: {} - predicted count:{}'.format(
            rank, song_titles.get(song_id, song_id), predicted_count))


def song_recommender(user, num_neighbors, num_recommendation):
    """Predict listen counts for songs ``user`` has not heard, then print.

    For every song with an observed count of 0 for ``user`` the prediction
    is the similarity-weighted average of the user's counts on that song's
    nearest neighbours (cosine distance between song rows of ``df``),
    using only neighbours the user actually listened to. Predictions are
    written into the module-level ``df1``; ``recommend_songs`` then prints
    the result.

    Parameters:
        user: column label of the user in ``df``.
        num_neighbors: neighbours requested per song (the song itself is
            normally one of them and is discarded).
        num_recommendation: number of recommendations to print.
    """
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(df.values)
    distances, indices = knn.kneighbors(df.values,
                                        n_neighbors=num_neighbors)

    user_index = df.columns.tolist().index(user)
    # The user's observed counts, read once instead of cell by cell.
    user_counts = df.iloc[:, user_index].to_numpy()

    for row, own_count in enumerate(user_counts):
        if own_count != 0:
            # Already listened to: nothing to predict.
            continue

        neighbours = indices[row].tolist()
        neighbour_distances = distances[row].tolist()

        if row in neighbours:
            # Discard the song itself from its own neighbour list.
            position = neighbours.index(row)
            del neighbours[position]
            del neighbour_distances[position]
        else:
            # The song is not among its own neighbours (possible with
            # ties): drop the farthest one to keep num_neighbors - 1.
            neighbours = neighbours[:num_neighbors - 1]
            neighbour_distances = neighbour_distances[:num_neighbors - 1]

        weighted_sum = 0
        similarity_sum = 0
        for neighbour, distance in zip(neighbours, neighbour_distances):
            count = user_counts[neighbour]
            if count == 0:
                # Neighbours the user never played carry no information.
                continue
            similarity = 1 - distance
            weighted_sum = weighted_sum + similarity * count
            similarity_sum = similarity_sum + similarity

        if similarity_sum > 0:
            predicted_r = weighted_sum / similarity_sum
        else:
            predicted_r = 0

        df1.iloc[row, user_index] = predicted_r
    recommend_songs(user, num_recommendation)
# Predict for one user with 3 neighbours per song and print 4 recommendations.
song_recommender(user='0012bf75d43a724f62dc746d9e85ae0088a3a1d6',
                 num_neighbors=3,
                 num_recommendation=4)
