In [None]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
from fuzzywuzzy import process
import streamlit as st



In [3]:
# Load the album dataset from the CSV file that sits next to this notebook
raw_data = pd.read_csv(filepath_or_buffer='rym_top_5000_all_time.csv')

In [4]:
# Preview the first 5 rows to sanity-check the columns and their formats
raw_data.head(n=5)

Unnamed: 0,Ranking,Album,Artist Name,Release Date,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews
0,1.0,OK Computer,Radiohead,16 June 1997,"Alternative Rock, Art Rock","melancholic, anxious, futuristic, alienation, ...",4.23,70382,1531
1,2.0,Wish You Were Here,Pink Floyd,12 September 1975,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, male vo...",4.29,48662,983
2,3.0,In the Court of the Crimson King,King Crimson,10 October 1969,"Progressive Rock, Art Rock","fantasy, epic, progressive, philosophical, com...",4.3,44943,870
3,4.0,Kid A,Radiohead,3 October 2000,"Art Rock, Experimental Rock, Electronic","cold, melancholic, futuristic, atmospheric, an...",4.21,58590,734
4,5.0,To Pimp a Butterfly,Kendrick Lamar,15 March 2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, poetic, protest, concept...",4.27,44206,379


In [None]:
# Summarise column dtypes and non-null counts for the raw dataset.
# 'Descriptors' has missing values (fewer non-null entries than rows), and
# 'Number of Ratings' is stored as object rather than a numeric dtype.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ranking            5000 non-null   float64
 1   Album              5000 non-null   object 
 2   Artist Name        5000 non-null   object 
 3   Release Date       5000 non-null   object 
 4   Genres             5000 non-null   object 
 5   Descriptors        4886 non-null   object 
 6   Average Rating     5000 non-null   float64
 7   Number of Ratings  5000 non-null   object 
 8   Number of Reviews  5000 non-null   int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 351.7+ KB


In [None]:
# Count rows that are exact duplicates of an earlier row (0 means the data has no duplicates)
raw_data.duplicated(keep='first').sum()

np.int64(0)

In [None]:
# Drop every row that contains at least one missing value.
# The explicit copy keeps `cleaned_data` independent of `raw_data`.
rows_without_missing = raw_data.dropna(axis=0, how='any')
cleaned_data = rows_without_missing.copy()
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4886 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ranking            4886 non-null   float64
 1   Album              4886 non-null   object 
 2   Artist Name        4886 non-null   object 
 3   Release Date       4886 non-null   object 
 4   Genres             4886 non-null   object 
 5   Descriptors        4886 non-null   object 
 6   Average Rating     4886 non-null   float64
 7   Number of Ratings  4886 non-null   object 
 8   Number of Reviews  4886 non-null   int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 381.7+ KB


In [None]:
# Create 'Release Year' variable (the last four characters of 'Release Date')
# and drop the columns that are not used as features.
# The frame is rebuilt from `raw_data` in one non-mutating chain, so this cell can be
# re-run safely: dropping columns in place would make a second run fail with a KeyError
# because 'Release Date' would already be gone.
cleaned_data = (
    raw_data
    .dropna()
    .assign(**{
        'Release Year': lambda frame: frame['Release Date'].astype(str).str[-4:].astype(int)
    })
    .drop(columns=['Release Date', 'Number of Ratings', 'Number of Reviews'])
)
cleaned_data.head()

Unnamed: 0,Ranking,Album,Artist Name,Genres,Descriptors,Average Rating,Release Year
0,1.0,OK Computer,Radiohead,"Alternative Rock, Art Rock","melancholic, anxious, futuristic, alienation, ...",4.23,1997
1,2.0,Wish You Were Here,Pink Floyd,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, male vo...",4.29,1975
2,3.0,In the Court of the Crimson King,King Crimson,"Progressive Rock, Art Rock","fantasy, epic, progressive, philosophical, com...",4.3,1969
3,4.0,Kid A,Radiohead,"Art Rock, Experimental Rock, Electronic","cold, melancholic, futuristic, atmospheric, an...",4.21,2000
4,5.0,To Pimp a Butterfly,Kendrick Lamar,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, poetic, protest, concept...",4.27,2015


In [None]:
# Turn the text columns 'Genres' and 'Descriptors' into TF-IDF feature matrices.
# Each column gets its own vectorizer, so each has a separate vocabulary.
initialise_tfidf_genres = TfidfVectorizer()
initialise_tfidf_descriptors = TfidfVectorizer()

genres_tfidf = initialise_tfidf_genres.fit_transform(cleaned_data['Genres'])
descriptors_tfidf = initialise_tfidf_descriptors.fit_transform(cleaned_data['Descriptors'])

# Place the two sparse matrices side by side: genre columns first, then descriptor columns
categorical_features = hstack([genres_tfidf, descriptors_tfidf])

In [None]:
# Select the numerical features and rescale each one to the range [0, 1]
numerical_features = cleaned_data.loc[:, ['Average Rating', 'Release Year']]
scaler = MinMaxScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

In [None]:
# Append the scaled numerical columns to the right of the categorical (TF-IDF) columns
combined_features = hstack([categorical_features, scaled_numerical_features])

# Compute the pairwise cosine similarity between all albums.
# Cosine similarity is preferred here over Euclidean distance (typical for numerical features)
# and the Jaccard index (typical for categorical features) because it suits sparse TF-IDF vectors.
# Row/column i of the result corresponds to the i-th row of `cleaned_data` by position.
# With a single argument, the matrix is compared against itself.
similarity_matrix = cosine_similarity(combined_features)

In [None]:
# ML-based album recommender function based on genre, descriptors, average rating, and release year 
def recommend_albums(album_name, no_recs=5, *, data=None, similarity=None):
    """Recommend the albums most similar to ``album_name``.

    Similarity is read from a precomputed square matrix whose row/column ``i``
    corresponds to the ``i``-th row of the album table *by position*.

    Parameters
    ----------
    album_name : str
        Exact album title to look up in the 'Album' column. If several rows share
        the title, the first one is used.
    no_recs : int, default 5
        Maximum number of recommendations to return. Values below 1 give an empty list.
    data : pandas.DataFrame, optional
        Album table with 'Album', 'Genres' and 'Descriptors' columns.
        Defaults to the notebook-level ``cleaned_data``.
    similarity : array-like of shape (n_albums, n_albums), optional
        Pairwise similarity matrix aligned with the row order of ``data``.
        Defaults to the notebook-level ``similarity_matrix``.

    Returns
    -------
    list of dict
        One dict per recommendation with keys 'Album', 'Genre' and 'Descriptors',
        ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``album_name`` is not present in the 'Album' column.
    """
    if data is None:
        data = cleaned_data
    if similarity is None:
        similarity = similarity_matrix

    # Use the *positional* row number, not the index label: the similarity matrix is
    # positional, while the frame's labels can have gaps (e.g. after rows were dropped).
    matching_positions = np.flatnonzero((data['Album'] == album_name).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f"Album {album_name!r} was not found in the dataset")
    album_pos = int(matching_positions[0])

    # Similarity of the requested album to every album, in row order
    scores = np.asarray(similarity[album_pos]).ravel()

    # Rank by descending similarity; the stable sort keeps row order between ties
    ranked_positions = np.argsort(-scores, kind='stable')

    # Exclude the requested album explicitly rather than assuming it sorts first
    # (another row can tie with it at the maximum similarity).
    ranked_positions = ranked_positions[ranked_positions != album_pos]
    top_positions = ranked_positions[:max(no_recs, 0)]

    # Retrieve the album name, genres and descriptors of the selected rows
    top_rows = data.iloc[top_positions]
    return [
        {'Album': album, 'Genre': genres, 'Descriptors': descriptors}
        for album, genres, descriptors in zip(
            top_rows['Album'], top_rows['Genres'], top_rows['Descriptors']
        )
    ]

In [None]:
# Simpler album recommender function based on genres
def recommend_albums_genre_version(genre, no_recs=5):

    filtered_albums = cleaned_data[cleaned_data['Genres'].str.contains(genre, case=False, na=False)].copy()

    top_albums_by_genre = filtered_albums.sort_values(by="Average Rating", ascending=False).head(no_recs)

    recommended_albums_by_genre = [
        {
            'Album': cleaned_data.iloc[i[0]]['Album'],
            'Genre': cleaned_data.iloc[i[0]]['Genres'],
            'Descriptors': cleaned_data.iloc[i[0]]['Descriptors']
        }
        for i in top_albums_by_genre]