In [62]:
import pandas as pd 
import numpy as np 

# Define the path to the dataset
file_path = '/kaggle/input/-spotify-tracks-dataset/dataset.csv'

# Read the dataset using pandas
spotify_data = pd.read_csv(file_path)

# Display the first few rows of the dataset to verify
print(spotify_data.head())

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [63]:
df = pd.DataFrame(spotify_data)
df.drop("album_name", axis=1, inplace = True)
df.drop("duration_ms", axis=1, inplace = True)
df.drop("explicit", axis=1, inplace = True)
df.drop("danceability", axis=1, inplace = True)
df.drop("energy", axis=1, inplace = True)
df.drop("key", axis=1, inplace = True)
df.drop("loudness", axis=1, inplace = True)
df.drop("mode", axis=1, inplace = True)
df.drop("speechiness", axis=1, inplace = True)
df.drop("acousticness", axis=1, inplace = True)
df.drop("instrumentalness", axis=1, inplace = True)
df.drop("liveness", axis=1, inplace = True)
df.drop("valence", axis=1, inplace = True)
df.drop("tempo", axis=1, inplace = True)
df.drop("time_signature", axis=1, inplace = True)


spotify_data = df



In [64]:
# Identify rows with any missing values
null_rows = spotify_data[spotify_data.isnull().any(axis=1)]

# Display rows with null values
print("\nRows with missing values:")
print(null_rows)

# Remove rows with any empty or missing values
spotify_data_cleaned = spotify_data.dropna()

# Display the shape of the dataset to see the number of rows remaining
print(f"\nDataset shape after removing rows with missing values: {spotify_data_cleaned.shape}")



Rows with missing values:
       Unnamed: 0                track_id artists track_name  popularity  \
65900       65900  1kR4gIb7nGxHPI3D2ifs59     NaN        NaN           0   

      track_genre  
65900       k-pop  

Dataset shape after removing rows with missing values: (113999, 6)


In [65]:
# Identify duplicate rows
duplicate_rows = spotify_data_cleaned[spotify_data_cleaned.duplicated()]

# Display duplicate rows
print("\nDuplicate rows:")
print(duplicate_rows)

# Remove duplicate rows
spotify_data_cleaned = spotify_data_cleaned.drop_duplicates()

# Display the shape of the dataset to see the number of rows remaining after removing duplicates
print(f"\nDataset shape after removing duplicate rows: {spotify_data_cleaned.shape}")



Duplicate rows:
Empty DataFrame
Columns: [Unnamed: 0, track_id, artists, track_name, popularity, track_genre]
Index: []

Dataset shape after removing duplicate rows: (113999, 6)


In [66]:
duplicate_track_ids = df[df.duplicated(['track_id'], keep=False)]

# Display duplicate track_id values
print("Duplicate track_id values:")
print(duplicate_track_ids)

Duplicate track_id values:
        Unnamed: 0                track_id  \
0                0  5SuOikwiRyPMVoIQDJUgSV   
1                1  4qPNDBW1i3p13qLCt0Ki3A   
5                5  01MVOl9KtVTNfFiBU9I7dc   
6                6  6Vc5wAMmXdKIAM7WUoEb7N   
7                7  1EzrEOXmMH3G43AXT1y7pA   
...            ...                     ...   
113572      113572  1saXUvvFlAQaefZUFVmhCn   
113605      113605  1Q5jFp1g2Ns4gBsHRpcqhu   
113617      113617  71dLJx3qHOTQMTvvoE2dmd   
113619      113619  6OG5TBCmuTOuWCzSGsETrE   
113641      113641  7xsirhcgFWOnItsGuBfrv9   

                                           artists  \
0                                      Gen Hoshino   
1                                     Ben Woodward   
5                                     Tyrone Wells   
6             A Great Big World;Christina Aguilera   
7                                       Jason Mraz   
...                                            ...   
113572                   Bethel Music;Paul

In [67]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import normalize


# One-hot encoding for "track_genre"
genre_encoder = OneHotEncoder(sparse=True)
genre_encoded = genre_encoder.fit_transform(spotify_data_cleaned[['track_genre']])

# Label encoding for "artists"
artist_encoder = LabelEncoder()
spotify_data_cleaned['artists_encoded'] = artist_encoder.fit_transform(spotify_data_cleaned['artists'])

# Normalize "popularity"
scaler = MinMaxScaler()
spotify_data_cleaned['popularity'] = scaler.fit_transform(spotify_data_cleaned[['popularity']])

# Combine the numerical features into a single sparse feature matrix
artists_sparse = csr_matrix(spotify_data_cleaned[['artists_encoded']])
popularity_sparse = csr_matrix(spotify_data_cleaned[['popularity']])
features_sparse = hstack([genre_encoded, artists_sparse, popularity_sparse])

# Convert the sparse matrix to a dense matrix
features_dense = features_sparse.toarray()




In [115]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations(track_ids, data, features_dense, num_recommendations=10):
    """
    Get track recommendations based on given track IDs, ensuring a variety of genres in the recommendations.

    Parameters:
    - track_ids (str or list of str): One or more track IDs for which to generate recommendations.
    - data (DataFrame): The DataFrame containing track information and features.
    - features_dense (ndarray): The feature matrix for the tracks.
    - num_recommendations (int): Number of recommendations to generate.

    Returns:
    - recommendations (DataFrame): A DataFrame containing recommended tracks.
    """
    # Check if track_ids is a single string (track_id) or a list/array of track_ids
    if isinstance(track_ids, str):  # If track_ids is a single string (track_id)
        track_ids = [track_ids]  # Convert it to a list with one element

    # Initialize a list to store feature vectors of the given track_ids
    track_vectors = []

    # Loop through each track_id in the input list or array
    for track_id in track_ids:
        # Get all indices of the track that matches the track_id
        idxs = data[data['track_id'] == track_id].index
        if len(idxs) == 0:
            print(f"Track with ID {track_id} not found in the dataset.")
            continue
        for idx in idxs:
            # Add the feature vector to the list
            track_vectors.append(features_dense[idx])

    if not track_vectors:
        raise ValueError("No valid track IDs were provided.")

    # Compute the average feature vector
    avg_vector = np.mean(track_vectors, axis=0).reshape(1, -1)

    # Compute cosine similarity for the average vector
    sim_scores = cosine_similarity(avg_vector, features_dense).flatten()

    # Get the indices of the most similar tracks
    nearest_neighbors = np.argsort(-sim_scores)[:num_recommendations + len(track_ids)]  # Include input tracks
    nearest_neighbors = nearest_neighbors[np.isin(nearest_neighbors, data.index.values)]

    # Filter recommendations by ensuring a variety of genres
    genre_counts = data.iloc[nearest_neighbors]['track_genre'].value_counts()
    filtered_indices = []
    for idx in nearest_neighbors:
        if len(filtered_indices) >= num_recommendations:
            break
        if data.loc[idx, 'track_genre'] not in genre_counts or genre_counts[data.loc[idx, 'track_genre']] > 1:
            filtered_indices.append(idx)
            genre_counts[data.loc[idx, 'track_genre']] -= 1

    # Convert recommended indices set to a list
    recommended_indices = list(filtered_indices)

    # Create DataFrame of recommendations from the recommended indices
    recommendations = data.iloc[recommended_indices].reset_index(drop=True)

    # Drop duplicates based on track_id to avoid recommending the same track_id itself
    recommendations.drop_duplicates(subset=['track_id'], inplace=True)

    # Map the encoded artist labels back to the original artist names
    if 'artists_encoded' in data.columns:
        recommendations['artists'] = artist_encoder.inverse_transform(recommendations['artists_encoded'])

    return recommendations


In [114]:
# Example usage: Get recommendations for an array of track_ids
track_ids = []
track_ids = []

recommendations = get_recommendations(track_ids)


print("Recommendations:")
print(recommendations[['track_name','track_id', 'artists', 'track_genre', 'popularity']])


Recommendations:
                                    track_name                track_id  \
0                        Spies Are Watching Me  1UIqpCB0b56K7U0JJPfskN   
1                        More Mess on My Thing  3HgPOHHJWnMfv1Zw27Jh2V   
2                                  Ikeja Roads  6K8y9WcUbgv0j7RGceHvym   
3                                 Safari Strut  1lDELORqnHLLTe1bYmHLq7   
4                                   Move on Up  1W6txMoK3kxBkc5IttrwiA   
5                                     Freckles  1cGpjqxs56wNKE6LOBnUcD   
6                                Fantastic Man  1ETZhP9orTkDclKEyt0xqm   
7                                 Every Season  6HrYi2gwKxrwEtZX3Orype   
8                                   T.I.B.W.F.  1Bx3trnFdCStyiV5PF0vkj   
9  Atomic Bomb - William Onyeabor vs. Hot Chip  6G2M6vvToPz12ghZpeJSGe   

                      artists track_genre  popularity  
0            Voilaaa;Sir Jean    afrobeat        0.48  
1         The Poets Of Rhythm    afrobeat        0.57  
