In [44]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Sample Data (replace this with your actual data)
user_item_matrix = pd.DataFrame({
    'user1': [1, 0, 3, 0, 0],
    'user2': [0, 2, 0, 4, 5],
    'user3': [5, 0, 0, 0, 1],
    'user4': [0, 0, 0, 3, 0],
    'user5': [2, 0, 1, 0, 0]
}, index=['item1', 'item2', 'item3', 'item4', 'item5'])

# Replace NaN with 0 for simplicity (you may handle missing values more appropriately)
user_item_matrix = user_item_matrix.fillna(0)

# Function to get similar users
def get_similar_users(user_item_matrix, user_id, k=5):
    # Use cosine similarity to find similar users
    similarities = cosine_similarity(user_item_matrix.T)  # Transpose the matrix
    
    # Get the index of the target user
    user_index = user_item_matrix.columns.get_loc(user_id)
    
    # Get the top k similar users (excluding the target user itself)
    similar_users = NearestNeighbors(n_neighbors=min(k+1, len(user_item_matrix.columns)), metric='cosine')
    similar_users.fit(similarities)
    
    distances, indices = similar_users.kneighbors(similarities[:, user_index].reshape(1, -1))
    
    # Exclude the first element, as it is the target user itself
    similar_user_indices = indices[0][1:]
    
    return user_item_matrix.columns[similar_user_indices]

# Function to recommend items to a user based on similar users
def recommend_items(user_item_matrix, user_id, similar_users):
    # Find items that the similar users liked but the target user hasn't interacted with
    user_interactions = user_item_matrix[user_id]
    similar_users_interactions = user_item_matrix[similar_users].sum(axis=1)
    
    # Filter out items already interacted with by the target user
    recommendations = similar_users_interactions[user_interactions == 0]
    
    # Sort the recommendations by interaction strength
    recommendations = recommendations.sort_values(ascending=False)
    
    return recommendations

# Example usage
target_user_id = 'user1'  # Replace with the actual user ID
similar_users = get_similar_users(user_item_matrix, target_user_id)
recommendations = recommend_items(user_item_matrix, target_user_id, similar_users)

print(f"Recommendations for User ID {target_user_id}:\n{recommendations}")


Recommendations for User ID user1:
item4    7
item5    6
item2    2
dtype: int64


In [45]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
#from skimage import io
from sklearn.preprocessing import OneHotEncoder

In [46]:
df=pd.read_csv('1M_unique_processed_data.csv')

In [47]:
df.drop_duplicates(inplace=True)
df.drop_duplicates(subset='track_uri',inplace=True)
df.shape

(66243, 31)

In [48]:
# List of columns to drop
columns_to_drop = ['name', 'num_holdouts', 'pid', 'num_tracks', 'num_samples', 'pos', 'time_signature']

# Drop the specified columns
df.drop(columns=columns_to_drop, inplace=True)
# Dropping rows with missing values in specified columns
columns_to_keep = ['artist_name', 'track_name', 'artist_pop', 'popularity','release_date']
df = df.dropna(subset=columns_to_keep)

# Selecting only the columns of interest
df = df[columns_to_keep]


In [49]:
df['release_date']=pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = df['release_date'].dt.year
df.dropna(subset=['year'], inplace=True)

df.drop(columns=['release_date'], inplace=True)

In [50]:
df = df.sample(frac=0.3, random_state=42)  # Adjust fraction size as needed

# Display the shape of the sampled dataset
print(df.shape)

(17476, 5)


In [51]:
df.isnull().sum()

artist_name    0
track_name     0
artist_pop     0
popularity     0
year           0
dtype: int64

In [52]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# One-hot encode the 'artist_name' column
encoder = OneHotEncoder(sparse=False)
artist_encoded = encoder.fit_transform(df[['artist_name']])

# Concatenate the one-hot encoded array with the 'popularity' column
popularity_column = df[['popularity']].values
year_column = df[['year']].values

X = np.concatenate([artist_encoded, popularity_column,year_column], axis=1)


# Initialize variables to store best parameters and score
best_params = None
best_score = -1  # Set initial best score to a low value

# Iterate through parameter combinations
for n_clusters in [4]:
    for init in ['k-means++', 'random']:
        for max_iter in [300, 500, 800]:
            # Create KMeans model with current parameters
            kmeans = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, random_state=42)

            # Fit the model to the data
            kmeans.fit(X)

            # Calculate silhouette score
            score = silhouette_score(X, kmeans.labels_)

            # Update best parameters and score if necessary
            if score > best_score:
                best_params = {'n_clusters': n_clusters, 'init': init, 'max_iter': max_iter}
                best_score = score

print("Best Parameters:", best_params)
print("Best Silhouette Score:", best_score)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Best Parameters: {'n_clusters': 4, 'init': 'k-means++', 'max_iter': 300}
Best Silhouette Score: 0.5272259741760454


In [53]:
# Get the best parameters
best_params 

# Initialize KMeans with the best parameters
best_kmeans = KMeans(n_clusters=best_params['n_clusters'], 
                     init=best_params['init'], 
                     max_iter=best_params['max_iter'], 
                     random_state=42)

# Fit the best KMeans model to the data
best_kmeans.fit(X)

# Assign clusters to the data using the best model
df['cluster'] = best_kmeans.labels_


  super()._check_params_vs_input(X, default_n_init=10)


In [54]:
# Spotify API credentials
client_id = 'd4eec2244fb2416ebce8ec488b1f9587'
client_secret = '0cf3c26eca6244e796a10a847dc11f17'

auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [55]:
# Extract track ID from Spotify URL
spotify_url = "https://open.spotify.com/track/2aPTvyE09vUCRwVvj0I8WK?si=7b06b8f1012041ce"
track_id = spotify_url.split('/')[-1].split('?')[0]


In [56]:
# Get the track information
track_info = sp.track(track_id)

# Extract track popularity
track_popularity = track_info['popularity']

# Get the artist information
artist_id = track_info['artists'][0]['id']
artist_info = sp.artist(artist_id)

# Extract artist popularity
artist_popularity = artist_info['popularity']

release_date = track_info['album']['release_date']
release_year = int(release_date.split('-')[0])


In [61]:
# Concatenate the selected features of the given track
track_features = np.concatenate([encoder.transform([[artist_info['name']]]), [[track_popularity, release_year]]], axis=1)

# Find the cluster of the given track based on selected features
track_cluster = best_kmeans.predict(track_features)[0]

# Select similar tracks from the same cluster
num_recommendations = 5  # Adjust the number of recommendations
similar_tracks = df[df['cluster'] == track_cluster].sample(num_recommendations)

# Show the recommended tracks
recommended_tracks = similar_tracks[['artist_name', 'track_name', 'year']]
print(recommended_tracks)


                artist_name            track_name    year
285991              Tantric          Down And Out  2008.0
272799      No Te Va Gustar                 Clara  2002.0
152514  Christopher Jackson         One Last Time  2015.0
127687       Arctic Monkeys         Dancing Shoes  2006.0
84204       Whitney Houston  You Light Up My Life  2002.0




In [64]:
def get_recommendations(track_url):
    # Extract the track ID from the URL
    track_id = track_url.split('/')[-1].split('?')[0]

    # Get the track information
    track_info = sp.track(track_id)

    # Extract the artist name
    artist_name = track_info['artists'][0]['name']

    # Filter the dataset for tracks by the extracted artist
    artist_tracks = df[df['artist_name'] == artist_name]

    # Get recommendations based on the artist's tracks
    num_recommendations = 3  # Adjust the number as needed
    recommendations = artist_tracks.sample(num_recommendations)

    # If 'release_date' is available, add 'release_year'; otherwise, set 'release_year' to NaN
    if 'release_date' in recommendations.columns:
        recommendations['release_date'] = recommendations['release_date'].astype(str)
        recommendations['release_year'] = recommendations['release_date'].str.split('-', expand=True)[0].astype(float)
    else:
        recommendations['release_year'] = float('nan')

    return recommendations[['artist_name', 'track_name', 'release_year']]

# Example usage
spotify_track_url = 'https://open.spotify.com/track/2aPTvyE09vUCRwVvj0I8WK?si=7b06b8f1012041ce'
recommendations = get_recommendations(spotify_track_url)
print(recommendations)


      artist_name                          track_name  release_year
29365  A$AP Rocky                            Pharsyde           NaN
30054  A$AP Rocky                       Better Things           NaN
29425  A$AP Rocky  Lord Pretty Flacko Jodye 2 (LPFJ2)           NaN


In [None]:
import joblib

joblib.dump(best_kmeans, 'Klusters_model.joblib')

# To load the model later
#loaded_model = joblib.load('Klusters_model.joblib')


['Klusters_model.joblib']

In [None]:

# Load the saved KMeans model
loaded_kmeans_model = joblib.load('Klusters_model.joblib')


In [None]:

new_track_features = np.concatenate([encoder.transform([[artist_info['name']]]), [[track_popularity]]], axis=1)

# Predict the cluster of the new track
new_track_cluster = loaded_kmeans_model.predict(new_track_features)[0]




In [None]:
num_recommendations = 5  # Adjust the number of recommendations
similar_tracks_new = df[df['cluster'] == new_track_cluster].sample(num_recommendations)

# Show the recommended tracks for the new track
recommended_tracks_new = similar_tracks_new[['artist_name', 'track_name']]
print(recommended_tracks_new)


          artist_name                                      track_name
212251            Nas                                     Nas Is Like
26380    Travis Scott                                            3500
42887           gnash       i hate u, i love u (feat. olivia o'brien)
232052   Cast Of Rent  Seasons Of Love - From The Motion Picture RENT
93887   Darius Rucker                                   If I Told You


In [None]:
def get_recommendations(track_url, kmeans_model):
    # Extract the track ID from the URL
    track_id = track_url.split('/')[-1].split('?')[0]

    # Get the track information
    track_info = sp.track(track_id)

    # Extract the artist name and track popularity
    artist_name = track_info['artists'][0]['name']
    track_popularity = track_info['popularity']

    # Concatenate the features of the given track
    track_features = np.concatenate([encoder.transform([[artist_name]]), [[track_popularity]]], axis=1)

    # Filter the dataset for tracks by the same artist
    artist_tracks = df[df['artist_name'] == artist_name]

    # Predict the cluster of the given track
    track_cluster = kmeans_model.predict(track_features)[0]

    # Select similar tracks from the same cluster
    num_recommendations = 3  # Adjust the number as needed
    similar_tracks = df[(df['cluster'] == track_cluster) & (df['artist_name'] == artist_name)].sample(num_recommendations)

    return similar_tracks[['artist_name', 'track_name']]


# Example usage
spotify_track_url = 'https://open.spotify.com/track/2aPTvyE09vUCRwVvj0I8WK?si=7b06b8f1012041ce'
recommendations = get_recommendations(spotify_track_url, loaded_kmeans_model)
print(recommendations)


      artist_name        track_name
29368  A$AP Rocky     Electric Body
29461  A$AP Rocky  F**kin' Problems
30003  A$AP Rocky   Ghetto Symphony


