# Recommendation System: Collaborative Filtering

In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve

## 1.1 Load The Needed Variables Here

In [None]:
tracks_df = pd.read_csv('tracks_df.csv')
interaction_df = pd.read_csv('interaction_df.csv')
interaction_matrix = sparse.load_npz("interaction_matrix.npz")

In [None]:
train_matrix = sparse.load_npz('/home/jovyan/Spotify/StoredVariables/train_matrix.npz')
test_matrix = sparse.load_npz('/home/jovyan/Spotify/StoredVariables/test_matrix.npz')
#altered_data = np.load('altered_users.npz')

In [3]:
%store interaction_df >store.txt

Writing 'interaction_df' (DataFrame) to file 'store.txt'.


In [7]:
playlists = list(np.sort(interaction_df['playlist_id'].unique()))
tracks = list(np.sort(interaction_df['track_uri'].unique()))

## 1.2 Creating Interaction Dataframe. 

In [None]:
# Map IDs to integer indices
playlist_to_idx = {playlist: i for i, playlist in enumerate(playlists)}
track_to_idx = {track: i for i, track in enumerate(tracks)}

# Map the interaction DataFrame values to integer indices
interaction_df['playlist_idx'] = interaction_df['playlist_id'].map(playlist_to_idx)
interaction_df['track_idx'] = interaction_df['track_uri'].map(track_to_idx)

In [19]:
interaction_df

Unnamed: 0,playlist_id,track_uri,rating,playlist_idx,track_idx
0,981000,spotify:track:7gKIt3rDGIMJDFVSPBnGmj,1,981000,2170204
1,981000,spotify:track:2eAAEa8pxKF7My0EO4rFgR,1,981000,771276
2,981000,spotify:track:5rnFOEEIBIWZ6dhHrY6zHh,1,981000,1705086
3,981000,spotify:track:7fwXWKdDNI5IutOMc5OKYw,1,981000,2168458
4,981000,spotify:track:19yIQRLAYMNxmEfdnnQDsS,1,981000,338658
...,...,...,...,...,...
66346423,930999,spotify:track:0O45fw2L5vsWpdsOdXwNAR,1,930999,113251
66346424,930999,spotify:track:08zJpaUQVi9FrKv2e32Bah,1,930999,42084
66346425,930999,spotify:track:7vx3CJaBaNdvkfz4lpj3IE,1,930999,2243141
66346426,930999,spotify:track:3wu9ADop1FXdhToPCxwBL8,1,930999,1148930


# Create Sparse Matrix

In [8]:
# Get the interaction values
rows = interaction_df['playlist_idx']
cols = interaction_df['track_idx']
ratings = interaction_df['rating']

# Create sparse matrix
interaction_matrix = sparse.csr_matrix((ratings, (rows, cols)), shape=(len(playlists), len(tracks)))

# Check the shape of the matrix
print(interaction_matrix.shape)

(1000000, 2262292)


In [9]:
interaction_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 65464776 stored elements and shape (1000000, 2262292)>

In [10]:
sparse.save_npz("interaction_matrix.npz", interaction_matrix)

# Check The Sparsity Of Matrix

In [16]:

matrix_size = interaction_matrix.shape[0] * interaction_matrix.shape[1]
# Count non-zero entries (actual interactions)
num_interactions = interaction_matrix.nnz
# Calculate sparsity percentage
sparsity = 100 * (1 - (num_interactions / matrix_size))

print(sparsity)


99.99710626320564


## Train/ Test Data Create
#####
    This function will take in the original user-item matrix and "mask" a percentage of the original ratings where a
    user-item interaction has taken place for use as a test set. The test set will contain all of the original ratings, 
    while the training set replaces the specified percentage of them with a zero in the original ratings matrix. 
    
    parameters: 
    
    ratings - the original ratings matrix from which you want to generate a train/test set. Test is just a complete
    copy of the original set. This is in the form of a sparse csr_matrix. 
    
    pct_test - The percentage of user-item interactions where an interaction took place that you want to mask in the 
    training set for later comparison to the test set, which contains all of the original ratings. 
    
    returns:
    
    training_set - The altered version of the original data with a certain percentage of the user-item pairs 
    that originally had interaction set back to zero.
    
    test_set - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
    compares with the actual interactions.
    
    user_inds - From the randomly selected user-item indices, which user rows were altered in the training data.
    This will be necessary later when evaluating the performance via AUC.
    '''

In [17]:
import random
def make_train(ratings, pct_test = 0.2):
    
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(0) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique

**Notes:** altered_users is the list of unique playlist IDs which have some of their interaction hidden.

In [18]:
train_matrix, test_matrix, altered_users = make_train(interaction_matrix, pct_test=0.2)

# Speed Up ALS Algorithm.
**Explaination:** Since our last is very slow becausse it has to make over million iterations, we can use Cython and parallelization to speed up

In [3]:
import implicit
from implicit.als import AlternatingLeastSquares

**Note:** Why Implcit library ALS model is fast:
* Optimized with Cython: The implicit library uses Cython, which compiles Python code into C, making it much faster.
* Parallel Processing: Paralellize the computation across different computer
* Efficient Memory Management: The library handles memory more efficiently, reducing overhead.
* Algorithmic Optimizations: Includes various optimizations that reduce the number of operations needed for each iteration.

In [8]:
factors = 20 #number of latent factors
regularization = 0.1
iterations = 50
alpha = 15

# Initialize the ALS model
model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)

In [30]:
sparse.save_npz('train_matrix.npz', train_matrix)
sparse.save_npz('test_matrix.npz', test_matrix)

np.savez('altered_users.npz', altered_users=altered_users)

# Training Process
**Explaination:** During this training process, the model will begin the to the matrix factorization on the train_matrix( matrix which some of interactions are being hidden 

By using GPU to train the model, we save up so much time.

In [9]:
%%time
#Fit the model
model.fit(train_matrix)

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 29min 59s, sys: 3h 57min 52s, total: 4h 27min 52s
Wall time: 8min 57s


In [11]:
import pickle
with open("trained_model_CF.pkl", "wb") as f:
    pickle.dump(model, f)

In [8]:
#Load the model
import pickle
with open("/home/jovyan/Spotify/trained_model_CF.pkl", "rb") as f:
    model = pickle.load(f)

In [13]:
user_vecs = model.user_factors
item_vecs = model.item_factors

save the variables

In [17]:
import numpy as np
np.save('user_vecs.npy', user_vecs)
np.save('item_vecs.npy', item_vecs)

In [18]:
# retreueve variables 
user_vecs = np.load('/home/jovyan/Spotify/StoredVariables/user_vecs.npy')
item_vecs = np.load('/home/jovyan/Spotify/StoredVariables/item_vecs.npy')

In [19]:
print('playlists-latent vector dimension:', user_vecs.shape)
print('tracks-latent vector dimension:', item_vecs.shape)

playlists-latent vector dimension: (1000000, 20)
tracks-latent vector dimension: (2262292, 20)


**Note:** These vector represent the user-latent matrix abd item-latent matrix, there are k=20 number of letent factors in these matrix.

In [None]:
np.dot(user_vecs , item_vecs.T)

In [27]:
user_vecs

array([[ 1.3618277e-03, -3.9617908e-03,  9.9558837e-04, ...,
         6.6140518e-03,  6.4980127e-03, -1.2205641e-03],
       [ 1.2362348e-03,  2.0782896e-03, -6.9503947e-03, ...,
        -1.5989727e-03,  4.5161657e-03, -2.3391312e-03],
       [ 4.2069903e-05,  1.9816402e-05, -1.6046653e-05, ...,
        -1.1606354e-05,  2.7722837e-05, -1.8056815e-05],
       ...,
       [ 1.4189853e-03, -1.5575235e-03, -1.1973886e-03, ...,
        -1.7288451e-03, -2.5112028e-04,  3.4988173e-03],
       [ 8.0287813e-05, -3.1563719e-05,  4.2934629e-05, ...,
        -1.8698003e-05,  1.7525385e-06,  1.3175889e-05],
       [-9.2407361e-05, -1.3404517e-05, -1.3457901e-04, ...,
        -1.8812701e-04, -1.3406626e-04,  2.7947837e-05]], dtype=float32)

In [31]:
item_vecs

array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 4.2019007e-04, -3.0564028e-04,  4.6686901e-04, ...,
        -2.3443170e-04,  1.3594792e-04, -2.0024566e-04],
       [-4.6968160e-04, -2.8554298e-04,  6.6290231e-04, ...,
        -3.6986901e-05, -3.6965561e-04,  1.0054124e-04],
       ...,
       [ 2.9947736e-07,  5.1935382e-07,  1.1191055e-06, ...,
        -7.9690602e-07,  3.3849912e-07, -2.3909674e-06],
       [-5.7167224e-05, -4.7674027e-04, -7.6377182e-05, ...,
        -2.6562830e-04, -3.1905764e-04,  3.4172021e-04],
       [ 4.3145070e-04,  5.0962047e-04, -1.8311673e-04, ...,
         4.4828854e-04, -2.9734234e-04,  4.6400647e-04]], dtype=float32)

# Evaluating Recommeder System 


In [28]:
user_vecs_csr = sparse.csr_matrix(user_vecs)
item_vecs_csr = sparse.csr_matrix(item_vecs.T) #item_vec.t:(num feature, num item)

In [29]:
# save the variables
sparse.save_npz('user_vecs_csr.npz', user_vecs_csr)
sparse.save_npz('item_vecs_csr.npz', item_vecs_csr)

# Recommendation System Application

In [9]:
playlist_id = 110
recommendations = model.recommend(playlist_id, interaction_matrix[playlist_id], N=6)


In [10]:
# Convert the output to a DataFrame
recommendations_df = pd.DataFrame({
    'track_idx': recommendations[0],
    'score': recommendations[1]
})

recommendations_df

Unnamed: 0,track_idx,score
0,2256731,7.4e-05
1,1873536,7.3e-05
2,520944,7.3e-05
3,1158068,7.2e-05
4,662809,7.2e-05
5,1046469,6.1e-05


In [12]:
def cf_recommendations(playlist_id, model, interaction_matrix, interaction_df, tracks_df, N=5):
    # Get recommendations from the model
    recommendations = model.recommend(playlist_id, interaction_matrix[playlist_id], N=N)
    
    # Convert the output to a DataFrame
    recommendations_df = pd.DataFrame({
        'track_idx': recommendations[0],
        'score': recommendations[1]
    })
    
   
    interaction_df['track_idx'] = interaction_df['track_uri'].astype('category').cat.codes
    
    # Create a mapping of track_idx to track_uri
    idx_to_uri = dict(zip(interaction_df['track_idx'], interaction_df['track_uri']))
    
    # Map the track indices to track_uris
    recommendations_df['track_uri'] = recommendations_df['track_idx'].map(idx_to_uri)
    
    # Merge with track_af_dataframe to get detailed track information
    recommended_tracks = recommendations_df.merge(tracks_df, on='track_uri')

    recommended_tracks = recommended_tracks.drop_duplicates(subset=['track_uri'])
    
   
    return recommended_tracks[['track_uri','track_name', 'artist_name', 'album_name', 'score']]

In [15]:
playlist_id = int(input("Enter Playlist ID: "))  # Take user input for playlist ID
recommended_songs_df = cf_recommendations(playlist_id, model, interaction_matrix, interaction_df, tracks_df, N=5)
recommended_songs_df

Enter Playlist ID:  110


Unnamed: 0,track_uri,track_name,artist_name,album_name,score
0,spotify:track:7yq4Qj7cqayVTp3FF9CWbm,Riptide,Vance Joy,Dream Your Life Away,7.4e-05
28448,spotify:track:6RrXd9Hph4hYR4bf3dbM6H,My Girl,The Temptations,The Temptations Sing Smokey,7.3e-05
45566,spotify:track:1mqlc0vEP9mU1kZgTi6LIQ,September,"Earth, Wind & Fire","Now, Then & Forever",7.3e-05
68207,spotify:track:3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,Van Morrison,Blowin' Your Mind!,7.2e-05
88285,spotify:track:2H3ZUSE54pST4ubRd5FzFR,Ain't No Mountain High Enough,Marvin Gaye,United,7.2e-05
