In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


## 1 Collaborative Filtering

##### Use this user-item matrix to:

##### A. Recommend 10 songs to users who have listened to 'u2' and 'pink floyd'. Use item-item collaborative filtering to find songs that are similar using spatial distance with cosine.

In [2]:
# Load the user-item matrix; the 'user' column becomes the row index,
# so every remaining column is an item.
radio_songs = pd.read_csv(filepath_or_buffer='radio_songs.csv', index_col='user')
radio_songs.shape

(100, 284)

In [3]:
# Peek at 3 randomly chosen rows; no random_state is passed, so the rows shown differ between runs
radio_songs.sample(3)

Unnamed: 0_level_0,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,alicia keys,all that remains,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1589,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
############ REFERENCE
# Hand-written item-item cosine similarity, kept as a readable reference for
# the scipy pdist/squareform shortcut.
#
# Fixes relative to the earlier loop version:
#   * stores cosine *similarity* (u.v / (|u||v|)) instead of 1 - similarity,
#     which is the cosine distance;
#   * covers every item column ('user' is already the index of radio_songs,
#     so skipping column 0 silently dropped the first item);
#   * fills the diagonal and produces a numeric (float) frame rather than an
#     object frame with NaN on the diagonal.

# Create a user-item matrix dataframe (rows = users, columns = items)
user_item_df = radio_songs.copy()
item_values = user_item_df.to_numpy(dtype=float)

# Dot product of every pair of item columns; the diagonal holds each item's squared norm
dot_products = item_values.T @ item_values
item_norms = np.sqrt(np.diag(dot_products))
norm_products = np.outer(item_norms, item_norms)

# Items with a zero norm (no listeners) get similarity 0 instead of a division by zero
similarity_values = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products),
    where=norm_products > 0,
)

# Compute the item-item cosine similarity matrix
item_sim_matrix = pd.DataFrame(
    similarity_values,
    index=user_item_df.columns,
    columns=user_item_df.columns,
)


We can use the <b>pdist and squareform functions</b> from `scipy.spatial.distance` to compute the cosine distance between all pairs of items in the user-item matrix in one go; subtracting that distance from 1 gives the item-item similarity matrix.

In [8]:
from scipy.spatial.distance import pdist, squareform

# pdist compares rows, so transpose to get one item per row
item_user_matrix = radio_songs.T

# pdist returns the cosine *distance*; 1 - distance is the cosine similarity
item_similarity_matrix = 1 - squareform(pdist(item_user_matrix, metric='cosine'))

# Convert the similarity matrix to a DataFrame labelled by item on both axes
item_similarity_df = pd.DataFrame(item_similarity_matrix, columns=radio_songs.columns, index=radio_songs.columns)

# Similarity of every item to each of the two seed items
u2_pinkfloyd = item_similarity_df[['u2','pink floyd']]

# Keep only items with a positive similarity to at least one of 'u2' / 'pink floyd'
filtered_df = u2_pinkfloyd.loc[(u2_pinkfloyd['u2'] > 0) | (u2_pinkfloyd['pink floyd'] > 0)].copy()

# Create a new column that sums the values of 'u2' and 'pink floyd' using .loc to avoid warning message
filtered_df.loc[:, 'combined_score'] = filtered_df['u2'] + filtered_df['pink floyd']

# Sort the DataFrame by the 'combined_score' column in descending order, excluding 'u2' and 'pink floyd'
sorted_df = filtered_df.drop(['u2', 'pink floyd']).sort_values(by='combined_score', ascending=False)

# Create a boolean mask for entries where both the 'u2' and 'pink floyd' are not zero
both_non_zero_mask = (sorted_df['u2'] != 0) & (sorted_df['pink floyd'] != 0)

# Create a boolean mask for entries where at least one of the 'u2' or 'pink floyd' is zero
at_least_one_zero_mask = ~both_non_zero_mask

# Rank items similar to BOTH seeds ahead of items similar to only one of them;
# each group is ordered by 'combined_score', descending.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sorted_df = pd.concat([
    sorted_df.loc[both_non_zero_mask].sort_values(by='combined_score', ascending=False),
    sorted_df.loc[at_least_one_zero_mask].sort_values(by='combined_score', ascending=False),
])


# Get the top 10 artists in that ranking ('u2' and 'pink floyd' were dropped above)
top_artists = sorted_df.index[:10].tolist()

# Print the top 10 artists
print(f"The recommended top 10 songs for users who have listened to 'u2' and 'pink floyd' :\n")
print(top_artists)


The recommended top 10 songs for users who have listened to 'u2' and 'pink floyd' :

['robbie williams', 'johnny cash', 'audioslave', 'foo fighters', 'pearl jam', 'incubus', 'genesis', 'misfits', 'green day', 'depeche mode']


In [9]:
# Show the first 10 rows of the ranked table: similarity to each seed item plus the combined score
sorted_df.head(10)

Unnamed: 0,u2,pink floyd,combined_score
robbie williams,0.5,0.288675,0.788675
johnny cash,0.353553,0.204124,0.557678
audioslave,0.223607,0.258199,0.481806
foo fighters,0.301511,0.174078,0.475589
pearl jam,0.204124,0.235702,0.439826
incubus,0.144338,0.166667,0.311004
genesis,0.0,0.57735,0.57735
misfits,0.5,0.0,0.5
green day,0.433013,0.0,0.433013
depeche mode,0.408248,0.0,0.408248


##### Based on combined scores

['robbie williams', 'genesis', 'johnny cash', 'misfits', 'audioslave', 'foo fighters', 'pearl jam', 'green day', 'depeche mode', 'hans zimmer']


                           u2	        pink floyd	 combined_score
    robbie williams	    0.500000	  0.288675	   0.788675
    genesis	            0.000000	  0.577350	   0.577350
    johnny cash	        0.353553	  0.204124	   0.557678
    misfits	            0.500000	  0.000000	   0.500000
    audioslave	         0.223607	  0.258199	   0.481806
    foo fighters	       0.301511	  0.174078	   0.475589
    pearl jam	          0.204124	  0.235702	   0.439826
    green day	          0.433013	  0.000000	   0.433013
    depeche mode	       0.408248	  0.000000	   0.408248
    hans zimmer	        0.000000	  0.408248	   0.408248

##### B. Find user most similar to user 1606. Use user-user collaborative filtering with cosine similarity. List the recommended songs for user 1606 (Hint: find the songs listened to by the most similar user).



In [10]:
# Pairwise cosine distance between user rows, expanded from pdist's condensed
# form into a square matrix, then flipped into a similarity (1 - distance)
user_distance_matrix = squareform(pdist(radio_songs, metric='cosine'))
user_similarity_matrix = 1 - user_distance_matrix

# Label both axes of the similarity matrix with the user ids
user_similarity_df = pd.DataFrame(
    user_similarity_matrix,
    index=radio_songs.index,
    columns=radio_songs.index,
)

# Similarities of every other user to user 1606 (the user's own entry is removed
# so it cannot be picked as its own nearest neighbour)
similarity_to_1606 = user_similarity_df[1606].drop(1606)
most_similar_user = similarity_to_1606.idxmax()

print(f"The user most similar to user 1606 is user {most_similar_user}.")


The user most similar to user 1606 is user 1144.


In [11]:
# Items marked 1 in the most similar user's row
similar_user_row = radio_songs.loc[most_similar_user]
most_similar_user_songs = similar_user_row[similar_user_row == 1].index

# Items marked 0 in the target user's (1606) row
target_user_row = radio_songs.loc[1606]
target_user_unheard_songs = target_user_row[target_user_row == 0].index

# Recommend what the similar user has and the target user does not
recommended_index = most_similar_user_songs.intersection(target_user_unheard_songs)
recommended_songs = recommended_index.tolist()

print(f"The recommended songs for user 1606 based on most similar user {most_similar_user} are :\n")
print(recommended_songs)


The recommended songs for user 1606 based on most similar user 1144 are :

['beastie boys', 'bob dylan', 'bob marley & the wailers', 'david bowie', 'eric clapton', 'johnny cash', 'pearl jam', 'pink floyd', 'the doors', 'the rolling stones', 'tom waits']


##### C. How many of the recommended songs have already been listened to by user 1606?

In [12]:
# Songs of the most similar user that were NOT kept as recommendations are the
# ones user 1606 was not marked as "unheard" for.
already_listened_count = len(most_similar_user_songs) - len(recommended_songs)
print(f"The number of recommended songs already listened to by user 1606 is : {already_listened_count} \n\nThey are :")

# Symmetric difference of the two collections; recommended_songs was built as a
# subset of most_similar_user_songs, so this leaves exactly the filtered-out songs.
already_listened_songs = set(most_similar_user_songs).symmetric_difference(set(recommended_songs))
print(already_listened_songs)

The number of recommended songs already listened to by user 1606 is : 2 

They are :
{'elvis presley', 'the beatles'}


##### D. Use a combined user-item approach to build a recommendation score for each song for each user, using the following steps for each user:

This code iterates over each user and each song. For each song, it takes the 10 most similar songs and their similarity scores, then checks which of those 10 songs the user has listened to (the "purchase history" in the code). The recommendation score for the song is the sum of the similarity scores of the neighbours the user has listened to, divided by the sum of all 10 similarity scores; it is 0 if the user has listened to none of them. Finally, the score is stored in the recommendation scores DataFrame for that user and song.

To get the top 5 song recommendations for user 1606, we simply select the row for user 1606 in the recommendation scores DataFrame and take the 5 songs with the highest recommendation scores. These are the songs most likely to be of interest to user 1606 based on their listening history and the item-item similarities. Note that songs the user has already listened to are not excluded from this ranking.

In [13]:
# Compute the pairwise cosine similarity between items (pdist returns the
# cosine distance, hence the 1 - ...)
item_similarity_matrix = 1 - squareform(pdist(radio_songs.T, metric='cosine'))

# Convert the similarity matrix to a DataFrame
item_similarity_df = pd.DataFrame(item_similarity_matrix, columns=radio_songs.columns, index=radio_songs.columns)

# The 10 most similar songs of a song, and the sum of their similarities, are
# the same for every user, so compute them once per song instead of once per
# (user, song) pair.
neighbours_by_song = {}
for song in radio_songs.columns:
    top_10_similar_songs = item_similarity_df[song].drop(song).nlargest(10)
    neighbours_by_song[song] = (top_10_similar_songs, sum(top_10_similar_songs))

# Build the score table row by row in plain lists and create the DataFrame once
# at the end; this avoids one .loc write per (user, song) cell.
score_rows = []
for user in radio_songs.index:
    user_history = radio_songs.loc[user]
    user_scores = []
    for song in radio_songs.columns:
        top_10_similar_songs, similarity_total = neighbours_by_song[song]

        # Similarity scores of the neighbours this user has listened to.
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        purchase_history = [
            similarity_score
            for similar_song, similarity_score in top_10_similar_songs.items()
            if user_history.loc[similar_song] == 1
        ]

        # Share of the neighbours' total similarity covered by the user's history
        if len(purchase_history) > 0:
            recommendation_score = sum(purchase_history) / similarity_total
        else:
            recommendation_score = 0

        user_scores.append(recommendation_score)
    score_rows.append(user_scores)

# One row per user, one column per song
recommendation_scores_df = pd.DataFrame(score_rows, index=radio_songs.index, columns=radio_songs.columns)

# Convert the recommendation scores DataFrame to a numeric data type
recommendation_scores_df = recommendation_scores_df.apply(pd.to_numeric)

# Print the top 5 song recommendations for user 1606
top_recommendations = recommendation_scores_df.loc[1606].nlargest(5)
print(f"The top 5 song recommendations for user 1606 are:")
for song, score in top_recommendations.items():
    print(f"{song} (score: {score})")

The top 5 song recommendations for user 1606 are:
elvis presley (score: 0.2893278354309895)
abba (score: 0.23902308185961815)
eric clapton (score: 0.20274011674755033)
frank sinatra (score: 0.2011393381145825)
howard shore (score: 0.17174865637166106)
