In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression

In [9]:
# Load data
data = pd.read_csv('songsDataset.csv', nrows=1000)

In [10]:
# Preprocess data
data.columns = data.columns.str.strip().str.replace("'", "")

In [11]:
data.head()

Unnamed: 0,userID,songID,rating
0,0,7171,5
1,0,8637,4
2,0,21966,4
3,0,35821,5
4,0,82446,5


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  1000 non-null   int64
 1   songID  1000 non-null   int64
 2   rating  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB


In [13]:
data.isna().sum()

userID    0
songID    0
rating    0
dtype: int64

In [14]:
data.describe()

Unnamed: 0,userID,songID,rating
count,1000.0,1000.0,1000.0
mean,49.5,69346.471,3.382
std,28.880514,39848.765547,1.53115
min,0.0,319.0,1.0
25%,24.75,34827.5,2.0
50%,49.5,70765.5,4.0
75%,74.25,103774.0,5.0
max,99.0,136507.0,5.0


In [15]:
# Create user-item matrix
user_item_matrix = data.pivot_table(index='userID', columns='songID', values='rating').fillna(0)
user_item_matrix.head()

songID,319,578,726,866,911,926,1048,1132,1244,1395,...,135481,135532,135764,135789,135792,136076,136196,136449,136479,136507
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Compute user similarities
user_similarity_matrix = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

In [17]:
# Compute item similarities
item_similarity_matrix = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity_matrix, index=user_item_matrix.columns, columns=user_item_matrix.columns)

In [18]:
# Function to get similar users
def get_user_similarity(target_user, user_similarity_df, top_n=5):
    similar_scores = user_similarity_df[target_user].sort_values(ascending=False)
    similar_users = similar_scores.iloc[1:top_n+1].index.tolist()
    return similar_users


In [19]:
# Function to get similar items
def get_similar_items(songID, item_similarity_df, top_n=5):
    similar_scores = item_similarity_df[songID].sort_values(ascending=False)
    similar_items = similar_scores.iloc[1:top_n+1].index
    return similar_items

In [20]:
# Function to recommend songs using user-based CF
def recommend_songs_ubcf(userID, user_item_matrix, user_similarity_df, num_recs=5):
    similar_users = get_user_similarity(userID, user_similarity_df)
    recommended_songs = []
    for similar_userID in similar_users:
        for songID, rating in user_item_matrix.loc[similar_userID].items():
            if rating > 0:
                recommended_songs.append((songID, rating))
    recommended_songs = sorted(recommended_songs, key=lambda x: x[1], reverse=True)
    return [songID for songID, _ in recommended_songs[:num_recs]]

In [21]:
# Function to recommend songs using item-based CF
def recommend_songs_ibcf(userID, user_item_matrix, item_similarity_df, num_recs=5):
    user_ratings = user_item_matrix.loc[userID]
    recommended_songs = pd.Series(dtype=float)
    for songID, rating in user_ratings.items():
        if rating > 0:
            similar_items = get_similar_items(songID, item_similarity_df)
            for similar_item in similar_items:
                if similar_item in recommended_songs:
                    recommended_songs[similar_item] += rating
                else:
                    recommended_songs[similar_item] = rating
    recommended_songs = recommended_songs.sort_values(ascending=False)
    return recommended_songs.head(num_recs).index

In [22]:
# Function to create the meta-model training data
def create_meta_model_data(user_item_matrix, user_similarity_df, item_similarity_df, top_n=5):
    meta_data = []
    for userID in user_item_matrix.index:
        ubcf_recs = recommend_songs_ubcf(userID, user_item_matrix, user_similarity_df, num_recs=top_n*2)
        ibcf_recs = recommend_songs_ibcf(userID, user_item_matrix, item_similarity_df, num_recs=top_n*2)
        for songID in user_item_matrix.columns:
            ubcf_score = 1 if songID in ubcf_recs else 0
            ibcf_score = 1 if songID in ibcf_recs else 0
            actual_rating = user_item_matrix.at[userID, songID]
            meta_data.append([ubcf_score, ibcf_score, actual_rating])
    return pd.DataFrame(meta_data, columns=['ubcf_score', 'ibcf_score', 'rating'])

In [23]:
# Create the meta-model training data
meta_model_data = create_meta_model_data(user_item_matrix, user_similarity_df, item_similarity_df)

In [24]:
# Train the meta-model
X = meta_model_data[['ubcf_score', 'ibcf_score']]
y = meta_model_data['rating']

In [25]:
meta_model = LinearRegression()

In [26]:
meta_model.fit(X, y)

In [27]:
meta_model.score(X,y)

0.3532470512892748

In [28]:
# Hybrid recommendation function using stacking
def hybrid_recommendations(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model, top_n=5):
    ubcf_recs = recommend_songs_ubcf(target_user, user_item_matrix, user_similarity_df, num_recs=top_n*2)
    ibcf_recs = recommend_songs_ibcf(target_user, user_item_matrix, item_similarity_df, num_recs=top_n*2)

    recommendations = pd.Series(dtype=float)

    for songID in user_item_matrix.columns:
        ubcf_score = 1 if songID in ubcf_recs else 0
        ibcf_score = 1 if songID in ibcf_recs else 0
        hybrid_score = meta_model.predict(pd.DataFrame([[ubcf_score, ibcf_score]], columns=['ubcf_score', 'ibcf_score']))[0]
        recommendations[songID] = hybrid_score

    recommendations = recommendations.sort_values(ascending=False)
    return recommendations.head(top_n).index

In [81]:
# Test the hybrid model
target_user = int(input("Enter user ID: "))
recommended_songs = hybrid_recommendations(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model)
print(f"Recommendations for user {target_user}: {recommended_songs}")


Recommendations for user 5: Index([136507, 319, 578, 726, 866], dtype='int64')


In [None]:
# Recommendations for user 5: Index([95898, 24427, 99702, 98571, 33558], dtype='int64')

In [30]:
# from sklearn.metrics import precision_score, recall_score, f1_score

In [31]:

# # Function to evaluate model performance
# def evaluate_model_performance(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model, top_n=5):
#     # Get actual ratings for target user
#     actual_ratings = user_item_matrix.loc[target_user]
    
#     # Get recommended songs
#     recommended_songs = hybrid_recommendations(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model, top_n)
    
#     # Create a list of actual ratings for recommended songs
#     actual_ratings_recommended = actual_ratings[actual_ratings.index.isin(recommended_songs)]
    
#     # Convert actual ratings to binary (0/1) for classification metrics
#     actual_ratings_binary = (actual_ratings_recommended > 0).astype(int)
    
#     # Create a list of predicted ratings (1/0) for recommended songs
#     predicted_ratings = [1] * len(recommended_songs)
    
#     # Calculate precision, recall, and F1-score
#     precision = precision_score(actual_ratings_binary, predicted_ratings)
#     recall = recall_score(actual_ratings_binary, predicted_ratings)
#     f1 = f1_score(actual_ratings_binary, predicted_ratings)
    
#     return precision, recall, f1

In [32]:
# # Test the model performance
# target_user = int(input("Enter user ID: "))
# precision, recall, f1 = evaluate_model_performance(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model)
# print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

Precision: 0.4000, Recall: 1.0000, F1-score: 0.5714


# Test 2

In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression

In [35]:
# Load data
data = pd.read_csv('songsDataset.csv', nrows=1000)

In [36]:
# Preprocess data
data.columns = data.columns.str.strip().str.replace("'", "")
data.head()
data.info()
data.isna().sum()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  1000 non-null   int64
 1   songID  1000 non-null   int64
 2   rating  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB


Unnamed: 0,userID,songID,rating
count,1000.0,1000.0,1000.0
mean,49.5,69346.471,3.382
std,28.880514,39848.765547,1.53115
min,0.0,319.0,1.0
25%,24.75,34827.5,2.0
50%,49.5,70765.5,4.0
75%,74.25,103774.0,5.0
max,99.0,136507.0,5.0


In [37]:
# Create user-item matrix
user_item_matrix = data.pivot_table(index='userID', columns='songID', values='rating').fillna(0)
user_item_matrix.head()

songID,319,578,726,866,911,926,1048,1132,1244,1395,...,135481,135532,135764,135789,135792,136076,136196,136449,136479,136507
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Compute user similarities
user_similarity_matrix = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

In [39]:
# Compute item similarities
item_similarity_matrix = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity_matrix, index=user_item_matrix.columns, columns=user_item_matrix.columns)

In [40]:
# Function to get similar users
def get_user_similarity(target_user, user_similarity_df, top_n=5):
    similar_scores = user_similarity_df[target_user].sort_values(ascending=False)
    similar_users = similar_scores.iloc[1:top_n+1].index.tolist()
    return similar_users

In [41]:
# Function to get similar items
def get_similar_items(songID, item_similarity_df, top_n=5):
    similar_scores = item_similarity_df[songID].sort_values(ascending=False)
    similar_items = similar_scores.iloc[1:top_n+1].index
    return similar_items

In [42]:
# Function to recommend songs using user-based CF
def recommend_songs_ubcf(userID, user_item_matrix, user_similarity_df, num_recs=5):
    similar_users = get_user_similarity(userID, user_similarity_df)
    recommended_songs = []
    for similar_userID in similar_users:
        for songID, rating in user_item_matrix.loc[similar_userID].items():
            if rating > 0:
                recommended_songs.append((songID, rating))
    recommended_songs = sorted(recommended_songs, key=lambda x: x[1], reverse=True)
    return [songID for songID, _ in recommended_songs[:num_recs]]

In [43]:
# Function to recommend songs using item-based CF
def recommend_songs_ibcf(userID, user_item_matrix, item_similarity_df, num_recs=5):
    user_ratings = user_item_matrix.loc[userID]
    recommended_songs = pd.Series(dtype=float)
    for songID, rating in user_ratings.items():
        if rating > 0:
            similar_items = get_similar_items(songID, item_similarity_df)
            for similar_item in similar_items:
                if similar_item in recommended_songs:
                    recommended_songs[similar_item] += rating
                else:
                    recommended_songs[similar_item] = rating
    recommended_songs = recommended_songs.sort_values(ascending=False)
    return recommended_songs.head(num_recs).index

In [44]:
def create_meta_model_data(user_item_matrix, user_similarity_df, item_similarity_df, top_n=5):
    meta_data = []
    for userID in user_item_matrix.index:
        for songID in user_item_matrix.columns:
            ubcf_score = 1  # default score
            ibcf_score = 1  # default score
            actual_rating = user_item_matrix.at[userID, songID]
            meta_data.append([ubcf_score, ibcf_score, actual_rating])
    print("meta_data shape:", len(meta_data))
    meta_model_data = pd.DataFrame(meta_data, columns=['ubcf_score', 'ibcf_score', 'rating'])
    print("meta_model_data shape:", meta_model_data.shape)
    X = meta_model_data[['ubcf_score', 'ibcf_score']]
    y = meta_model_data['rating']
    print("X shape:", X.shape)
    print("y shape:", y.shape)
    return X, y

In [60]:
X, y = create_meta_model_data(user_item_matrix, user_similarity_df, item_similarity_df)

meta_data shape: 96800
meta_model_data shape: (96800, 3)
X shape: (96800, 2)
y shape: (96800,)


In [65]:
# Create the meta-model training data
meta_model_data = create_meta_model_data(user_item_matrix, user_similarity_df, item_similarity_df)

meta_data shape: 96800
meta_model_data shape: (96800, 3)
X shape: (96800, 2)
y shape: (96800,)


In [66]:
meta_model = LinearRegression()

In [67]:
meta_model.fit(X, y)

In [68]:
meta_model.score(X,y)

0.0

In [69]:
# Hybrid recommendation function using stacking
def hybrid_recommendations(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model, top_n=5):
    ubcf_recs = recommend_songs_ubcf(target_user, user_item_matrix, user_similarity_df, num_recs=top_n*2)
    ibcf_recs = recommend_songs_ibcf(target_user, user_item_matrix, item_similarity_df, num_recs=top_n*2)

    recommendations = pd.Series(dtype=float)

    for songID in user_item_matrix.columns:
        ubcf_score = 1 if songID in ubcf_recs else 0
        ibcf_score = 1 if songID in ibcf_recs else 0
        hybrid_score = meta_model.predict(pd.DataFrame([[ubcf_score, ibcf_score]], columns=['ubcf_score', 'ibcf_score']))[0]
        recommendations[songID] = hybrid_score

    recommendations = recommendations.sort_values(ascending=False)
    return recommendations.head(top_n).index

In [71]:
# Test the hybrid model
target_user = int(input("Enter user ID: "))
recommended_songs = hybrid_recommendations(target_user, user_item_matrix, user_similarity_df, item_similarity_df, meta_model)
print(f"Recommendations for user {target_user}: {recommended_songs}")


Recommendations for user 3: Index([136507, 319, 578, 726, 866], dtype='int64')


In [52]:
# Recommendations for user 5: Index([319, 96037, 92459, 92523, 92547], dtype='int64')

