In [83]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from numpy.linalg import norm
import plotly.graph_objects as go

### Data inladen

In [84]:
df = pd.read_csv("boek_reviews.csv")

df["Review"] = df['Review'].apply(lambda s:s.replace('"', ""))
df['Book'] = df['Book'].apply(lambda s:s.replace('"', ""))
df['User'] = df['User'].apply(lambda s:s.replace('User ', ""))

df['User'] = pd.to_numeric(df["User"])

In [85]:
df

Unnamed: 0,User,Book,Rating,Review
0,1,The Art of JavaScript: Mastering Backend Devel...,4,The Art of JavaScript: Mastering Backend Devel...
1,1,The Code Master: Unlocking the Secrets of Pro...,4,The Code Master: Unlocking the Secrets of Prog...
2,1,The Code Keeper: Navigating the Digital World ...,2,The Code Keeper: Navigating the Digital World ...
3,1,The Enchanted Adventures of Stakeholder Niek a...,3,The Enchanted Adventures of Stakeholder Niek a...
4,1,The Power of Backend: Mastering Programming fo...,1,The Power of Backend: Mastering Programming fo...
...,...,...,...,...
495,25,Unlocking the Power of Backend: Navigating Sta...,1,Unlocking the Power of Backend: Navigating Sta...
496,25,Code and Magic: The Unicorn Hacker Chronicles,2,Code and Magic: The Unicorn Hacker Chronicles ...
497,25,The Art of Frontend Coding: A Comprehensive Gu...,4,The Art of Frontend Coding is a comprehensive ...
498,25,Cracking the Code: A Comprehensive Guide to Ba...,2,Cracking the Code: A Comprehensive Guide to Ba...


In [86]:
df_ratings = pd.DataFrame(columns = pd.Series({c: df[c].unique() for c in df})['Book'])
df_ratings['User'] = pd.Series({c: df[c].unique() for c in df})['User']
df_ratings = df_ratings[df_ratings.columns.tolist()[-1:] + df_ratings.columns.tolist()[:-1]]

In [87]:
for _, row in df.iterrows():
    df_ratings[row['Book']][df_ratings['User']==row["User"]] = row['Rating']

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_ratings[row['Book']][df_ratings['User']==row["User"]] = row['Rating']
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to

In [88]:
df_ratings

Unnamed: 0,User,The Art of JavaScript: Mastering Backend Development,The Code Master: Unlocking the Secrets of Programming,The Code Keeper: Navigating the Digital World with IT,The Enchanted Adventures of Stakeholder Niek and the Unicorn Realm,The Power of Backend: Mastering Programming for Efficiency,Unlocking Stakeholder Niek's Potential: A Javascript Guide,The Art of Frontend Programming: Mastering User-Interface Design and Development,Mastering JavaScript Programming: A Comprehensive Guide for Developers,Behind the Screen: Exploring the World of Backend Development,...,Mastering the Backend: A Comprehensive Guide to IT Infrastructure and Development,Code of the Unicorns: Harnessing Magical Technology,The Art of JavaScript: Mastering Programming Principles,The Code Master: Unlocking the Secrets of Programming.1,The Code Crusade: Navigating Stakeholder Niek's Influence in Software Development,Frontend Frenzy: Navigating Stakeholders with Niek,The Coding Chronicles: Niek's Stakeholder Saga,Mastering IT: A Comprehensive Guide to JavaScript Development,The Magical World of Stakeholder Niek and the Enchanted Unicorns,Balancing Act: Navigating the World of Backend and Frontend Development
0,1,4.0,4.0,2.0,3.0,1.0,4.0,2.0,4.0,2.0,...,,,,,,,,,,
1,2,,4.0,,2.0,,,5.0,4.0,5.0,...,,,,,,,,,,
2,3,,,2.0,,3.0,5.0,,3.0,,...,,,,,,,,,,
3,4,3.0,1.0,,4.0,,3.0,3.0,,,...,,,,,,,,,,
4,5,,4.0,,,,4.0,1.0,,,...,5.0,4.0,5.0,1.0,4.0,,,,,
5,6,,,,,5.0,2.0,1.0,1.0,,...,3.0,1.0,,,5.0,4.0,,,,
6,7,1.0,,,,,3.0,4.0,,1.0,...,,,,3.0,,,2.0,,,
7,8,,,,4.0,,,,,,...,,1.0,,,,4.0,,4.0,5.0,
8,9,,5.0,,,,3.0,,,2.0,...,,1.0,1.0,,,5.0,,4.0,2.0,
9,10,,4.0,,,2.0,,,2.0,4.0,...,2.0,,,4.0,3.0,2.0,,,,4.0


### Versie Bryan+Vincent

In [89]:
# Function to calculate cosine similarity considering only non-null values
def cosine_similarity_no_interpolation(row1, row2):
    mask = ~np.logical_or(pd.isnull(row1), pd.isnull(row2))
    row1_filtered = row1[mask]
    row2_filtered = row2[mask]
    
    dot_product = np.dot(row1_filtered, row2_filtered)
    norm1 = norm(row1_filtered)
    norm2 = norm(row2_filtered)
    
    return dot_product / (norm1 * norm2) if norm1 != 0 and norm2 != 0 else 0



In [90]:
# Calculate cosine similarity between all pairs of rows
books_df = df_ratings.iloc[:, 1:]

# Initialize an empty similarity matrix
n = len(books_df)
similarity_matrix = np.zeros((n, n))

# Compute the cosine similarity between each pair of rows
for i in range(n):
    for j in range(i, n):
        sim = cosine_similarity_no_interpolation(books_df.iloc[i], books_df.iloc[j])
        similarity_matrix[i, j] = sim
        similarity_matrix[j, i] = sim  # Since the matrix is symmetric

# Convert the similarity matrix to a DataFrame for easier visualization
similarity_df = pd.DataFrame(similarity_matrix, index=books_df.index, columns=books_df.index)

In [91]:
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,0.809723,0.94662,0.861423,0.982946,0.79546,0.781666,0.959832,0.903154,0.875509,...,0.671093,0.893043,0.829892,0.800272,0.774975,0.806779,0.867248,0.929669,0.848355,0.939558
1,0.809723,1.0,0.889975,0.75295,0.734511,0.627119,0.805011,0.837815,0.720082,0.87633,...,0.820823,0.807303,0.836691,0.948744,0.928084,0.93547,0.765587,0.853354,0.847115,0.755999
2,0.94662,0.889975,1.0,0.872494,0.757195,0.867802,0.883689,0.942305,0.967755,0.93312,...,0.832483,0.901561,0.942857,0.896857,0.915112,0.779222,0.890176,0.945611,0.979805,0.764719
3,0.861423,0.75295,0.872494,1.0,0.804522,0.889026,0.897741,0.891771,0.829516,0.702381,...,0.733779,0.810931,0.831128,0.823462,0.894427,0.885255,0.827844,0.938708,0.830028,0.924282
4,0.982946,0.734511,0.757195,0.804522,1.0,0.802377,0.746084,0.826574,0.695817,0.733049,...,0.870916,0.871091,0.616655,0.857321,0.775672,0.875004,0.678935,0.862015,0.696171,0.842942
5,0.79546,0.627119,0.867802,0.889026,0.802377,1.0,0.832995,0.855242,0.893427,0.890012,...,0.938436,0.787263,0.931857,0.946327,0.856795,0.793006,0.81448,0.933326,0.878395,0.753978
6,0.781666,0.805011,0.883689,0.897741,0.746084,0.832995,1.0,0.836743,0.935646,0.893905,...,0.918559,0.870791,0.852803,0.89878,0.811921,0.823886,0.896709,0.843632,0.684632,0.913482
7,0.959832,0.837815,0.942305,0.891771,0.826574,0.855242,0.836743,1.0,0.927252,0.590053,...,0.797083,0.870076,0.856548,0.805971,0.927651,0.805109,0.752384,0.927111,0.883715,0.801002
8,0.903154,0.720082,0.967755,0.829516,0.695817,0.893427,0.935646,0.927252,1.0,0.86937,...,0.91129,0.883349,0.938679,0.896295,0.905263,0.776899,0.865133,0.853325,0.697422,0.743311
9,0.875509,0.87633,0.93312,0.702381,0.733049,0.890012,0.893905,0.590053,0.86937,1.0,...,0.821442,0.764293,0.94399,0.96105,0.889832,0.957938,0.968689,0.850564,0.750391,0.704041


In [92]:
fig = go.Figure(data=go.Heatmap(
    z= similarity_df,
    x = similarity_df.index.values,
    y = similarity_df.index.values
))

fig.update_yaxes(autorange='reversed')

fig.show()

In [93]:
def calculate_recommendation_score(user, book, similarity_df = similarity_df, ratings_df = df_ratings):

    assert ratings_df[ratings_df['User']==user][book].isnull().values, "User already reviewed this book."
    
    filtered_ratings = ratings_df[~(ratings_df[book]).isnull()]
    users = np.array(filtered_ratings['User'])

    ratings = np.array(filtered_ratings[book])
    weights = similarity_df.iloc[user-1, users-1].values

    total_weights = sum(weights)
    total_value = np.dot(ratings, weights)

    recommendation_score = total_value/total_weights

    return recommendation_score

In [94]:
calculate_recommendation_score(3, "The Enchanted Adventures of Stakeholder Niek and the Unicorn Realm")

3.085981598372158

In [95]:
def give_recommendation(user,  num_recommendations = 5, ratings_df = df_ratings,):
    not_rated = ratings_df.columns[ratings_df.iloc[user-1].isna()].tolist()

    recommendations = {}
    for book in not_rated:
        recommendations[book] = calculate_recommendation_score(user, book)

    top_recommendations = dict(sorted(recommendations.items(), key=lambda item: item[1], reverse=True)[:num_recommendations])

    return top_recommendations

In [96]:
give_recommendation(5, 10)

{'The Code Mastermind: Unlocking the Secrets of Programming': 3.787453229869125,
 "The Coding Chronicles: Niek's Stakeholder Saga": 3.697380088518128,
 'Frontend Frenzy: Navigating Stakeholders with Niek': 3.593821533960243,
 'Mastering JavaScript Programming: A Comprehensive Guide for Developers': 3.5528798547376867,
 "The Stakeholder's Guide to Programming Success: Niek's Journey Towards Effective Collaboration": 3.434700429230114,
 'Front and Center: Navigating the World of Frontend IT': 3.4313983460715423,
 'Frontend Magic: Unicorns in the Digital Realm': 3.342830995019483,
 'Balancing Act: Navigating the World of Backend and Frontend Development': 3.2979804281466127,
 "The Code Master's Guide: Navigating the World of Programming and IT": 3.290079510491337,
 'The Magical World of Stakeholder Niek and the Enchanted Unicorns': 3.108402874532288}

### Versie chatGPT

In [97]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Create DataFrame
df = pd.read_csv("boek_reviews.csv")

df["Review"] = df['Review'].apply(lambda s:s.replace('"', ""))
df['Book'] = df['Book'].apply(lambda s:s.replace('"', ""))
df['User'] = df['User'].apply(lambda s:s.replace('User ', ""))

df['User'] = pd.to_numeric(df['User'])
df['Rating'] = pd.to_numeric(df['Rating'])

# Create user-item matrix
user_book_matrix = df.pivot(index='User', columns='Book', values='Rating').fillna(0)


# Perform Singular Value Decomposition (SVD)
U, sigma, Vt = np.linalg.svd(user_book_matrix, full_matrices=False)

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

# Calculate the predicted ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_book_matrix.columns)

# Function to get the predicted rating for a specific user and book
def predict_rating(user_id, book_title):
    user_index = user_id - 1  # Adjusting index for 0-based indexing
    if book_title in predicted_ratings_df.columns:
        predicted_rating = predicted_ratings_df.loc[user_index, book_title]
        return predicted_rating
    else:
        return f"Book '{book_title}' not found in the dataset."

# Example usage
user_id = 10
book_title = "The Enchanted Adventures of Stakeholder Niek and the Unicorn Realm"
predicted_rating = predict_rating(user_id, book_title)
print(f"Predicted rating for user {user_id} and book '{book_title}' is: {predicted_rating:.2f}")


Predicted rating for user 10 and book 'The Enchanted Adventures of Stakeholder Niek and the Unicorn Realm' is: 0.00


In [98]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Prepare data for Surprise library
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User', 'Book', 'Rating']], reader)

# Use SVD algorithm
algo = SVD()

# Train the model
trainset = data.build_full_trainset()
algo.fit(trainset)

# Function to predict rating for a specific user and book
def predict_rating(user_id, book_title):
    prediction = algo.predict(user_id, book_title)
    return prediction.est

def get_top_recommendations(user_id, top_x=3):
    # Check if the user exists in the training set
    if user_id not in trainset.all_users():
        return f"User ID {user_id} not found."

    # Get the list of all books
    all_books = df['Book'].unique()

    # Get the list of books the user has already rated
    user_rated_books = df[df['User'] == user_id]['Book'].unique()

    # Get the list of books the user hasn't rated yet
    unread_books = [book for book in all_books if book not in user_rated_books]

    # Predict ratings for all unread books
    predictions = []
    for book in unread_books:
        predicted_rating = algo.predict(user_id, book).est
        predictions.append((book, predicted_rating))

    # Sort by predicted rating and get top x recommendations
    top_recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_x]

    return top_recommendations


In [99]:
user_id = 12
book_title = "The Art of JavaScript: Mastering Backend Development"
predicted_rating = predict_rating(user_id, book_title)
print(f"Predicted rating for user {user_id} and book '{book_title}' is: {predicted_rating:.2f}")

Predicted rating for user 12 and book 'The Art of JavaScript: Mastering Backend Development' is: 2.31


In [110]:
# Example usage
user_id = 0 #user index
top_x = 10
recommendations = get_top_recommendations(user_id, top_x)
print(f"Top {top_x} recommendations for user {user_id}:")
for book, rating in recommendations:
    print(f"{book}: Predicted rating {rating:.2f}")

Top 10 recommendations for user 0:
Unicorns and Backend Magic: A Developer's Guide to Building Enchanting Websites: Predicted rating 3.62
Stakeholder Niek's Guide to JavaScript Success: A Comprehensive Approach to Engaging Your Stakeholders: Predicted rating 3.52
The Code Mastermind: Unlocking the Secrets of Programming: Predicted rating 3.46
The Code Crusade: Navigating Stakeholder Niek's Influence in Software Development: Predicted rating 3.41
The Coding Chronicles: Niek's Stakeholder Saga: Predicted rating 3.40
Mastering JavaScript Programming: A Comprehensive Guide for Developers: Predicted rating 3.39
Mastering the Backend: A Comprehensive Guide to IT Infrastructure and Development: Predicted rating 3.39
Frontend Frenzy: Navigating Stakeholders with Niek: Predicted rating 3.36
Frontend Magic: Unicorns in the Digital Realm: Predicted rating 3.28
Front and Center: Navigating the World of Frontend IT: Predicted rating 3.27


In [111]:
give_recommendation(1, 10) #User 1 heeft index 0

{"Unicorns and Backend Magic: A Developer's Guide to Building Enchanting Websites": 3.8442019171298263,
 'Mastering the Backend: A Comprehensive Guide to IT Infrastructure and Development': 3.8200590008413178,
 'The Code Mastermind: Unlocking the Secrets of Programming': 3.7962587335046103,
 "The Code Crusade: Navigating Stakeholder Niek's Influence in Software Development": 3.7669772790874743,
 "Stakeholder Niek's Guide to JavaScript Success: A Comprehensive Approach to Engaging Your Stakeholders": 3.703824880098472,
 "The Coding Chronicles: Niek's Stakeholder Saga": 3.64757550139392,
 'Frontend Frenzy: Navigating Stakeholders with Niek': 3.5288208135881534,
 'The Code Master: Unlocking the Secrets of Programming': 3.41730077094567,
 "The Stakeholder's Guide to Programming Success: Niek's Journey Towards Effective Collaboration": 3.3756942865763793,
 'Full Stack Foundations: Exploring the Frontend and Backend of Web Development': 3.3515197380656896}