In [1]:
import pandas as pd
import numpy as np

# Load data
raw_data = pd.read_csv("book.csv", encoding='latin-1').drop(columns='Unnamed: 0')
raw_data.rename(columns={'User.ID':'user_id','Book.Title':'book_title','Book.Rating':'book_rating'}, inplace=True)
raw_data.head()

Unnamed: 0,user_id,book_title,book_rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6


In [2]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      10000 non-null  int64 
 1   book_title   10000 non-null  object
 2   book_rating  10000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 234.5+ KB


In [3]:
raw_data.user_id.nunique()

2182

In [4]:
raw_data.book_title.nunique()

9659

In [5]:

# Pivot table to get user-book matrix
user_books_df = raw_data.pivot_table(index='user_id', columns='book_title', values='book_rating', fill_value=0)
user_books_df

book_title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.metrics import pairwise_distances

# Compute cosine similarity
cos_sim = 1 - pairwise_distances(user_books_df.values, metric='cosine')
cos_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [7]:
# Store results in DataFrame
cos_sim_df = pd.DataFrame(cos_sim, index=user_books_df.index, columns=user_books_df.index)

# Fill diagonal with zeros
np.fill_diagonal(cos_sim, 0)

# Most similar users
most_similar_users = cos_sim_df.idxmax(axis=1)[:20]
most_similar_users

user_id
8          8
9          8
10         8
12         8
14         8
16         8
17         8
19    278418
22         8
26         8
32         8
39         8
42         8
44         8
51      3757
53      1996
56         8
64         8
67         8
69         8
dtype: int64

In [1]:
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import numpy as np

# Load data
books = pd.read_csv("book.csv", encoding="ISO-8859-1").rename(columns={'User.ID': 'UserId', 'Book.Title': 'Title', 'Book.Rating': 'Rating'})

# Pivot table
books_pivot = books.pivot_table(index='UserId', columns='Title', values='Rating', fill_value=0)

# Compute user similarity
user_similarity = 1 - pairwise_distances(books_pivot.values, metric='cosine')
np.fill_diagonal(user_similarity, 0)

# Convert to DataFrame
user_similarity_df = pd.DataFrame(user_similarity, index=books_pivot.index, columns=books_pivot.index)

# Function to recommend books
def recommend(user_id):
    similar_user_id = user_similarity_df.loc[user_id].idxmax()
    similar_user_books = set(books[books['UserId'] == similar_user_id].Title)
    user_books = set(books[books['UserId'] == user_id].Title)
    return similar_user_books - user_books

# Example recommendations
print(recommend(16))

{'The Western way: A practical guide to the Western mystery tradition', 'Keepers of the Earth Teachers Guide', 'Wings', 'Truckers', 'The Art Of Celtia', 'Ancient Celtic Romances', 'The Celts Activity Book'}
