# Books Recommender System Using Autoencoder for Feature Encoding



## Preprocessing (like original model)

In [1]:
# Importing necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
books = pd.read_csv(
    'data/BX-Books.csv', 
    sep=";", on_bad_lines='skip',
    low_memory=False ,
    encoding='latin-1')

In [3]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [4]:
# after remove
books = books[['ISBN','Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher','Image-URL-L']]

In [5]:
# Lets remane some wierd columns name
books.rename(columns={"Book-Title":'title',
                      'Book-Author':'author',
                     "Year-Of-Publication":'year',
                     "Publisher":"publisher",
                     "Image-URL-L":"image_url"},inplace=True)

In [6]:

users = pd.read_csv('data/BX-Users.csv', sep=";", on_bad_lines='skip',low_memory=False , encoding='latin-1')

In [7]:
# Lets remane some wierd columns name
users.rename(columns={"User-ID":'user_id',
                      'Location':'location',
                     "Age":'age'},inplace=True)

In [8]:
# Now load the third dataframe
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=";", on_bad_lines='skip',low_memory=False , encoding='latin-1')

In [9]:
# Lets remane some wierd columns name
ratings.rename(columns={"User-ID":'user_id',
                      'Book-Rating':'rating'},inplace=True)

In [10]:
print(f'book dataframe {books.shape}\nusers dataframe {users.shape}\nratings dataframe {ratings.shape}')

book dataframe (271360, 6)
users dataframe (278858, 3)
ratings dataframe (1149780, 3)


In [11]:
# Lets store users who had at least rated more than 200 books
x = ratings['user_id'].value_counts() > 200
x = x[x]
y= x.index


In [12]:
ratings = ratings[ratings['user_id'].isin(y)]

In [13]:
ratings['user_id'].unique().shape

(899,)

# Join ratings with books

In [14]:
# Now join ratings with books by
# ISBN (International Standard Book Number)
ratings_with_books = ratings.merge(books, on='ISBN')

In [15]:
ratings_with_books.shape

(487671, 8)

In [16]:
number_rating = ratings_with_books.groupby('title')['rating'].count().reset_index()

In [17]:
number_rating.rename(columns={'rating':'num_of_rating'},inplace=True)

In [18]:
final_rating = ratings_with_books.merge(number_rating, on='title')

In [19]:
# Lets take those books which got at least 50 rating of user
final_rating = final_rating[final_rating['num_of_rating'] >= 50]

In [20]:
# lets drop the duplicates
final_rating.drop_duplicates(['user_id','title'],inplace=True)

In [21]:
final_rating.shape

(59850, 9)

# Encoder and training model

In [22]:
# Lets create a pivot table
book_pivot = final_rating.pivot_table(
    columns='user_id',
    index='title',
    values= 'rating'
    )

In [23]:
book_pivot

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [24]:
book_pivot.fillna(0, inplace=True)

In [25]:
book_pivot.head()

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0


# Building Autoencoder to encoder feature of book

In [26]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

In [27]:
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
book_pivot_scaled = scaler.fit_transform(book_pivot)

X_train,X_test = train_test_split(book_pivot_scaled ,test_size=0.2 , random_state=42)

model_nn = Sequential([
        Input(shape=(book_pivot_scaled.shape[1],)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(64, activation='relu'),
        Dense(128, activation='relu'),
        Dense(book_pivot_scaled.shape[1], activation='linear')
    ])

Build the Autoencoder

In [28]:
model_nn.compile(optimizer='adam', loss='mse')

model_nn.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.9636 - val_loss: 0.8865
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.9851 - val_loss: 0.8859
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.9523 - val_loss: 0.8849
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.9663 - val_loss: 0.8801
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8897 - val_loss: 0.8777
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.8885 - val_loss: 0.8750
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8755 - val_loss: 0.8726
Epoch 8/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8720 - val_loss: 0.8705
Epoch 9/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1a7be13cfb0>

In [29]:
model_nn.save('trained_model.keras')

In [30]:
from tensorflow.keras.models import load_model 
loaded_model = load_model('trained_model.keras')

embedding_model = Sequential(loaded_model.layers[:-3])
book_embeddings = loaded_model.predict(book_pivot_scaled)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [31]:
def recommend_book_nn (book_index , top_n = 5) : 
    if book_index < 0 or book_index >= len(book_pivot.index):
        raise ValueError(f"Book index {book_index} is out of range.")
    
    book_name = book_pivot.index[book_index]
    
    book_embedding = book_embeddings[book_index].reshape(1,-1)
    similarities = cosine_similarity(book_embedding, book_embeddings).flatten()
    similar_indices = similarities.argsort()[-top_n-1:-1][::-1] 
    similar_books = [] 
    for similar_index in similar_indices:
        similar_books_name = book_pivot.index[similar_index]
        distance = np.linalg.norm(book_pivot.iloc[book_index] - book_pivot.iloc[similar_index])
        similar_books.append((similar_books_name, distance))
    total = 0 
    
    print(f"Books similar to'{book_name}':")
    for book, distance in similar_books:
       total += distance
       print(f"{book} - Distance {distance:.4f}")
    print(f"Average distance: {total/top_n:.4f}")

In [32]:
recommend_book_nn(0,5)

NameError: name 'cosine_similarity' is not defined