In [18]:
import numpy as np
import pandas as pd


In [19]:
# Load the three raw tables from CSV files in the working directory.
# low_memory=False makes pandas parse Books.csv in one pass rather than in
# chunks, which avoids per-chunk (mixed) dtype inference for its columns.
books = pd.read_csv("Books.csv", low_memory=False)
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")

In [None]:
# Preview the first rows of the books table.
books.head()

In [None]:
# (rows, columns) of each table, one per line: books, ratings, users.
print(books.shape, ratings.shape, users.shape, sep="\n")

In [None]:
# Column dtypes and non-null counts for each table, each under its own heading.
for heading, frame in (
    ("Books DataFrame Info:", books),
    ("\nRatings DataFrame Info:", ratings),
    ("\nUsers DataFrame Info:", users),
):
    print(heading)
    frame.info()

### Checking for Missing Values

We use `.isnull().sum()` on each DataFrame to count how many missing values (`NaN`) are present in each column. This helps us understand which columns have incomplete data.

In [21]:
# Per-column count of missing (NaN) values for each table.
for heading, frame in (
    ("Missing values in 'books' DataFrame:", books),
    ("\nMissing values in 'users' DataFrame:", users),
    ("\nMissing values in 'ratings' DataFrame:", ratings),
):
    print(heading)
    display(frame.isnull().sum())

Missing values in 'books' DataFrame:


ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


Missing values in 'users' DataFrame:


User-ID          0
Location         0
Age         110762
dtype: int64


Missing values in 'ratings' DataFrame:


User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

### Checking for Duplicate Entries

Next, we'll check for duplicate rows using `.duplicated().sum()`. This tells us if there are any identical rows in our datasets, which can sometimes indicate data entry errors or issues during data collection.

In [22]:
# Number of fully identical rows in each table.
for heading, frame in (
    ("Number of duplicate rows in 'books' DataFrame:", books),
    ("\nNumber of duplicate rows in 'users' DataFrame:", users),
    ("\nNumber of duplicate rows in 'ratings' DataFrame:", ratings),
):
    print(heading)
    display(frame.duplicated().sum())

Number of duplicate rows in 'books' DataFrame:


0


Number of duplicate rows in 'users' DataFrame:


0


Number of duplicate rows in 'ratings' DataFrame:


0

### Popularity-Based Recommender System

In [24]:
# Attach book metadata to every rating; ratings whose ISBN has no match in
# `books` are dropped (inner join, the pandas default).
ratings_with_name = pd.merge(ratings, books, on='ISBN')

In [26]:
# Number of (non-null) ratings each title received.
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='num_ratings')
)

In [27]:
# Display the per-title rating counts.
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [28]:
# Coerce ratings to a numeric dtype. Values that cannot be parsed become NaN
# (errors='coerce') and are therefore ignored by the mean below.
ratings_with_name['Book-Rating'] = pd.to_numeric(ratings_with_name['Book-Rating'], errors='coerce')

# Mean rating per title.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg_ratings')
)
avg_rating_df

Unnamed: 0,Book-Title,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [29]:
# One row per title carrying both its rating count and its average rating.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [30]:
# Keep only titles with at least 240 ratings, ordered by average rating
# (highest first).
popular_df = (
    popular_df
    .loc[popular_df['num_ratings'] >= 240]
    .sort_values(by='avg_ratings', ascending=False)
)

In [31]:
# Add author and cover URL to the popular titles. A title can match several
# rows in `books`, so keep only the first match per title.
(
    pd.merge(popular_df, books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    .loc[:, ['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_ratings']]
)

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.737410
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,http://images.amazon.com/images/P/0312853238.0...,249,5.409639
...,...,...,...,...,...
768,Whispers,BELVA PLAIN,http://images.amazon.com/images/P/0440216745.0...,286,2.199301
778,Presumed Innocent,Scott Turow,http://images.amazon.com/images/P/0446359866.0...,294,2.139456
784,Isle of Dogs,Patricia Cornwell,http://images.amazon.com/images/P/0425182908.0...,288,2.000000
790,Slow Waltz in Cedar Bend,Robert James Waller,http://images.amazon.com/images/P/0446601640.0...,248,1.895161


### Collaborative Filtering-Based Recommender System

In [32]:
# Boolean mask per user: True when the user has rated more than 200 books.
x = ratings_with_name.groupby('User-ID')['Book-Rating'].count().gt(200)
# IDs of those heavy raters (the variable name is Hindi, roughly "well-read users").
padhe_likhe_users = x.index[x.to_numpy()]

In [33]:
# Ratings made by the heavy raters only.
filtered_rating = ratings_with_name.loc[ratings_with_name['User-ID'].isin(padhe_likhe_users)]

In [34]:
# Boolean mask per title: True when the heavy raters rated it at least 50 times.
y = filtered_rating.groupby('Book-Title')['Book-Rating'].count().ge(50)
# Titles that pass that threshold.
famous_books = y.index[y.to_numpy()]

In [35]:
# Inspect the row mask: True where a rating belongs to one of the famous titles.
filtered_rating['Book-Title'].isin(famous_books)

2          False
5          False
7          False
15         False
16         False
           ...  
1030883    False
1030884    False
1030885    False
1030886    False
1030887    False
Name: Book-Title, Length: 474007, dtype: bool

In [37]:
# Ratings by heavy raters, restricted to the famous titles.
final_rating = filtered_rating.loc[filtered_rating['Book-Title'].isin(famous_books)]

In [38]:
# Title x user matrix of ratings. If a user rated the same title more than
# once, the cell holds the mean (pivot_table's default aggregation).
pt = pd.pivot_table(final_rating, index='Book-Title', columns='User-ID', values='Book-Rating')

In [39]:
# A missing cell means the user never rated that title; represent it as 0.
pt = pt.fillna(0)

In [40]:
# Display the title x user rating matrix.
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
!pip install scikit-learn

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Pairwise cosine similarity between the rows of `pt` (one row per book title),
# giving a square titles x titles matrix.
similarity_scores = cosine_similarity(pt)

In [46]:
# Sanity check: the matrix should be square, with one row/column per title in `pt`.
similarity_scores.shape

(706, 706)

In [None]:
def recommend(book_name):
  """Print the titles of the five books most similar to ``book_name``.

  Similarity is read from the module-level ``similarity_scores`` matrix, whose
  rows/columns are aligned with the row index of the pivot table ``pt``.

  Args:
    book_name: Exact title to look up in ``pt.index``.

  Returns:
    None. Titles are printed one per line, most similar first. If the title
    is not in ``pt.index`` a "not found" message is printed instead.
  """
  # Locate the row of the requested title.
  matches = np.where(pt.index == book_name)[0]
  if len(matches) == 0:
    print('Book name not found, please enter correct name')
    return
  index = matches[0]

  # Rank every *other* book by similarity. The query row is excluded by
  # position instead of assuming it sorts first: with tied scores (or an
  # all-zero rating vector) the query book is not guaranteed to be at rank 0,
  # and slicing [1:6] would then list the book itself and drop a real neighbour.
  ranked = sorted(
      ((i, score) for i, score in enumerate(similarity_scores[index]) if i != index),
      key=lambda pair: pair[1],
      reverse=True,
  )
  for i, _ in ranked[:5]:
    print(pt.index[i])


In [None]:
# Try the recommender on one title that is present in `pt`.
recommend('You Belong To Me')

# Task
Modify the `recommend` function to return a list of dictionaries, each containing 'Book-Title', 'Book-Author', and 'Image-URL-M' for recommended books, rather than printing them. Then, test the modified `recommend` function with a sample book to demonstrate the new output format, and explain how this structured data can be used to build a frontend for displaying book recommendations.

## Modify Recommend Function

### Subtask:
Adjust the `recommend` function to return a list of dictionaries, where each dictionary contains the 'Book-Title', 'Book-Author', and 'Image-URL-M' for the recommended books instead of just printing the titles. This will make the data easily consumable for an external frontend.


**Reasoning**:
The first step is to create a new DataFrame named `book_info` by selecting the specified columns from the `books` DataFrame and removing duplicate entries based on 'Book-Title', as instructed. This DataFrame will serve as a lookup table for book details.



In [47]:
# Lookup table with one row per title (the first occurrence in `books` wins).
book_info = books.loc[:, ['Book-Title', 'Book-Author', 'Image-URL-M']].drop_duplicates(subset='Book-Title')
print("Created book_info DataFrame:")
display(book_info.head())

Created book_info DataFrame:


Unnamed: 0,Book-Title,Book-Author,Image-URL-M
0,Classical Mythology,Mark P. O. Morford,http://images.amazon.com/images/P/0195153448.0...
1,Clara Callan,Richard Bruce Wright,http://images.amazon.com/images/P/0002005018.0...
2,Decision in Normandy,Carlo D'Este,http://images.amazon.com/images/P/0060973129.0...
3,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,http://images.amazon.com/images/P/0374157065.0...
4,The Mummies of Urumchi,E. J. W. Barber,http://images.amazon.com/images/P/0393045218.0...


**Reasoning**:
Now that the `book_info` DataFrame is ready, I will redefine the `recommend` function to retrieve book details from `book_info` and return a list of dictionaries for recommended books, as specified in the instructions. This involves iterating through similar items, looking up their details in `book_info`, and structuring the output as a list of dictionaries.



In [48]:
def recommend(book_name):
  """Return details of the five books most similar to ``book_name``.

  Similarity is read from the module-level ``similarity_scores`` matrix, whose
  rows/columns are aligned with the row index of the pivot table ``pt``.
  Author and cover URL are looked up in the module-level ``book_info`` frame.

  Args:
    book_name: Exact title to look up in ``pt.index``.

  Returns:
    A list of dicts with keys 'Book-Title', 'Book-Author' and 'Image-URL-M',
    most similar first. Titles without a row in ``book_info`` are skipped, so
    the list can hold fewer than five entries. An empty list is returned when
    ``book_name`` is not in ``pt.index``.
  """
  # Locate the row of the requested title.
  matches = np.where(pt.index == book_name)[0]
  if len(matches) == 0:
    return []
  index = matches[0]

  # Rank every *other* book by similarity. The query row is excluded by
  # position instead of assuming it sorts first: with tied scores (or an
  # all-zero rating vector) the query book is not guaranteed to be at rank 0,
  # and slicing [1:6] would then return the book itself and drop a real neighbour.
  ranked = sorted(
      ((i, score) for i, score in enumerate(similarity_scores[index]) if i != index),
      key=lambda pair: pair[1],
      reverse=True,
  )[:5]

  recommended_books_data = []
  for i, _ in ranked:
    book_details = book_info[book_info['Book-Title'] == pt.index[i]]
    if book_details.empty:
      # No metadata for this title; skip it rather than emit a partial record.
      continue
    row = book_details.iloc[0]
    recommended_books_data.append({
        'Book-Title': row['Book-Title'],
        'Book-Author': row['Book-Author'],
        'Image-URL-M': row['Image-URL-M'],
    })
  return recommended_books_data

# print("Modified 'recommend' function to return a list of dictionaries.")

Modified 'recommend' function to return a list of dictionaries.


**Reasoning**:
To verify that the modified `recommend` function works as expected and returns the recommendations in the specified dictionary format, I will call it with a sample book title. This will show the new structured output.



In [49]:
# Exercise the dict-returning recommender on a known title and show each record.
sample_book = 'You Belong To Me'
recommendations = recommend(sample_book)

print(f"Recommendations for '{sample_book}':")
if not recommendations:
    print("No recommendations found or book not in the dataset.")
for book in recommendations:
    print(book)

Recommendations for 'You Belong To Me':
{'Book-Title': 'Loves Music, Loves to Dance', 'Book-Author': 'Mary Higgins Clark', 'Image-URL-M': 'http://images.amazon.com/images/P/0671758896.01.MZZZZZZZ.jpg'}
{'Book-Title': "I'll Be Seeing You", 'Book-Author': 'Mary Higgins Clark', 'Image-URL-M': 'http://images.amazon.com/images/P/0671888587.01.MZZZZZZZ.jpg'}
{'Book-Title': 'Before I Say Good-Bye', 'Book-Author': 'Mary Higgins Clark', 'Image-URL-M': 'http://images.amazon.com/images/P/0671004573.01.MZZZZZZZ.jpg'}
{'Book-Title': "Daddy's Little Girl", 'Book-Author': 'Mary Higgins Clark', 'Image-URL-M': 'http://images.amazon.com/images/P/0743206045.01.MZZZZZZZ.jpg'}
{'Book-Title': 'All Around the Town', 'Book-Author': 'Mary Higgins Clark', 'Image-URL-M': 'http://images.amazon.com/images/P/0671793489.01.MZZZZZZZ.jpg'}


In [50]:
import pickle
import numpy as np
import pandas as pd

# --- Re-create the objects you need, just to be sure ---
# This cell rebuilds the recommender inputs from the raw CSV files so the
# pickled artefacts do not depend on whatever state earlier cells left behind.

# Load data
books = pd.read_csv("Books.csv", low_memory=False)
ratings = pd.read_csv("Ratings.csv")

# Create book_info: one metadata row per title (first occurrence wins)
book_info = books[['Book-Title', 'Book-Author', 'Image-URL-M']].drop_duplicates('Book-Title')

# Create ratings_with_name
ratings_with_name = ratings.merge(books, on='ISBN')
# Apply the same numeric coercion as the exploratory pipeline earlier in the
# notebook, so the saved artefacts are built from identically prepared ratings
# (unparseable values become NaN and are ignored by count/mean).
ratings_with_name['Book-Rating'] = pd.to_numeric(ratings_with_name['Book-Rating'], errors='coerce')

# Create collaborative filtering objects
# Users with more than 200 ratings
x = ratings_with_name.groupby('User-ID').count()['Book-Rating'] > 200
padhe_likhe_users = x[x].index
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(padhe_likhe_users)]
# Titles with at least 50 ratings from those users
y = filtered_rating.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = y[y].index
final_rating = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]
# Title x user rating matrix; unrated cells become 0
pt = final_rating.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pt.fillna(0, inplace=True)

# Calculate similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(pt)

# --- This is the new part: Save the objects ---
# Each file is written inside a `with` block so its handle is flushed and
# closed deterministically instead of being left open until garbage collection.
artifacts = {
    'pt.pkl': pt.to_dict(),
    'similarity_scores.pkl': similarity_scores,
    'book_info.pkl': book_info.to_dict(),
    'book_titles.pkl': pt.index.to_list(),  # We save the titles as a list
}
for filename, obj in artifacts.items():
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh)

print("Files saved: pt.pkl, similarity_scores.pkl, book_info.pkl, book_titles.pkl")

Files saved: pt.pkl, similarity_scores.pkl, book_info.pkl, book_titles.pkl


In [51]:
pip install Flask gunicorn pandas numpy

Collecting Flask
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting gunicorn
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting blinker>=1.9.0 (from Flask)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting click>=8.1.3 (from Flask)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itsdangerous>=2.2.0 (from Flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting werkzeug>=3.1.0 (from Flask)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading flask-3.1.2-py3-none-any.whl (103 kB)
Downloading gunicorn-23.0.0-py3-none-any.whl (85 kB)
Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Downloading click-8.3.0-py3-none-any.whl (107 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.1.3-py3-none-any.whl (224 kB)
Installing collected packages: werkzeug, itsdangerous, gunicorn, click, blinker, Flask

   ----------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
