<a href="https://colab.research.google.com/github/tushar-mahalya/Book-Recommender-System/blob/root/Untitled3_brs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Content-based Recommender System**

### **Initializing Data**

The CMU Book Summary Dataset contains plot summaries for 16,559 books extracted from Wikipedia, along with aligned metadata from Freebase, including book author, title, and genre.

**[Source](https://www.kaggle.com/datasets/ymaricar/cmu-book-summary-dataset)**

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! kaggle d download ymaricar/cmu-book-summary-dataset

Downloading cmu-book-summary-dataset.zip to /content
 92% 15.0M/16.2M [00:01<00:00, 18.7MB/s]
100% 16.2M/16.2M [00:01<00:00, 14.9MB/s]


In [None]:
# -o overwrites an existing extraction so re-running the cell never stops at unzip's prompt.
! unzip -o /content/cmu-book-summary-dataset.zip

Archive:  /content/cmu-book-summary-dataset.zip
  inflating: booksummaries.txt       


In [None]:
# Importing important libraries

import csv
import json
import re
import unicodedata
from urllib.parse import quote

import nltk
import numpy as np
import pandas as pd
import requests
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

### **Data Preparation**

In [None]:
# Read the tab-separated dump into a list of rows (each row is a list of
# column strings). The encoding is stated explicitly so non-ASCII names are
# decoded the same way on every platform, and newline='' is what the csv
# module asks for so that it can handle line endings itself.
with open("/content/booksummaries.txt", 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, dialect='excel-tab')
    data = list(tqdm(reader))

16559it [00:00, 24081.95it/s]


In [None]:
# Walk the rows once (this is what drives the progress bar), then pull the
# five columns of interest out of every row by position.
rows = list(tqdm(data))
book_id, book_name, author, genre, summary = (
    [row[position] for row in rows] for position in (0, 2, 3, 5, 6)
)

books = pd.DataFrame({
    'book_id': book_id,
    'book_name': book_name,
    'author': author,
    'genre': genre,
    'summary': summary,
})

100%|██████████| 16559/16559 [00:00<00:00, 790186.93it/s]


In [None]:
# Keep only the rows where both the genre and the author field are non-empty.
has_genre = books['genre'] != ''
has_author = books['author'] != ''
books = books.loc[has_genre & has_author].copy()

In [None]:
# Each genre cell is a JSON object; decode it and keep just its values as a list.
genres = [list(json.loads(encoded).values()) for encoded in books['genre']]
books['genre'] = genres

In [None]:
books.head()

Unnamed: 0,book_id,book_name,author,genre,summary
0,620,Animal Farm,George Orwell,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Anthony Burgess,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,Albert Camus,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...
4,2080,A Fire Upon the Deep,Vernor Vinge,"[Hard science fiction, Science Fiction, Specul...",The novel posits that space around the Milky ...
5,2152,All Quiet on the Western Front,Erich Maria Remarque,"[War novel, Roman à clef]","The book tells the story of Paul Bäumer, a Ge..."


In [None]:
# Snapshot the frame before the genre, author and summary columns are rewritten below.
raw_books_df = books.copy(deep=True)

In [None]:
# Remove the spaces inside every genre label and author name so that each
# one becomes a single token.
books['genre'] = books['genre'].map(lambda labels: [label.replace(' ', '') for label in labels])
books['author'] = books['author'].map(lambda name: name.replace(' ', ''))

In [None]:
def clean_summary(text):
    """Normalise a plot summary to lowercase ASCII words separated by single spaces.

    Accented letters are first reduced to their base letter (``Bäumer`` becomes
    ``baumer``) so that a name is not split in two at the accent. Apostrophes
    are removed outright, every other non-letter character is treated as a
    word break, and the result is lower-cased.

    Args:
        text: The raw summary string.

    Returns:
        The cleaned summary as a single string.
    """
    # Decompose accented characters and drop only the combining marks; any
    # other non-ASCII character is still turned into a space by the regex below.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.replace("'", "")
    text = re.sub("[^a-zA-Z]", " ", text)
    return ' '.join(text.split()).lower()

In [None]:
# Clean every summary; the function is passed directly instead of via a lambda.
books['summary'] = books['summary'].apply(clean_summary)

In [None]:
# Split each summary on whitespace and wrap each author string in a
# one-element list, so both columns hold lists.
books['summary'] = books['summary'].map(str.split)
books['author'] = books['author'].map(lambda name: [name])

In [None]:
# Concatenate the summary, genre and author lists of every row and join the
# combined tokens into one space-separated string.
books['tags'] = [
    " ".join(words + labels + writer)
    for words, labels, writer in zip(books['summary'], books['genre'], books['author'])
]

In [None]:
# Take an independent copy of the two columns used for modelling; a bare
# column selection makes pandas emit SettingWithCopyWarning when `tags` is
# reassigned on it later.
final_books = books[['book_name', 'tags']].copy()

In [None]:
# Renumber both frames from 0 in place, discarding the old index labels.
for frame in (final_books, raw_books_df):
    frame.reset_index(drop=True, inplace=True)

In [None]:
final_books.head()

Unnamed: 0,book_name,tags
0,Animal Farm,old major the old boar on the manor farm calls...
1,A Clockwork Orange,alex a teenager living in near future england ...
2,The Plague,the text of the plague is divided into five pa...
3,A Fire Upon the Deep,the novel posits that space around the milky w...
4,All Quiet on the Western Front,the book tells the story of paul b umer a germ...


In [None]:
ps = PorterStemmer()

In [None]:
def stemming(text):
  """Return `text` with every whitespace-separated token passed through `ps.stem`.

  The stemmed tokens are joined back together with single spaces.
  """
  return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Stem every token of every tag string.
final_books['tags'] = final_books['tags'].map(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_books['tags'] = final_books['tags'].apply(stemming)


In [None]:
# Bag-of-words over the stemmed tags, capped at 5000 features and using
# scikit-learn's built-in English stop-word list.
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(final_books['tags'])

# Pairwise cosine similarity between all rows of `vectors`.
similarity = cosine_similarity(vectors)

In [None]:
def recommend(movie, top_n=10):
  """Print the titles of the books most similar to the given book.

  Args:
    movie: Exact title of the book, matched against `final_books['book_name']`.
      (The parameter name is kept unchanged for existing callers.)
    top_n: How many recommendations to print. Defaults to 10.

  Raises:
    IndexError: If no row of `final_books` has that title.
  """
  # Row positions (not index labels) of the matching titles, so the lookup
  # into `similarity` stays correct whatever the frame's index looks like.
  matches = [pos for pos, name in enumerate(final_books['book_name']) if name == movie]
  if not matches:
    raise IndexError(f"No book titled {movie!r} in final_books")
  book_index = matches[0]

  distances = similarity[book_index]
  ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
  # Drop the query book by position rather than assuming it sorts first:
  # a row with an equal score could otherwise push it into the results.
  neighbours = [idx for idx, _ in ranked if idx != book_index][:top_n]
  for idx in neighbours:
    print(final_books['book_name'].iloc[idx])

In [None]:
recommend('Blade Runner 3: Replicant Night')

Blade Runner 4: Eye and Talon
The Fifth Man
Street of Shadows
Redgauntlet
The Rebels
Sufferings in Africa: The Incredible True Story of a Shipwreck, Enslavement, and Survival on the Sahara
The Hunters
Deathstalker
The Abyss
Shooting Script


In [None]:
def string_formater(title,author):
  """Build the search-query fragment for a book from its title and author.

  The fragment is the first four words of the title (or fewer) joined by
  '+', followed by '+inauthor:' and the first word of the author. Each word
  is percent-encoded so characters such as '&' or '#' cannot break the URL
  it is appended to.

  Args:
    title: Book title; split on whitespace.
    author: Author name; only its first whitespace-separated word is used.

  Returns:
    The query fragment. If the author is blank, the 'inauthor:' clause is
    left out; if the title is blank, only the clause is returned; if both
    are blank, the result is an empty string.
  """
  title_words = [quote(word, safe='') for word in title.split()[:4]]
  author_words = author.split()

  query = '+'.join(title_words)
  if author_words:
    clause = 'inauthor:' + quote(author_words[0], safe='')
    query = f'{query}+{clause}' if query else clause
  return query


In [None]:
def image_extractor(df, placeholder='https://drupal.nypl.org/sites-drupal/default/files/blogs/sJ3CT4V.gif', timeout=10):
  """Look up a cover thumbnail URL for every book in `df`.

  For each row a query is sent to the Google Books volumes endpoint, and the
  thumbnail link of the first returned item is used. The lookup is
  best-effort: a book with no usable result gets `placeholder` instead, so
  the returned list always has one entry per row.

  Args:
    df: Frame with `book_name` and `author` columns holding strings.
    placeholder: URL used when no thumbnail can be obtained.
    timeout: Seconds to wait for each HTTP request.

  Returns:
    A list of image URLs, in the same order as the rows of `df`.
  """
  img_lst = []
  for title,author in zip(df['book_name'],df['author']):
    try:
      url = "https://www.googleapis.com/books/v1/volumes?q=" + string_formater(title,author)
      data = requests.get(url, timeout=timeout).json()
      img = data['items'][0]['volumeInfo']['imageLinks']['thumbnail']
    # Missing keys, an empty `items` list, a response that is not valid JSON
    # and network failures all mean "no cover found" for this one book; none
    # of them should abort the whole batch.
    except (KeyError, IndexError, TypeError, ValueError, requests.RequestException):
      img = placeholder
    img_lst.append(img)
  return img_lst


In [None]:
abc = image_extractor(raw_books_df.iloc[:11,:])

http://books.google.com/books/content?id=Cxi8swEACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api
http://books.google.com/books/content?id=fVnqpwphxeAC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api
http://books.google.com/books/content?id=7_Blv5sqceoC&printsec=frontcover&img=1&zoom=1&source=gbs_api
http://books.google.com/books/content?id=fCCWWgZ7d6UC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api
http://books.google.com/books/content?id=iPopAQAAMAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api
http://books.google.com/books/content?id=XOjkBwAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api
http://books.google.com/books/content?id=VXR8EAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api
http://books.google.com/books/content?id=fcUZAgAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api
http://books.google.com/books/content?id=MVzOsSFbo6kC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api


In [None]:
abc

['http://books.google.com/books/content?id=Cxi8swEACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api',
 'http://books.google.com/books/content?id=fVnqpwphxeAC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=7_Blv5sqceoC&printsec=frontcover&img=1&zoom=1&source=gbs_api',
 'http://books.google.com/books/content?id=fCCWWgZ7d6UC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=iPopAQAAMAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api',
 'http://books.google.com/books/content?id=XOjkBwAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=VXR8EAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'https://drupal.nypl.org/sites-drupal/default/files/blogs/sJ3CT4V.gif',
 'http://books.google.com/books/content?id=fcUZAgAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/boo