# Importing the file
#### Reading through gzip so that it doesn't take a lot of memory.

In [1]:
import gzip

# Peek at the data: read just the first line of the gzip-compressed dump
# instead of loading the whole file.
with gzip.open(r"C:\Users\91702\Downloads\goodreads_books.json.gz") as f:
    # No mode is passed to gzip.open, so the line is returned as bytes.
    line = f.readline()

In [2]:
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

### Importing json

In [3]:
import json

# Decode the raw first line into a dict to see which fields a record carries.
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

### Writing a function to select the desired fields

In [4]:
def parse_fields(line):
    """Extract the handful of fields this notebook needs from one JSON record.

    Parameters
    ----------
    line : str or bytes
        A single JSON-encoded book record.

    Returns
    -------
    dict
        Keys ``book_id``, ``title``, ``ratings``, ``url`` and ``cover_image``,
        with the values copied unchanged from the record.

    Raises
    ------
    KeyError
        If the record lacks one of the source fields.
    """
    record = json.loads(line)
    # (output key, key in the source record)
    wanted = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

### Keeping only books with a rating count above 15

In [5]:
# Stream the gzip-compressed dump one record at a time and keep only the
# books whose rating count is above the threshold.
MIN_RATINGS = 15  # exclusive lower bound on the rating count
books_titles = []
with gzip.open(r"C:\Users\91702\Downloads\goodreads_books.json.gz") as f:
    # Iterating the file object yields one line per step and stops at EOF,
    # so no manual readline()/break bookkeeping is needed.
    for line in f:
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # Skip records whose rating count cannot be parsed as an integer.
            continue
        if ratings > MIN_RATINGS:
            books_titles.append(fields)

#### Creating a DataFrame

In [6]:
import pandas as pd
# Build the frame from the list of per-book dicts. The plain constructor takes
# a list of records directly; DataFrame.from_dict is documented for dict input.
titles = pd.DataFrame(books_titles)

In [7]:
titles.head()  # NOTE: not rendered — a cell only displays its last expression
titles["ratings"].dtype  # still object dtype: the rating counts were read as strings

dtype('O')

In [8]:
# Convert the rating counts from strings to numbers.
titles["ratings"] = pd.to_numeric(titles["ratings"])
titles["ratings"].dtype

dtype('int64')

####  Preprocessing the book title, to avoid search problems

In [9]:
#  titles will be like:
# HARRY POTTER
# Harry Potter
# harry potter
# Harry-PottER
# without preprocessing, the model will treat these as different things
# replacing any character that is not a letter or digit (such as ( - / .) with a space

In [10]:
# Replace every character that is not an ASCII letter or digit with a space.
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9]"," ",regex=True)

In [11]:
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,The Unschooled Wizard Sun Wolf and Starhawk ...
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,Best Friends Forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,The Aeneid for Boys and Girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,All s Fairy in Love and War Avalon Web of Ma...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,The Devil s Notebook


In [12]:
# Lower-case the cleaned titles so differently-cased spellings compare equal.
titles["mod_title"] = titles["mod_title"].str.lower()

In [13]:
# Collapse every run of whitespace into a single space.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [14]:
# Drop titles that are empty after cleaning.
# Strip before measuring: a title made up only of replaced characters ends up
# as a single space rather than "", and would otherwise pass the length check.
titles = titles[titles["mod_title"].str.strip().str.len() > 0]

In [15]:
# Write the cleaned frame to book_titles.json in the current working directory.
titles.to_json("book_titles.json")

In [16]:
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 1 2
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,all s fairy in love and war avalon web of magi...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devil s notebook


###  Importing vectorizer
Term frequency–inverse document frequency (TF-IDF) is used to convert titles into numbers so that they can be compared. It's useful in search engines.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with its default settings.
vectorizer = TfidfVectorizer()

In [18]:
# Fit the vectorizer on the cleaned titles and build their TF-IDF matrix.
# search() later compares query vectors against this matrix.
tfidf = vectorizer.fit_transform(titles["mod_title"])


#### Importing cosine similarity 

In [19]:
# to compare search query and mod title we use cosine similarity
# Using numpy arg partition to choose 10 most similar titles

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

In [39]:
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab."""
    anchor_template = '<a target="_blank" href="{}">Goodreads</a>'
    return anchor_template.format(val)
def show_image(val):
    """Return HTML for a 50px-wide thumbnail of ``val`` that links to the image itself."""
    thumbnail_template = '<a href="{}"><img src="{}" width=50></img></a>'
    image_url = val
    return thumbnail_template.format(image_url, image_url)

def search(query,vectorizer):
    """Find the books whose cleaned titles best match a free-text query.

    The query is lower-cased and its non-alphanumeric characters are replaced
    by spaces, then it is vectorized and compared by cosine similarity with
    the module-level ``tfidf`` matrix. The (up to) 10 most similar rows of the
    module-level ``titles`` frame are re-ranked by ``ratings`` and the top 5
    are returned.

    Parameters
    ----------
    query : str
        Free-text title search.
    vectorizer
        Object whose ``transform`` turns a list of strings into vectors
        comparable with ``tfidf`` (in this notebook, the fitted
        ``TfidfVectorizer``).

    Returns
    -------
    pandas Styler
        At most 5 rows, with ``url`` rendered through ``make_clickable`` and
        ``cover_image`` through ``show_image``.
    """
    processed = re.sub("[^a-zA-Z0-9]", " ", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # np.argpartition raises when asked for more candidates than there are
    # rows, so cap the candidate count at the number of titles.
    top_k = min(10, similarity.size)
    indices = np.argpartition(similarity, -top_k)[-top_k:]
    # Positions in `similarity` line up with row positions in `titles`,
    # hence iloc rather than loc.
    results = titles.iloc[indices].sort_values("ratings", ascending=False)
    return results.head(5).style.format({'url' : make_clickable,'cover_image':show_image})


In [50]:
# Example query: show the best-rated close matches for "Sherlock".
search("Sherlock",vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
95878,20662423,Sherlock: Chronicles,702,Goodreads,,sherlock chronicles
56267,100964,Sherlock in Love,401,Goodreads,,sherlock in love
811567,17316533,Sherlock Bones 1,245,Goodreads,,sherlock bones 1
376790,30280513,"Sherlock: A Study in Pink (Sherlock, #1)",203,Goodreads,,sherlock a study in pink sherlock 1
1187288,29223627,"Sherlock: The Blind Banker (Sherlock, #2)",138,Goodreads,,sherlock the blind banker sherlock 2


In [51]:
# Identifiers of liked books, kept as strings.
# NOTE(review): presumably Goodreads book_id values — "20662423" matches
# "Sherlock: Chronicles" in the search output; confirm the others.
liked_books = [
    "818056",
    "22010842",
    "16050719",
    "18114797",
    "893172",
    "20662423",
]