In [18]:
# Peek at the first record of the books dump. Despite the earlier "csv"
# wording, the file is gzip-compressed and each line holds one JSON object.

import gzip 

# gzip.open in "r" mode reads binary data, so `line` is a bytes object.
with gzip.open("goodreads_books.json.gz", "r") as f:
  line = f.readline()

In [19]:
# Display the raw bytes of the first record.
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [20]:
import json

# Decode the first record to inspect which fields a book entry carries.
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [21]:
# Reduce one raw JSON record to the handful of fields the search needs.
def parse_fields(line):
  """Parse one JSON-encoded book record and keep only the needed fields.

  Parameters
  ----------
  line : str or bytes
      A single JSON object, as read from one line of the books dump.

  Returns
  -------
  dict
      Keys ``book_id``, ``title``, ``ratings``, ``url`` and ``cover_image``,
      with the values copied unchanged from the record.

  Raises
  ------
  KeyError
      If the record lacks one of the source fields.
  """
  record = json.loads(line)
  # (output key, key in the raw record), in output order
  wanted = (
      ("book_id", "book_id"),
      ("title", "title_without_series"),
      ("ratings", "ratings_count"),
      ("url", "url"),
      ("cover_image", "image_url"),
  )
  return {out_key: record[src_key] for out_key, src_key in wanted}

In [22]:
# Stream the gzipped books dump (one JSON record per line) and collect the
# trimmed-down record of every book that has more than 15 ratings.

books_titles = []

with gzip.open("goodreads_books.json.gz", 'r') as f:
  # Iterating the file object yields one line at a time and stops at end of
  # file, which replaces the manual readline()/break loop.
  for line in f:
    fields = parse_fields(line)

    try:
      ratings = int(fields["ratings"])
    except ValueError:
      # ratings count is not a valid integer string; skip this record
      continue
    if ratings > 15:
      books_titles.append(fields)
    

In [23]:
# Build the titles table: one row per collected book, one column per field.

import pandas as pd

titles = pd.DataFrame(books_titles)

In [24]:
# The ratings counts were kept as the strings found in the JSON records;
# convert the column to a numeric dtype so it can be sorted by value later.

titles["ratings"] = pd.to_numeric(titles["ratings"])

In [25]:
# Build a normalized copy of each title to shrink the search space: drop every
# character that is not an ASCII letter, digit, or space.
# mod_title values will be used for retrieving values within the search engine.
# The pattern has to be a negated character class, "[^...]". Without the
# opening bracket it only matches the literal text "a-zA-Z0-9 ]" at the start
# of a title, so no special character was ever removed.

titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [26]:
# Lower-case the normalized titles; search() lower-cases the query as well,
# so matching does not depend on capitalization.

titles["mod_title"] = titles["mod_title"].str.lower()

In [27]:
# Collapse every run of whitespace into a single space.
# The pattern is a raw string: "\s" inside a normal string literal is an
# invalid escape sequence, which Python warns about and plans to reject.

titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [28]:
# Remove rows whose normalized title has nothing to search on.
# Surrounding whitespace is stripped before measuring the length, so a title
# consisting only of spaces is treated as empty too (a plain len() > 0 check
# would keep it).

titles = titles[titles["mod_title"].str.strip().str.len() > 0]

In [36]:
import pickle
import pickletools
import gzip 

# Pickle the titles frame, shrink the pickle with pickletools.optimize, and
# write the bytes gzip-compressed to ml/titles.pkl.
with gzip.open("ml/titles.pkl", "wb") as f:
    f.write(pickletools.optimize(pickle.dumps(titles)))

In [30]:
# The vectorizer creates a Term Frequency - Inverse Document Frequency matrix.
# It is fitted on the normalized titles; `tfidf` is the resulting matrix that
# search queries are later compared against.

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])


In [37]:
import pickle
import pickletools
import gzip 

# Persist the fitted vectorizer and the TF-IDF matrix: each object is pickled,
# shrunk with pickletools.optimize, and written gzip-compressed to its file.
for path, obj in (("ml/vectorizer.pkl", vectorizer), ("ml/tfidf.pkl", tfidf)):
    with gzip.open(path, "wb") as f:
        f.write(pickletools.optimize(pickle.dumps(obj)))

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab.

    ``val`` is inserted into the ``href`` attribute as-is (no escaping).
    """
    link_template = '<a target="_blank" href="{}">Goodreads</a>'
    return link_template.format(val)

def show_image(val):
    """Return an HTML ``<img>`` tag, 50 pixels wide, whose source is ``val``.

    ``val`` is inserted into the ``src`` attribute as-is (no escaping).
    """
    image_template = '<img src="{}" width=50></img>'
    return image_template.format(val)
def search(query):
    """Return up to five titles that best match ``query``, most-rated first.

    The query is lower-cased, stripped of every character that is not an ASCII
    letter, digit or space, and has runs of whitespace collapsed. It is then
    vectorized with the fitted ``vectorizer`` and compared against every row
    of ``tfidf`` by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text title to look for.

    Returns
    -------
    The rows of ``titles`` for the 10 most similar titles, sorted by
    ``ratings`` in descending order and cut down to the first 5.
    """
    # Clean the query. The character class needs its opening "[^": without it
    # the pattern only matches the literal text "a-zA-Z0-9 ]", so punctuation
    # was never removed from the query.
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", processed)

    # Project the query into the TF-IDF space of the fitted vectorizer.
    query_vec = vectorizer.transform([processed])

    # One similarity score per row of `tfidf`.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Positions of the 10 largest similarity values (not ordered among themselves).
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]

    # Rank those candidates by ratings count, highest first.
    results = results.sort_values("ratings", ascending=False)

    return results.head(5)

In [33]:
# Example query: the best matches for "Lost World", most-rated first.
search("Lost World")

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
296281,11052224,The Lost World,2199,https://www.goodreads.com/book/show/11052224-t...,https://images.gr-assets.com/books/1327872290m...,the lost world
65052,227286,A World Lost,620,https://www.goodreads.com/book/show/227286.A_W...,https://s.gr-assets.com/assets/nophoto/book/11...,a world lost
38643,6506153,The Lost World,140,https://www.goodreads.com/book/show/6506153-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the lost world
952478,1191487,Lost World,96,https://www.goodreads.com/book/show/1191487.Lo...,https://s.gr-assets.com/assets/nophoto/book/11...,lost world
1117526,367293,Lost World,93,https://www.goodreads.com/book/show/367293.Los...,https://s.gr-assets.com/assets/nophoto/book/11...,lost world
