In [10]:
import pandas as pd
df=pd.read_csv('goodreads_interactions.csv')    # we are reading the csv file which is about 4 GB of size.
df


Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
0,0,948,1,5,0
1,0,947,1,5,1
2,0,946,1,5,0
3,0,945,1,5,0
4,0,944,1,5,0
...,...,...,...,...,...
228648337,876144,24772,0,0,0
228648338,876144,23847,1,4,0
228648339,876144,23950,1,3,0
228648340,876144,374106,1,5,1


In [3]:
import gzip
with gzip.open("goodreads_books.json.gz",'r') as f:
    line=f.readline()

In the above code the size of zipped json file is about approx 2GB, when we extract it is almost 10GB. So to work with this much big json file we need to have more than 25X of RAM, which is atleast 125 GB RAM. So insteasd of loading the entire file at once in the memory we intended to read only on single line at a time so that memory is managed much more efficiently and we can able to work with much lower computation machines.

In [9]:
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [7]:
# importing json function from python for using a load function so that it can display the value in a dictionary format.
import json                 
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [8]:
# we create a dictionary for showing the required columns of book dataset.
# parsing a json file is to read the raw json file. we are converting a json object(txt form) to use it insida a program
# this enitre code parse a single line
def parse_fields(line):    
    data=json.loads(line)   # loads - convert json string to python object
    return{                  # load - used to directly read a json file into python onj
        "book_id":data["book_id"],        
        "title":data["title_without_series"],
        "ratings":data["ratings_count"],
        "url":data["url"],
        "cover_image":data["image_url"]

    }

In [18]:
# this code will parse each and every line
# we are also choosing a book with more ratings instead of lesser ones
books_titles=[]
with gzip.open("goodreads_books.json.gz",'r') as f:
    while True:
        lines=f.readline()
        if not line:
            break
        fields=parse_fields(line)
        
        try:
            ratings=int(fields["ratings"])
        except ValueError:
            continue
        if ratings>20:         # if books has more than 20 ratings then we consider the book for suggestions
            books_titles.append(fields)

In [14]:
import pandas as pd
titles=pd.DataFrame.from_dict(books_titles)

In [15]:
titles["ratings"]=pd.to_numeric(titles["ratings"])

In [16]:
# constructing a search engine
titles["mod_title"]=titles["title"].str.replace("[^a-zA-Z0-9 ]","",regex=True) # regex - Replace each occurrence of pattern/regex in the Series/Index.

In [17]:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
1,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
2,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
3,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
4,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
...,...,...,...,...,...,...
273097,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
273098,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
273099,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
273100,18279130,Minä Zlatan,23,https://www.goodreads.com/book/show/18279130-m...,https://images.gr-assets.com/books/1375540853m...,Min Zlatan
