## First split it into workable chunks
You need the .gz files from https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

In [5]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import unicodedata as ud

In [1]:
# Configuration for reducing "goodreads_books.json.gz": keep only the books
# whose language code is one of the English variants below and delete the
# columns that are not needed afterwards.
lang = ['en', 'en-CA', 'en-US', 'en-GB', 'eng']
toDel = ['country_code', 'is_ebook', 'kindle_asin',  'link', 'url','publication_day', 'publication_month', 'num_pages', 'popular_shelves', 'publisher', 'title_without_series', 'format']


def load_data(file_name, start, end):
    """Load the English books found on lines ``[start, end)`` of a gzip file.

    Each line of ``file_name`` is parsed as one JSON object. Records whose
    ``language_code`` is listed in ``lang`` are kept, with every key named in
    ``toDel`` removed. The number of kept records is printed.

    Args:
        file_name: Path to the gzip-compressed, line-delimited JSON file.
        start: Zero-based index of the first line to consider.
        end: Zero-based index of the line at which reading stops (exclusive).

    Returns:
        A list of dicts, one per kept record.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line_no, line in enumerate(fin):
            if line_no < start:
                continue
            if line_no >= end:
                break
            record = json.loads(line)
            # .get(): a record without a language code is simply not English.
            if record.get('language_code') not in lang:
                continue
            for key in toDel:
                # Default of None: a record that lacks one of the unwanted
                # columns must not abort the whole load with a KeyError.
                record.pop(key, None)
            data.append(record)
    print(len(data))
    return data


DIR = './'

In [None]:
# The commented-out calls load the books in slices of 500,000 lines; enable
# one of them instead of the full-range call when memory is tight.
# books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'),0,500000)
# books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'),500000,1000000)
# books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'),1000000,1500000)
# books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'),1500000,2000000)
# books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'),2000000, 2370000)
books = load_data(
    os.path.join(DIR, 'goodreads_books.json.gz'),
    start=0,
    end=2370000,
)

In [None]:
# Store the reduced books inside the GoodReadsData directory. The path needs
# the "/" separator so the file ends up where the cleaning cells read it from
# ('../../GoodReadsData/goodreadsBooks.json').
with open('../../GoodReadsData/goodreadsBooks.json', 'w', encoding='utf-8') as f:
    # json.dump streams to the file instead of first building one huge string.
    json.dump(books, f, ensure_ascii=False)

### Genres and authors don't need to be split, so let's just store them

In [None]:
def load_all_data(file_name):
    """Parse every line of a gzip file as JSON and return the parsed objects.

    The number of parsed lines is printed before returning.

    Args:
        file_name: Path to the gzip-compressed, line-delimited JSON file.

    Returns:
        A list holding one parsed object per line, in file order.
    """
    with gzip.open(file_name) as fin:
        records = [json.loads(raw_line) for raw_line in fin]
        print(len(records))
    return records


DIR = './'

In [None]:
genre = load_all_data(os.path.join(DIR, 'goodreads_book_genres_initial.json.gz'))
# The path needs the "/" separator so the file ends up inside GoodReadsData,
# where the cleaning cells read it from.
with open('../../GoodReadsData/goodreadsGenre.json', 'w', encoding='utf-8') as f:
    # json.dump streams to the file instead of first building one huge string.
    json.dump(genre, f, ensure_ascii=False)


In [None]:
author = load_all_data(os.path.join(DIR, 'goodreads_book_authors.json.gz'))
# The path needs the "/" separator so the file ends up inside GoodReadsData
# next to the other extracted files.
with open('../../GoodReadsData/goodreadsAuthors.json', 'w', encoding='utf-8') as f:
    # json.dump streams to the file instead of first building one huge string.
    json.dump(author, f, ensure_ascii=False)

### Here we split the 15+ million reviews into 4 parts of at most 4 million reviews each

In [4]:
# Review columns that are not needed afterwards and are deleted while loading.
toDel = ['review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments', 'review_id']


def load_data(file_name, start, end):
    """Load the reviews found on lines ``[start, end)`` of a gzip file.

    Each line of ``file_name`` is parsed as one JSON object and every key
    named in ``toDel`` is removed from it. The number of loaded records is
    printed.

    Args:
        file_name: Path to the gzip-compressed, line-delimited JSON file.
        start: Zero-based index of the first line to load.
        end: Zero-based index of the line at which reading stops (exclusive).

    Returns:
        A list of dicts, one per loaded line.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line_no, line in enumerate(fin):
            if line_no < start:
                continue
            if line_no >= end:
                break
            record = json.loads(line)
            for key in toDel:
                # Default of None: a record that lacks one of the unwanted
                # columns must not abort the whole load with a KeyError.
                record.pop(key, None)
            data.append(record)
    print(len(data))
    return data


DIR = './'

# Load the deduplicated reviews in four slices of at most 4,000,000 lines and
# write every slice to its own file (goodreadsReview1.json ...
# goodreadsReview4.json). Assigning all four slices to `review` one after the
# other and writing once would only ever store the last slice.
reviews_file = os.path.join(DIR, 'goodreads_reviews_dedup.json.gz')
review_chunk_size = 4000000
for part in range(4):
    review = load_data(
        reviews_file,
        part * review_chunk_size,
        (part + 1) * review_chunk_size,
    )
    part_file = '../../GoodReadsData/goodreadsReview' + str(part + 1) + '.json'
    with open(part_file, 'w', encoding='utf-8') as f:
        # json.dump streams to the file instead of building one huge string.
        json.dump(review, f, ensure_ascii=False)

## Now let's clean each part of the data set

In [2]:
# Load the reduced books file into a DataFrame for the cleaning steps below.
books = pd.read_json("../../GoodReadsData/goodreadsBooks.json")

In [4]:
# Get rid of the books without a title; those rows also lack the other data.
books = books.loc[books['title'].ne("")]


In [None]:
# Detect Latin letters in titles, because some books labelled as English are
# actually written in another language.
# Cache of character -> result, so each distinct character is looked up once.
latin_letters = {}


def is_latin(uchr):
    """Return whether the Unicode name of ``uchr`` contains 'LATIN'.

    Results are memoised in ``latin_letters``.

    Args:
        uchr: A single-character string.

    Returns:
        True if the character's Unicode name contains 'LATIN', else False.
        Characters that have no Unicode name count as not Latin.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # The '' default stops ud.name from raising ValueError on code points
        # that have no name; such characters are treated as non-Latin.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))

def only_roman_chars(unistr):
    """Return True when every alphabetic character of ``unistr`` is Latin.

    Non-alphabetic characters (digits, spaces, punctuation) are ignored, so a
    string without any alphabetic character yields True.
    """
    for char in unistr:
        if char.isalpha() and not is_latin(char):
            return False
    return True

In [None]:
# Keep only the books whose title contains no non-Latin letters. A single
# boolean mask replaces the row-by-row drop, which rebuilt the frame once per
# removed row and became extremely slow on hundreds of thousands of books.
books = books[books["title"].map(only_roman_chars).astype(bool)]

In [None]:
# Sorting in descending order is meant to place duplicates with an empty isbn
# or isbn13 below the ones that do have those codes, so that the rows without
# a code are the ones flagged as duplicates.
books = books.reset_index(drop=True).sort_values(
    by=['isbn', 'isbn13', 'title'],
    ascending=False,
)

# Remove duplicates. Two books may share a title while having a different
# publication_year, so both columns together identify a duplicate.
books = books.drop_duplicates(subset=['publication_year', 'title'])
print("Length after deleting duplicate: " + str(len(books)))
books = books.sort_values(by=['title']).reset_index(drop=True)
# The language column is no longer needed once the books are filtered.
books.pop("language_code")

### Depending on your memory usage, you can store it in a new JSON file


In [None]:
# Write the cleaned books back to disk as a list of JSON records.
books.to_json('../../GoodReadsData/goodreadsBooks.json',orient='records')
# Restart the kernel, run the imports block, and then continue from the next block.

In [None]:
books = pd.read_json('../../GoodReadsData/goodreadsBooks.json')
bookIds = books['book_id'].to_list()

### OR ###

# bookIds = books['book_id'].to_list()

# Drop the genre rows whose book_id no longer exists in the cleaned books.
genres = pd.read_json("../../GoodReadsData/goodreadsGenre.json")
genres = genres[genres['book_id'].isin(bookIds)]
genres = genres.reset_index(drop=True)

# Then find all the books without genres (an empty genres dict is falsy).
hasGenre = genres['genres'].map(bool).astype(bool)
noGenre = genres.loc[~hasGenre, 'book_id'].to_list()

# Keep only the genre rows that actually list a genre.
genres = genres[hasGenre]
genres = genres.sort_values(by=['book_id']).reset_index(drop=True)

# Remove every book without a genre. We would scrape them, but with the given
# time that is impossible. Filtering on the remaining genre rows also removes
# books that have no genre row at all, so both tables cover the same book ids.
books = books[books['book_id'].isin(genres['book_id'])]
books = books.sort_values(by=['book_id']).reset_index(drop=True)

# This might be an unneeded step, but it makes the genres easier to use for
# ML: turn every genres dict into a flat list of names, splitting combined
# keys such as "a, b" into their single names.
newGenre = [
    [single for key in obj for single in key.split(", ")]
    for obj in genres['genres'].tolist()
]

genres["categories"] = newGenre
genres = genres.drop(['genres'], axis=1)

# Attach the categories by book_id rather than by row position, so a length
# or ordering difference between the two tables cannot mislabel a book.
categoriesById = dict(zip(genres['book_id'], newGenre))
books['categories'] = books['book_id'].map(categoriesById.get)

genres.to_json('../../GoodReadsData/goodreadsGenre.json',orient='records')
books.to_json('../../GoodReadsData/goodreadsBooks.json',orient='records')

## Depending on your memory usage, you may want to restart the kernel, run the imports block, and continue from this line

In [None]:
# Let's clean the reviews: since the roughly 2 million books were reduced to
# roughly 700k books, there should also be fewer reviews.

books = pd.read_json('../../GoodReadsData/goodreadsBooks.json')
bookIds = books['book_id'].to_list()

### OR ###

# bookIds = books['book_id'].to_list()

review1 = pd.read_json("../../GoodReadsData/goodreadsReview1.json")
review2 = pd.read_json("../../GoodReadsData/goodreadsReview2.json")
review3 = pd.read_json("../../GoodReadsData/goodreadsReview3.json")
review4 = pd.read_json("../../GoodReadsData/goodreadsReview4.json")

# Keep only the reviews of books that survived the cleaning. The id list is
# named `bookIds`; the previous `bookid` was undefined.
review1 = review1[review1["book_id"].isin(bookIds)]
review2 = review2[review2["book_id"].isin(bookIds)]
review3 = review3[review3["book_id"].isin(bookIds)]
review4 = review4[review4["book_id"].isin(bookIds)]

# Combine every part exactly once (part 1 used to be appended to itself).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
reviews = pd.concat([review1, review2, review3, review4], ignore_index=True)

reviews.to_json("../../GoodReadsData/goodreadsReviews.json", orient='records')