In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()


%matplotlib inline

# Section 1. Cleaning up the books data

Using the json file from UCSD, which contains about 2.36M book entries, clean up the data.

This consists of:
1. Removing non-English and children's books

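This is achieved by keeping only the books whose `language_code` is either one of the accepted English codes (`en`, `enm`, `en-US`, `en-GB`) or empty. The language field is not always correctly labelled, and English books do not necessarily carry a language code (it can be empty), so empty codes are kept as well. If a child-related name (e.g. `children`, `kids-books`) is found among a book's first six shelf names (or tags), the book is removed as well.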

2. Cleaning tags

Tags are redundant and very messy. Previous EDA showed that the 500 most-used tags are enough to cover about 90% of the books (to be confirmed), and that, generally speaking, tags after a book's third one are more error-prone.
   - 2.1 Remove non-informative tags such as `to-read`, etc.
   - 2.2 Recover abbreviations, e.g. `ya` for `young-adult`.


In [2]:
# Some useful methods related to json files

# Reading the input json file
def load_data(inputdata):
    """Decode every line of a gzip-compressed JSON-lines file.

    Parameters
    ----------
    inputdata : str or path-like
        Location of the gzip archive; each line must hold one JSON document.

    Returns
    -------
    list
        The decoded records, in the order they appear in the file.
    """
    with gzip.open(inputdata) as handle:
        # json.loads accepts the raw bytes yielded by the binary gzip stream.
        return [json.loads(raw) for raw in handle]

# Extracting subset of the data
def split_data(inputdata, cutoff):
    """Load only the first ``cutoff`` records of a gzip-compressed JSON-lines file.

    Parameters
    ----------
    inputdata : str or path-like
        Location of the gzip archive; each line must hold one JSON document.
    cutoff : int
        Maximum number of records to return. Zero or a negative value
        yields an empty list.

    Returns
    -------
    list
        At most ``cutoff`` decoded records, in file order.

    Notes
    -----
    The previous implementation compared ``count <= cutoff`` and therefore
    returned ``cutoff + 1`` records; it also kept decompressing the rest of
    the archive after the cutoff had been reached.
    """
    books = []
    with gzip.open(inputdata) as json_file:
        for line in json_file:
            # Stop reading as soon as enough records were collected, so the
            # remainder of a multi-gigabyte archive is never decompressed.
            if len(books) >= cutoff:
                break
            books.append(json.loads(line))
    return books

# get data needed from json
def get_books_data(inputdata):
    """Read an uncompressed JSON-lines file and keep a fixed subset of fields.

    Parameters
    ----------
    inputdata : str or path-like
        Location of a plain-text file holding one JSON book record per line.

    Returns
    -------
    list of dict
        One dict per input line with the keys ``isbn``, ``book_id``,
        ``popular_shelves`` (list of shelf names), ``count_shelves`` (list of
        the matching shelf counts), ``title``, ``num_pages``,
        ``publication_year``, ``average_rating`` and ``ratings_count``.
    """
    records = []
    with open(inputdata) as json_file:
        for raw in json_file:
            entry = json.loads(raw)
            # Split the list of {'name': ..., 'count': ...} shelves into two
            # parallel lists.
            shelf_names = [shelf['name'] for shelf in entry['popular_shelves']]
            shelf_counts = [shelf['count'] for shelf in entry['popular_shelves']]
            records.append({
                'isbn': entry['isbn'],
                'book_id': entry['book_id'],
                'popular_shelves': shelf_names,
                'count_shelves': shelf_counts,
                'title': entry['title'],
                'num_pages': entry['num_pages'],
                'publication_year': entry['publication_year'],
                'average_rating': entry['average_rating'],
                'ratings_count': entry['ratings_count'],
            })
    return records


## 1.1 Getting English books


After converting the json file into a pandas DataFrame, the total number of books in the database is 2,360,655.

In [3]:
def clean_data_language(inputdata, outputfile):
    """Filter a gzip JSON-lines book dump down to English, non-children books.

    A book is kept when its ``language_code`` is one of the accepted English
    codes (or empty) AND none of its first six shelf names is a
    children-related shelf. Kept records are re-serialised, one JSON document
    per line, into ``outputfile``.

    Parameters
    ----------
    inputdata : str or path-like
        Gzip-compressed JSON-lines file; every record must provide
        ``language_code`` and ``popular_shelves`` (a list of dicts with a
        ``name`` key).
    outputfile : str or path-like
        Destination of the uncompressed JSON-lines output (overwritten).

    Returns
    -------
    None
        The result is only written to ``outputfile``.
    """
    # NOTE(review): the notebook prose mentions the code 'eng', which is not
    # in this list — confirm against the dataset whether it should be added.
    eng_lan = {'en', 'enm', 'en-US', 'en-GB', ''}
    child_books = {'child', 'children', 'children-s', 'childrens', 'kids-books', 'childrens-s-books'}

    # Both files are managed by the same `with` so the output handle is closed
    # (and flushed) even if a record fails to parse; the input is opened first
    # so a missing archive does not leave an empty output file behind.
    with gzip.open(inputdata) as json_file, open(outputfile, 'w') as output:
        for line in json_file:
            book = json.loads(line)
            names = [elm['name'] for elm in book['popular_shelves']]
            # Only the six leading shelves are inspected for children tags.
            if book['language_code'] in eng_lan and child_books.isdisjoint(names[:6]):
                output.write('{}\n'.format(json.dumps(book)))



In [4]:
# First get english and non-children books
# NOTE: clean_data_language only writes the kept records to 'books_en_nochild.json'
# and has no return statement, so books_json is None after this call.
books_json = clean_data_language('goodreads_books.json.gz', 'books_en_nochild.json')

After removing these entries, the number of books is now down to 1,156,654.

In [5]:
# Re-read the filtered JSON-lines file, keeping only the fields selected by
# get_books_data, then load the resulting list of dicts into a DataFrame.
books_en = get_books_data('books_en_nochild.json')
booksdf = pd.DataFrame(books_en)

In [6]:
# Number of rows (books) loaded from the filtered file.
len(booksdf)

1156654

## 2.1 Removing unwanted tags

In [8]:
# Substrings that mark a shelf name as bookkeeping ("to-read", "owned",
# "my-books", ...) rather than as a genre/topic tag.
_NOISE_WORDS = ('book', 'read', 'favorite', 'need', 'own', 'shelve', 'like', 'shelf',
                'buy', 'finish', 'kindle', 'list', 'year', 'audio', 'library')
_NOISE_RE = re.compile('|'.join(_NOISE_WORDS))

# Matches a character outside the Latin ranges listed in the class; it is
# applied with .match(), i.e. only the FIRST character of a tag is tested.
_NON_LATIN_RE = re.compile('[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]')


def _is_nasty_tag(tag):
    """Return True when a shelf name should be discarded."""
    return (len(tag) == 1
            or tag.startswith('tbr')
            or _NOISE_RE.search(tag) is not None
            or _NON_LATIN_RE.match(tag) is not None)


def clean_data_nasty_tags(dfin):
    """Drop non-informative shelf tags and their matching counts.

    A tag is removed when it is a single character, starts with ``tbr``,
    contains one of the bookkeeping words (book, read, favorite, need, own,
    shelve, like, shelf, buy, finish, kindle, list, year, audio, library), or
    starts with a character outside the accepted Latin ranges.

    The earlier patterns were written glob-style (e.g. ``'.*buy*'``), which as
    a regex makes the last letter optional: ``'.*buy*'`` matched any tag
    containing "bu" and ``'.*own*'`` any tag containing "ow", so tags such as
    ``business`` or ``slow-burn`` were wrongly discarded. Whole words are now
    searched for instead.

    Parameters
    ----------
    dfin : mapping or DataFrame
        Must provide ``'popular_shelves'`` and ``'count_shelves'``: two
        parallel iterables whose items are, per book, the list of shelf names
        and the list of shelf counts.

    Returns
    -------
    (list of list, list of list)
        The filtered shelf names and the filtered counts, one inner list per
        book, in input order.
    """
    popnew = []
    cnew = []

    for names, counts in zip(dfin['popular_shelves'], dfin['count_shelves']):
        # Positions are collected in a set so both parallel lists can be
        # filtered with O(1) membership tests.
        dropped = {pos for pos, tag in enumerate(names) if _is_nasty_tag(tag)}
        popnew.append([tag for pos, tag in enumerate(names) if pos not in dropped])
        cnew.append([cnt for pos, cnt in enumerate(counts) if pos not in dropped])

    return popnew, cnew

In [None]:
# Filter the shelf tags of every book, then wrap the cleaned lists in Series
# named after the columns they are meant to replace, with a 0..n-1 index.
popnew, cnew = clean_data_nasty_tags(booksdf)
new_column_p = pd.Series(popnew, name='popular_shelves', index=range(len(popnew)))
new_column_c = pd.Series(cnew, name='count_shelves', index=range(len(cnew)))


In [None]:
# Sanity check: flatten the cleaned tag lists and print 'business' once for
# every occurrence that survived the filtering.
#list_tags = alldf['popular_shelves'].to_list()
test = [e for elm in popnew for e in elm]
for i in test:
    if i == 'business':
        print(i)

In [None]:

# Overwrite the two shelf columns of booksdf with the cleaned versions.
# DataFrame.update modifies booksdf in place (it returns None) and aligns the
# Series on the index, using the Series name as the target column.
booksdf.update(new_column_p)
booksdf.update(new_column_c)