In [48]:
import os
import random
import pickle
import requests
import faker
import pandas as pd

In [2]:
def download_images(books, timeout=30):
    """Download each book's cover image into ``media/book_covers``.

    Every row's ``image_url`` is fetched and saved as ``<index + 1>.png``,
    where ``index`` is the row's index label.

    Args:
        books: DataFrame with an ``image_url`` column and integer index labels.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    target_dir = os.path.join('media', 'book_covers')
    # Create the directory up front so open() cannot fail on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)

    for index, book in books.iterrows():
        # Fetch before opening the file so a failed request does not leave an
        # empty or truncated image on disk.
        response = requests.get(book.image_url, timeout=timeout)
        # Refuse to store an HTML error page under a .png name.
        response.raise_for_status()

        filename = os.path.join(target_dir, str(index + 1) + '.png')
        with open(filename, 'wb') as file:
            file.write(response.content)

In [3]:
# Load the raw book table from the CSV file for a first look at the data.
books = pd.read_csv('data/books.csv')

In [4]:
# Preview the first rows of the raw table.
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [57]:
def drop_columns(data):
    """Return a copy of ``data`` without the columns the cleaned table does not keep.

    Raises:
        KeyError: If any of the listed columns is missing from ``data``.
    """
    unwanted = [
        'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn', 'title',
        'language_code', 'isbn13', 'ratings_count', 'work_ratings_count',
        'work_text_reviews_count', 'id', 'ratings_1', 'ratings_2', 'ratings_3',
        'ratings_4', 'ratings_5', 'small_image_url',
    ]
    return data.drop(columns=unwanted)

def drop_na(data):
    """Return a copy of ``data`` keeping only rows that have no missing values."""
    complete_rows = data.dropna(axis=0, how='any')
    return complete_rows

def rename_columns(data):
    """Return a copy of ``data`` with the shorter column names used downstream.

    Columns that are not listed in the mapping keep their names.
    """
    new_names = {
        'original_publication_year': 'publication_year',
        'original_title': 'title',
        'average_rating': 'rating',
    }
    return data.rename(columns=new_names)

def year_as_int(data):
    """Return a copy of ``data`` with ``publication_year`` cast to ``int``.

    The input frame is left untouched, so the step can be re-run safely and
    does not silently change a frame another cell still holds.

    Raises:
        KeyError: If ``data`` has no ``publication_year`` column.
        ValueError: If the column holds values that cannot be cast to ``int``
            (for example missing values).
    """
    # assign() builds a new frame and keeps the column in its original position.
    return data.assign(publication_year=data['publication_year'].astype(int))

def drop_empty_images(data):
    """Return ``data`` without the rows whose ``image_url`` contains ``'nophoto'``.

    Rows with a missing ``image_url`` count as non-matching and are kept.
    """
    is_placeholder = data['image_url'].str.contains('nophoto', na = False)
    placeholder_labels = data.loc[is_placeholder].index
    return data.drop(index=placeholder_labels)

def url_to_large_imgs(data):
    """Return a copy of ``data`` whose ``image_url`` values use an ``l`` size suffix.

    The second-to-last path segment of each URL has a trailing ``m`` replaced
    by ``l``. URLs whose segment does not end in ``m`` are left unchanged, and
    the input frame is not modified.

    NOTE(review): presumably ``m`` and ``l`` select medium and large covers on
    the image host -- confirm against the data source.
    """

    def switch_letters(image_url):
        url_parts = image_url.split('/')
        # Only touch the trailing size letter; a blanket replace would also
        # rewrite any other 'm' that happens to be in the segment.
        if len(url_parts) >= 2 and url_parts[-2].endswith('m'):
            url_parts[-2] = url_parts[-2][:-1] + 'l'
        return '/'.join(url_parts)

    # assign() returns a new frame instead of writing into the caller's one.
    return data.assign(image_url=data['image_url'].apply(switch_letters))

def popular_authors_books(data, authors):
    """Collect the rows of ``data`` written by any of the given authors.

    For every name in ``authors`` (in iteration order), the rows whose
    ``authors`` string contains that name are selected, and the selections
    are stacked into one frame with a fresh 0..n-1 index.

    A row that matches several names appears once per matching name.

    Args:
        data: DataFrame with a string ``authors`` column.
        authors: Iterable of author names to look for.

    Returns:
        A new DataFrame with the same columns as ``data``.
    """
    matches = [
        # regex=False: names such as 'J.K. Rowling' contain '.', which would
        # otherwise act as a wildcard. na=False treats a missing author as a
        # non-match instead of breaking the boolean mask.
        data[data['authors'].str.contains(author, regex=False, na=False)]
        for author in authors
    ]
    if not matches:
        # pd.concat rejects an empty list; keep the empty-result shape instead.
        return pd.DataFrame(columns=data.columns)
    # One concat replaces the repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    return pd.concat(matches, ignore_index=True)

def image_url_to_path(data):
    """Replace the ``image_url`` column with a leading ``cover_img`` path column.

    Row number ``k`` (1-based, by position) gets ``book_covers/<k>.png``. The
    input frame is not modified, so the step can be re-run on the same frame.

    Raises:
        KeyError: If ``data`` has no ``image_url`` column.
    """
    cover_paths = ['book_covers/' + str(x) + '.png' for x in range(1, len(data) + 1)]
    # drop() returns a new frame, so the insert below never touches the caller's
    # frame; inserting into ``data`` directly would fail on a second call.
    output = data.drop('image_url', axis=1)
    output.insert(0, 'cover_img', cover_paths)
    return output
    

""" Utils """

def get_top_authors(data, *, n=200):
    """Return the ``n`` authors credited on the most rows of ``data``.

    Each ``authors`` value is split on ``', '`` and every name is counted once
    per row it appears in. Rows with a missing ``authors`` value are skipped.

    Args:
        data: DataFrame with a string ``authors`` column.
        n: Maximum number of author names to return.

    Returns:
        A dict keys view of author names, most frequent first. Authors with
        equal counts keep the order in which they were first seen.
    """
    counts = {}
    # Count from the frame that was passed in, not from a notebook global.
    for credited in data['authors'].dropna().tolist():
        for author in credited.split(', '):
            counts[author] = counts.get(author, 0) + 1

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Honour ``n`` rather than a fixed cut-off.
    return dict(ranked[:n]).keys()

In [58]:
# Re-read the raw table so this cell starts from the same input on every run.
books = pd.read_csv('data/books.csv')
# The author ranking is computed on the raw table, before any rows are dropped.
authors = get_top_authors(books)

# Cleaning pipeline: each step takes a frame and returns the next one.
# 'title' is dropped first and 'original_title' is then renamed to 'title'.
books = (books
    .pipe(drop_columns) 
    .pipe(drop_na)
    .pipe(rename_columns) 
    .pipe(year_as_int) 
    .pipe(drop_empty_images)
    .pipe(url_to_large_imgs)
    .pipe(popular_authors_books, authors)
    .pipe(image_url_to_path)
)

# Display the cleaned table.
books

Unnamed: 0,cover_img,authors,publication_year,title,rating
0,book_covers/1.png,James Patterson,2005,The Angel Experiment,4.08
1,book_covers/2.png,James Patterson,2006,School's Out Forever,4.16
2,book_covers/3.png,"James Patterson, Keith David, Anthony Heald",1997,Cat & Mouse,3.96
3,book_covers/4.png,"James Patterson, Maxine Paetro",2006,The 5th Horseman,4.03
4,book_covers/5.png,"James Patterson, Maxine Paetro",2007,The 6th Target,4.02
...,...,...,...,...,...
2225,book_covers/2226.png,"Brian Herbert, Kevin J. Anderson",2000,Dune: House Harkonnen,3.63
2226,book_covers/2227.png,"Brian Herbert, Kevin J. Anderson",2002,Dune: The Machine Crusade,3.73
2227,book_covers/2228.png,Kevin J. Anderson,1994,Jedi Search (Star Wars: The Jedi Academy Trilo...,3.64
2228,book_covers/2229.png,"Brian Herbert, Kevin J. Anderson",2001,Dune: House Corrino,3.64


In [60]:
# Write the cleaned table to data/books_cleaned.csv without the index column.
books.to_csv(os.path.join('data', 'books_cleaned.csv'), index=False)

In [25]:
# Store the author names as an alphabetically sorted list in data/authors.pickle.
with open(os.path.join('data', 'authors.pickle'), 'wb') as file:
    pickle.dump(sorted(list(authors)), file)

In [67]:
# Show the author names in alphabetical order.
sorted(list(authors))

['Abbi Glines',
 'Agatha Christie',
 'Alexander McCall Smith',
 'Ally Carter',
 'Anita Shreve',
 'Anne McCaffrey',
 'Anne Rice',
 'Anne Tyler',
 'Anonymous',
 'Anthony Horowitz',
 'Arthur C. Clarke',
 'Arthur Conan Doyle',
 'Bernard Cornwell',
 'Beverly Cleary',
 'Bill Bryson',
 'Bill Watterson',
 'Bill Willingham',
 'Brad Thor',
 'Brandon Sanderson',
 'Brett Helquist',
 'Brian Jacques',
 'Brian K. Vaughan',
 'C.S. Lewis',
 'Carl Hiaasen',
 'Cassandra Clare',
 'Cecelia Ahern',
 'Charlaine Harris',
 'Charles Bukowski',
 'Charles Dickens',
 'Charlie Adlard',
 'Christine Feehan',
 'Christopher Moore',
 'Chuck Palahniuk',
 'Cliff Rathburn',
 'Clive Barker',
 'Clive Cussler',
 'Colleen Hoover',
 'D.J. MacHale',
 'Daniel Silva',
 'Darren Shan',
 'David Baldacci',
 'David Eddings',
 'Dean Koontz',
 'Dennis Lehane',
 'Diana Gabaldon',
 'Douglas Adams',
 'Douglas Preston',
 'Dr. Seuss',
 'Edgar Allan Poe',
 'Elin Hilderbrand',
 'Eoin Colfer',
 'Erin Hunter',
 'Ernest Hemingway',
 'Francine Rive

In [42]:
# Take the first row of `books` as a plain {column: value} dict.
book = dict(next(books.iterrows())[1])