In [1]:
# Load necessary libraries
import urllib.request, urllib.parse
from urllib.error import HTTPError, URLError
import json
import pandas as pd
import requests
import numpy as np
import re

In [2]:
# Read my Books dataset which I had retrieved & cleaned from Wikipedia webpage
books_df = pd.read_csv("Website_Books_cleaned.csv")
len_df = books_df.shape[0]                          # Calculate the number of rows in the csv file

In [3]:
def get_data(book_name, book_author):
    url = "https://www.googleapis.com/books/v1/volumes?key=AIzaSyDE-6s4_rLLaOpJorMwpmRYYZfKatU3bhk&q=intitle:"+book_name+"+"+book_author

    querystring = ("")
    # Set header for the http request
    headers = {'cache-control':'no-cache'}
    # connect and receive the response back from API
    response = requests.request("GET", url, headers=headers, params=querystring)
    if response.status_code == 200:
        # Parse JSON response to text
        response_text = json.loads(response.text)
        return response_text
    else:
        return None

In [4]:
def build_dataframe(json_data):
    """
    Call to this function will build dataframe from json_data
    """
    
    # Define empty dictionary with headers of your choice
    MF_dict = {'Book_Title':[], 'Author':[], 'Publisher':[], 'Published_Date':[], 'ISBN_10':[], 'ISBN_13':[], 'Page_Count':[], 
             'Average_Rating':[], 'Rating_Count':[], 'Sale_Country':[], 'Sale_Amount':[], 'Sale_Currency':[]}
    
    # Loop for each dictionary element in JSON list
    try:
        for d in json_data['items']:
        
            MF_dict['Book_Title'].append(d['volumeInfo']['title'])
        
            try:
                MF_dict['Author'].append(d['volumeInfo']['authors'])
            except KeyError:
                MF_dict['Author'].append(np.nan)

            try:
                MF_dict['Publisher'].append(d['volumeInfo']['publisher'])
            except KeyError:
                MF_dict['Publisher'].append(np.nan)

            try:
                MF_dict['Published_Date'].append(d['volumeInfo']['publishedDate'])
            except KeyError:
                MF_dict['Published_Date'].append(np.nan)

            try:
                for e in (d['volumeInfo']['industryIdentifiers']):
                    if e['type'] == 'ISBN_10':
                        MF_dict['ISBN_10'].append(e['identifier'])
                for e in (d['volumeInfo']['industryIdentifiers']):
                    if e['type'] == 'ISBN_13':
                        MF_dict['ISBN_13'].append(e['identifier'])
            except KeyError:
                MF_dict['ISBN_10'].append(np.nan)
                MF_dict['ISBN_13'].append(np.nan)

            try:
                MF_dict['Page_Count'].append(d['volumeInfo']['pageCount'])
            except KeyError:
                MF_dict['Page_Count'].append(np.nan)

            try:
                MF_dict['Average_Rating'].append(d['volumeInfo']['averageRating'])
            except KeyError:
                MF_dict['Average_Rating'].append(np.nan)

            try:
                MF_dict['Rating_Count'].append(d['volumeInfo']['ratingsCount'])
            except KeyError:
                MF_dict['Rating_Count'].append(np.nan)

            if d['saleInfo']['saleability'] == 'NOT_FOR_SALE':
                MF_dict['Sale_Country'].append(np.nan)
                MF_dict['Sale_Amount'].append(np.nan)
                MF_dict['Sale_Currency'].append(np.nan)
            else:
                try:
                    MF_dict['Sale_Country'].append(d['saleInfo']['country'])
                    MF_dict['Sale_Amount'].append(d['saleInfo']['listPrice']['amount'])
                    MF_dict['Sale_Currency'].append(d['saleInfo']['listPrice']['currencyCode'])
                except KeyError:
                    MF_dict['Sale_Country'].append(np.nan)
                    MF_dict['Sale_Amount'].append(np.nan)
                    MF_dict['Sale_Currency'].append(np.nan)

        df = pd.DataFrame.from_dict(MF_dict, orient='index')  # Build Dataframe from dictionary with index
        return df
    except KeyError:
        df = pd.DataFrame.from_dict(MF_dict, orient='index')  # Build Empty Dataframe
        return df

In [5]:
# Get Data from API
i = 0

# Perform a for loop to get data for all the books in the csv file
for x in range(0, len_df-1):
    book_name = books_df['Book_Name'][x]
    book_author = books_df['Author_Name'][x]

# Format the book name to remove unnessary data & spaces from title
    book_name = re.sub("[\(\[].*?[\)\]]", "", book_name)
    book_name = book_name.rstrip()
    book_name = book_name.lstrip()
    
    book_author = re.sub("[\(\[].*?[\)\]]", "", book_author)
    book_author = book_author.rstrip()
    book_author = book_author.lstrip()
    
    data = get_data(book_name, book_author)
    
    if data == None:
        print('Data not found for book: ', book_name)
        i = i + 1
    else:
        df1 = build_dataframe(data)
        if df1.empty:
            pass
        else:
# Format data into a more readable format using transpose()
            df2 = df1.transpose()
            if x == 0:
                df3 = df2
            else:
                df3 = df3.append(df2, ignore_index=True)

In [6]:
print('Total Number of books not found on Google API = ', i)

Total Number of books not found on Google API =  0


In [7]:
df3

Unnamed: 0,Book_Title,Author,Publisher,Published_Date,ISBN_10,ISBN_13,Page_Count,Average_Rating,Rating_Count,Sale_Country,Sale_Amount,Sale_Currency
0,A Tale of Two Cities,[Charles Dickens],,1902,0791092933,9780791092934,324,4,160,US,,
1,Charles Dickens's A Tale of Two Cities,[Harold Bloom],Infobase Publishing,2007-01,1853260398,9781853260391,131,,,,,
2,A Tale of Two Cities,[Charles Dickens],Wordsworth Editions,1993,041528760X,9780415287609,307,3.5,3,,,
3,Charles Dickens's A Tale of Two Cities,[Ruth F. Glancy],Psychology Press,2006,087891949X,9780878919499,174,,,,,
4,Works of Charles Dickens: A tale of two cities,[Charles Dickens],,1867,1650377665,9781650377667,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1551,Spark Notes - Angela's Ashes,[Frank McCourt],Spark Publishing Group,2002-07-15,1586634690,9781586634698,84,,,,,
1552,A Study Guide for 'Angela's Ashes' by Frank Mc...,[Carol Alexander],,2002,1553199898,9781553199892,31,,,,,
1553,Angela's Ashes: (Accelerated Reader),[Frank McCourt],,,1586634690,9781586634698,364,,,,,
1554,Angela's Ashes - Literature Kit Gr. 9-12,[Paul Bramley],Classroom Complete Press,2012-12-06,,,57,,,US,9.99,USD


In [8]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1556 entries, 0 to 1555
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Book_Title      1521 non-null   object
 1   Author          1459 non-null   object
 2   Publisher       1161 non-null   object
 3   Published_Date  1473 non-null   object
 4   ISBN_10         1277 non-null   object
 5   ISBN_13         1287 non-null   object
 6   Page_Count      1344 non-null   object
 7   Average_Rating  606 non-null    object
 8   Rating_Count    606 non-null    object
 9   Sale_Country    416 non-null    object
 10  Sale_Amount     381 non-null    object
 11  Sale_Currency   381 non-null    object
dtypes: object(12)
memory usage: 146.0+ KB


In [9]:
# Identify & Remove bad data
df3 = df3[df3['Book_Title'].notna()]
df3 = df3[df3['Author'].notna()]
df3 = df3[df3['ISBN_13'].notna()]
df3 = df3[df3['ISBN_10'].notna()]

In [10]:
df3

Unnamed: 0,Book_Title,Author,Publisher,Published_Date,ISBN_10,ISBN_13,Page_Count,Average_Rating,Rating_Count,Sale_Country,Sale_Amount,Sale_Currency
0,A Tale of Two Cities,[Charles Dickens],,1902,0791092933,9780791092934,324,4,160,US,,
1,Charles Dickens's A Tale of Two Cities,[Harold Bloom],Infobase Publishing,2007-01,1853260398,9781853260391,131,,,,,
2,A Tale of Two Cities,[Charles Dickens],Wordsworth Editions,1993,041528760X,9780415287609,307,3.5,3,,,
3,Charles Dickens's A Tale of Two Cities,[Ruth F. Glancy],Psychology Press,2006,087891949X,9780878919499,174,,,,,
4,Works of Charles Dickens: A tale of two cities,[Charles Dickens],,1867,1650377665,9781650377667,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1549,QBD Angela's Ashes,[Frank McCourt],Fourth Estate,2016-09-01,0007956304,9780007956302,,,,,,
1550,Angela's Ashes and 'Tis,[Frank McCourt],Scribner,2000-11-01,0743204018,9780743204019,736,5,1,,,
1551,Spark Notes - Angela's Ashes,[Frank McCourt],Spark Publishing Group,2002-07-15,1586634690,9781586634698,84,,,,,
1552,A Study Guide for 'Angela's Ashes' by Frank Mc...,[Carol Alexander],,2002,1553199898,9781553199892,31,,,,,


In [11]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1239 entries, 0 to 1553
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Book_Title      1239 non-null   object
 1   Author          1239 non-null   object
 2   Publisher       1006 non-null   object
 3   Published_Date  1218 non-null   object
 4   ISBN_10         1239 non-null   object
 5   ISBN_13         1239 non-null   object
 6   Page_Count      1130 non-null   object
 7   Average_Rating  545 non-null    object
 8   Rating_Count    545 non-null    object
 9   Sale_Country    335 non-null    object
 10  Sale_Amount     320 non-null    object
 11  Sale_Currency   320 non-null    object
dtypes: object(12)
memory usage: 125.8+ KB


**I like the dataframe currently extracted and I haven't cleaned it a lot since I wouldn't want to drop rows which would be a match when I try to join the above dataframe with the data extracted from the website and the flat file data.**

**I am planning on using the ISBN number match between my flat file data & the above dataframe and use fuzzy matching and match the book titles from all three sources.**

**Hence I am skeptical of doing any further data wrangling clean up on this dataframe.**

In [12]:
# Write Final Dataframe into CSV file
df3.to_csv('API_Books_cleaned.csv')