In [106]:
# uncomment to install libraries if not previously installed 
#!pip install pandas
#!pip install zipfile
#!pip install kaggle


import pandas as pd
import zipfile
import kaggle

In [107]:
# download shopping trends dataset from Kaggle
!kaggle datasets download -d asaniczka/amazon-kindle-books-dataset-2023-130k-books

amazon-kindle-books-dataset-2023-130k-books.zip: Skipping, found more recently modified local copy (use --force to force download)


In [108]:
# extract contents of zip file in order to access csv
zipfile_name = 'amazon-kindle-books-dataset-2023-130k-books.zip'
with zipfile.ZipFile(zipfile_name, 'r') as file:
    file.extractall()


In [109]:
# read in shopping trends csv
df = pd.read_csv('kindle_data-v2.csv')


In [110]:
# check to see column information and if data types need to be converted to perform analysis
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133102 entries, 0 to 133101
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   asin               133102 non-null  object 
 1   title              133102 non-null  object 
 2   author             132677 non-null  object 
 3   soldBy             123869 non-null  object 
 4   imgUrl             133102 non-null  object 
 5   productURL         133102 non-null  object 
 6   stars              133102 non-null  float64
 7   reviews            133102 non-null  int64  
 8   price              133102 non-null  float64
 9   isKindleUnlimited  133102 non-null  bool   
 10  category_id        133102 non-null  int64  
 11  isBestSeller       133102 non-null  bool   
 12  isEditorsPick      133102 non-null  bool   
 13  isGoodReadsChoice  133102 non-null  bool   
 14  publishedDate      84086 non-null   object 
 15  category_name      133102 non-null  object 
dtypes:

In [111]:
# check for null values in each column in the event of needing imputation or otherwise
df.isnull().sum()

asin                     0
title                    0
author                 425
soldBy                9233
imgUrl                   0
productURL               0
stars                    0
reviews                  0
price                    0
isKindleUnlimited        0
category_id              0
isBestSeller             0
isEditorsPick            0
isGoodReadsChoice        0
publishedDate        49016
category_name            0
dtype: int64

In [112]:
df.shape

(133102, 16)

In [113]:
# creating subset of relevant columns for analysis
my_cols = {'title', 'author', 'stars', 'price', 'isBestSeller', 'category_name', 'isGoodReadsChoice', 'isGoodReadsChoice', 'publishedDate'}

# renaming relevant columns to appropriate names to improve readability
new_cols= {
    'title': 'title',
    'author': 'author',
    'stars': 'rating',
    'price': 'price $',
    'isBestSeller': 'isBestSeller',
    'category_name': 'genre',
    'isGoodReadsChoice': 'isGoodReadsChoice',
    'publishedDate' : 'publishedDate'
}

In [114]:
# indexing desired columns and renaming as per the mapping above
df = df[my_cols]
df.rename(new_cols, axis=1, inplace=True)


In [115]:
# check to see if correct columns were indexed and renamed as stated
df.head()

Unnamed: 0,title,price $,genre,isGoodReadsChoice,rating,author,publishedDate,isBestSeller
0,Adult Children of Emotionally Immature Parents...,9.99,Parenting & Relationships,False,4.8,Lindsay C. Gibson,2015-06-01,True
1,"From Strength to Strength: Finding Success, Ha...",16.99,Parenting & Relationships,False,4.4,Arthur C. Brooks,2022-02-15,False
2,Good Inside: A Guide to Becoming the Parent Yo...,16.99,Parenting & Relationships,False,4.8,Becky Kennedy,2022-09-13,False
3,Everything I Know About Love: A Memoir,9.95,Parenting & Relationships,False,4.2,Dolly Alderton,2020-02-25,False
4,The Seven Principles for Making Marriage Work:...,13.99,Parenting & Relationships,False,4.7,John Gottman,2015-05-05,False


In [116]:
# writing dataframe to excel file to use inside of Tableau to conduct visualisations
df.to_excel('amazon_kindle_final.xlsx', sheet_name='Data')