In [1]:
# Work around kernel crashes in Jupyter Notebook (original note: "Prevent Large File crashing").
# NOTE(review): KMP_DUPLICATE_LIB_OK is an OpenMP runtime switch that tolerates more than one
# copy of the OpenMP runtime being loaded; presumably it has to be set before the numeric
# libraries are imported -- confirm this is the crash being worked around.
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [None]:
import pandas as pd
import numpy as np
#import vaex, numpy as np
from matplotlib.font_manager import FontProperties
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
import datetime
import warnings 
#sns.set_style("darkgrid",{"axes.axisbelow" : False })
warnings.simplefilter('ignore')
import string

# Data Exploration

In [None]:
# Load the Amazon and Goodreads metadata samples from CSV files in the working directory.
amazon, gr = (
    pd.read_csv(csv_path)
    for csv_path in ('am_metadata_sample.csv', 'gr_metadata_sample.csv')
)

### Checking missing values

In [None]:
# Number of missing values in each column of the Amazon sample.
amazon.isna().sum()

In [None]:
# Number of missing values in each column of the Goodreads sample.
gr.isna().sum()

# Plot the distribution of the average rating 

In [None]:
# Summary statistics of the Amazon sample's numeric columns.
amazon.describe()

In [None]:
# Summary statistics of the Goodreads sample's numeric columns.
gr.describe()

In [None]:
# Side-by-side histograms of the per-book average rating on Amazon and Goodreads,
# each with a dashed vertical line at the mean rating.
fig, ax = plt.subplots(1, 2, figsize=(15, 4))

# (axes, ratings, title) per panel, so both panels are drawn with identical settings.
hist_panels = [
    (ax[0], amazon.average, 'Distribution of Amazon Average Rating'),
    (ax[1], gr.average_rating, 'Distribution of Goodreads Average Rating'),
]

for panel_ax, ratings, panel_title in hist_panels:
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour of
    # sns.histplot; kept because the installed seaborn version is not visible here.
    sns.distplot(ratings, hist=True, kde=False,
                 bins=28, color='blue',
                 hist_kws={'edgecolor': 'black'},
                 ax=panel_ax)
    panel_ax.set_title(panel_title)
    # The mean is taken directly from the column instead of building a full describe() table.
    panel_ax.axvline(ratings.mean(), color='darkblue', linestyle='--')

    # Uniform labels, ticks and y-range for easier side-by-side comparison.
    panel_ax.set_xlabel('Average rating')
    panel_ax.set_ylabel('Frequency (Number of books)')
    panel_ax.set_xticks(np.arange(1, 5.5, 0.5))
    # set_ylim takes (bottom, top). The third positional argument used previously was not
    # a limit (it landed on `emit`) and is rejected where `emit` is keyword-only.
    panel_ax.set_ylim(0, 1000)

plt.show()

In [None]:
# Side-by-side bar charts of how many books fall on each whole-star rating (1-5)
# after rounding the average rating, for Amazon and Goodreads.
fig, ax = plt.subplots(1, 2, figsize=(15, 4))

star_levels = list(range(1, 6))

# (axes, ratings, title) per panel, so both panels are drawn with identical settings.
star_panels = [
    (ax[0], amazon.average, 'Distribution of Amazon Average Rating by Number of Stars'),
    (ax[1], gr.average_rating, 'Distribution of Goodreads Average Rating by Number of Stars'),
]

for panel_ax, ratings, panel_title in star_panels:
    # Round to whole stars and cast to int so the plotted values are the same integers as
    # `star_levels`; float values such as 4.0 may not match the integer order if the
    # plotting library compares categories by their string form. Missing ratings are
    # dropped first because NaN cannot be cast to int.
    stars = np.round(ratings, 0).dropna().astype(int)
    sns.countplot(x=stars, order=star_levels, color='lightskyblue', ax=panel_ax)
    panel_ax.set_title(panel_title)

    # Uniform labels and y-range for easier side-by-side comparison.
    panel_ax.set_xlabel('Average rating (grouped by number of stars)')
    panel_ax.set_ylabel('Frequency (Number of books)')
    # set_ylim takes (bottom, top). The third positional argument used previously was not
    # a limit (it landed on `emit`) and is rejected where `emit` is keyword-only.
    panel_ax.set_ylim(0, 5000)

plt.show()

### Average rating vs number of reviews

In [None]:
# Amazon: average rating (x) against rating count (y), with the mean rating marked
# by a dashed vertical line.
# NOTE(review): the y-axis is labelled 'Number of reviews' but plots `rating_count`, while the
# Goodreads counterpart plots `total_text_reviews_count` -- confirm the intended column.
fig, ax = plt.subplots()
ax.scatter(amazon['average'], amazon['rating_count'])
ax.axvline(amazon['average'].mean(), 0, color='darkblue', linestyle='--')
ax.set_title('Amazon Average Rating vs Number of Reviews', size=13)
ax.set_xlabel('Average rating')
ax.set_ylabel('Number of reviews')
ax.set_xticks(np.arange(1, 5.5, 0.5))
plt.show()

In [None]:
# Goodreads: average rating (x) against text-review count (y), with the mean rating marked
# by a dashed vertical line.
fig, ax = plt.subplots()
ax.scatter(gr['average_rating'], gr['total_text_reviews_count'])
ax.axvline(gr['average_rating'].mean(), 0, color='darkblue', linestyle='--')
ax.set_title('Goodreads Average Rating vs Number of Reviews', size=13)
ax.set_xlabel('Average rating')
ax.set_ylabel('Number of reviews')
ax.set_xticks(np.arange(1, 5.5, 0.5))
plt.show()

## Rating difference: Amazon rating − Goodreads rating

In [None]:
# Keep only the books present in both samples, matched on their ASIN.
amazon_gr = amazon.merge(gr, how='inner', on='asin')

In [None]:
# Rating difference per book: Amazon average minus Goodreads average.
amazon_gr['rating_diff'] = amazon_gr['average'].sub(amazon_gr['average_rating'])

In [None]:
# Columns carried over from the merged frame into the comparison table, in display order.
comparison_columns = [
    'asin',
    'rating_diff',
    'average',
    'rating_count',
    'text_reviews_count_x',
    'total_ratings_count',
    'total_text_reviews_count',
    'average_rating',
    'genres',
]
am_gr = pd.DataFrame(amazon_gr, columns=comparison_columns)

In [None]:
# Give each column a name that says which site it comes from.
site_column_names = {
    'average': 'amazon_average',
    'rating_count': 'amazon_rating_count',
    'text_reviews_count_x': 'amazon_text_reviews_count',
    'total_ratings_count': 'gr_rating_count',
    'total_text_reviews_count': 'gr_text_reviews_count',
    'average_rating': 'gr_average',
}
am_gr = am_gr.rename(columns=site_column_names)

In [None]:
# Display the combined Amazon/Goodreads comparison table (output of the cell's last expression).
am_gr

In [None]:
# Summary statistics of the comparison table's numeric columns.
am_gr.describe()

In [None]:
# Histogram of the per-book rating difference (Amazon minus Goodreads), restricted to
# the range [-2, 2], with a dashed vertical line at the mean difference.
ax = sns.distplot(
    am_gr['rating_diff'],
    hist=True,
    kde=False,
    bins=28,
    color='blue',
    hist_kws={'edgecolor': 'black'},
)
ax.set_xlabel('Average rating difference')
ax.set_ylabel('Frequency (Number of books)')
ax.set_title('Distribution of Difference in Ratings between Amazon and Goodreads')
ax.set_xlim(-2, 2)
ax.axvline(am_gr['rating_diff'].mean(), 0, color='darkblue', linestyle='--')
plt.show()

In [None]:
# Combined counts across both sites: Amazon count plus Goodreads count, per book.
am_gr['rating_count'] = am_gr['amazon_rating_count'].add(am_gr['gr_rating_count'])
am_gr['text_reviews_count'] = am_gr['amazon_text_reviews_count'].add(am_gr['gr_text_reviews_count'])
am_gr

In [None]:
# Rating difference (x) against the combined number of ratings (y), with the mean
# difference marked by a dashed vertical line.
fig, ax = plt.subplots()
ax.scatter(am_gr['rating_diff'], am_gr['rating_count'])
ax.axvline(am_gr['rating_diff'].mean(), 0, color='darkblue', linestyle='--')
ax.set_title('Rating Difference vs Total Number of Ratings', size=17)
ax.set_xlabel('Average rating difference', size=10)
ax.set_ylabel('Total number of ratings (Amazon + Goodreads)', size=10)
plt.show()

In [None]:
# Rating difference (x) against the combined number of text reviews (y), with the mean
# difference marked by a dashed vertical line.
fig, ax = plt.subplots()
ax.scatter(am_gr['rating_diff'], am_gr['text_reviews_count'])
ax.axvline(am_gr['rating_diff'].mean(), 0, color='darkblue', linestyle='--')
ax.set_title('Rating Difference vs Total Number of Text Reviews', size=17)
ax.set_xlabel('Average rating difference', size=10)
ax.set_ylabel('Total number of text reviews (Amazon + Goodreads)', size=10)
plt.show()

In [None]:
#genres = am_gr.genres

In [None]:
# Disabled draft: the triple-quoted string below is not executed; the cell only evaluates to
# the string itself. The quoted code would count how often each comma-separated genre occurs.
# It relies on `genres`, whose only assignment in this notebook is itself commented out.
'''unique_genres = dict()
for genre in genres:
    genre_split = genre.split(',')
    for g in genre_split:
        if g in unique_genres:
            unique_genres[g] += 1
        else:
            unique_genres[g] = 1'''

In [None]:
#unique_genres

In [None]:
# Disabled draft: the triple-quoted string below is not executed; the cell only evaluates to
# the string itself. The quoted code would keep the genres whose count exceeds 1000, reading
# `unique_genres`, which is only built inside another disabled (string-literal) cell.
'''filtered_genres = dict()
for k,v in unique_genres.items():
    if v > 1000:
        filtered_genres[k] = v'''

# Book Metadata

In [None]:
# Load the book metadata and display it.
# NOTE(review): this reads 'gr_metadata_sample.csv', the same file already loaded into `gr`
# earlier in the notebook -- confirm whether a separate book-metadata file was intended.
book = pd.read_csv('gr_metadata_sample.csv')
book

In [None]:
# Summary statistics of the book metadata's numeric columns.
book.describe()

In [None]:
# Attach the book metadata to the comparison table, matched on ASIN (books in both only).
book_am_gr = am_gr.merge(book, how='inner', on='asin')

In [None]:
# Violin plot of the rating difference (Amazon minus Goodreads) for each book format.
fig, ax = plt.subplots(figsize=(8, 12))
# The figure size is set on the figure above. `figsize` is not a violinplot option, so the
# former `figsize=(20,50)` argument was dropped: it is ignored or rejected depending on the
# seaborn version.
sns.violinplot(x="rating_diff", y="format", data=book_am_gr, ax=ax)
ax.set_xlabel('Average rating difference')
ax.set_ylabel('Book format')
ax.set_title('Rating Difference vs Book Format')
plt.show()

In [None]:
# Violin plot of the Amazon average rating for each book format.
fig, ax = plt.subplots(figsize=(8, 12))
# The figure size is set on the figure above. `figsize` is not a violinplot option, so the
# former `figsize=(20,50)` argument was dropped: it is ignored or rejected depending on the
# seaborn version.
sns.violinplot(x="amazon_average", y="format", data=book_am_gr, ax=ax)
ax.set_xlabel('Amazon average rating')
ax.set_ylabel('Book format')
ax.set_title('Amazon Average Rating vs Book Format')
plt.show()

In [None]:
# Violin plot of the Goodreads average rating for each book format.
fig, ax = plt.subplots(figsize=(8, 12))
# The figure size is set on the figure above. `figsize` is not a violinplot option, so the
# former `figsize=(20,50)` argument was dropped: it is ignored or rejected depending on the
# seaborn version.
sns.violinplot(x="gr_average", y="format", data=book_am_gr, ax=ax)
ax.set_xlabel('Goodreads average rating')
ax.set_ylabel('Book format')
ax.set_title('Goodreads Average Rating vs Book Format')
plt.show()

In [None]:
# Replace the merge suffixes on the two genres columns with explicit names.
genre_column_names = {'genres_x': 'amazon_genres', 'genres_y': 'gr_genres'}
book_am_gr = book_am_gr.rename(columns=genre_column_names)

In [None]:
# Violin plot of the rating difference (Amazon minus Goodreads) for each book format.
# NOTE(review): this repeats the earlier 'Rating Difference vs Book Format' plot; the column
# rename in between only touches the genres columns, which are not plotted here -- confirm
# whether the repeat is intentional.
fig, ax = plt.subplots(figsize=(8, 12))
# The figure size is set on the figure above. `figsize` is not a violinplot option, so the
# former `figsize=(20,50)` argument was dropped: it is ignored or rejected depending on the
# seaborn version.
sns.violinplot(x="rating_diff", y="format", data=book_am_gr, ax=ax)
ax.set_xlabel('Average rating difference')
ax.set_ylabel('Book format')
ax.set_title('Rating Difference vs Book Format')
plt.show()