# Goodreads descriptive analysis

In [37]:
import gzip
import json
import numpy as np
import pandas as pd
import os
# Render floats with thousands separators in pandas output.
pd.options.display.float_format = '{:,}'.format

import seaborn as sns
# Style and font scale are set in a single call: a bare sns.set(font_scale=...)
# issued after sns.set_style("whitegrid") re-applies seaborn's default style
# and silently discards the whitegrid choice.
sns.set(style="whitegrid", font_scale=1.8)

# NOTE(review): looks like a random seed -- confirm where it is consumed.
RANDOM = 2021

# Output directory for saved figures (machine-specific absolute path).
figure_path = '/home/weiss/git/thesis/doc/figures/'

# Directory containing the Goodreads data files (machine-specific absolute path).
DIR = '/home/weiss/rs_data/goodreads/'

In [38]:
def count_lines(file_name):
    """Return how many lines the gzip-compressed file ``file_name`` holds.

    A short progress message is printed before and after the scan.
    """
    print('counting file:', file_name)
    with gzip.open(file_name) as stream:
        # Iterate lazily so the file is never held in memory as a whole.
        total = sum(1 for _ in stream)
    print('done!')
    return total

In [39]:
#n_book = count_lines(os.path.join(DIR, 'goodreads_books.json.gz'))
#n_work = count_lines(os.path.join(DIR, 'goodreads_book_works.json.gz'))
#n_author = count_lines(os.path.join(DIR, 'goodreads_book_authors.json.gz'))
#n_series = count_lines(os.path.join(DIR, 'goodreads_book_series.json.gz'))

In [40]:
def load_data(file_name, head = 500):
    """Load records from a GZIP compressed JSON-lines file.

    Parameters
    ----------
    file_name : path of the gzip file; every line must hold one JSON document.
    head : maximum number of records to read from the start of the file.
        ``None`` reads the whole file.

    Returns
    -------
    list with one decoded JSON value per line that was read.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line in fin:
            # Test the limit before parsing so that at most `head` records
            # are returned, never `head + 1`.
            if head is not None and len(data) >= head:
                break
            data.append(json.loads(line))
    return data

In [41]:
#df_book_stats = pd.DataFrame([n_book, n_work, n_author, n_series], dtype = float,
#                             columns = ['count'],
#                             index = ['# book', '# work', '# author', '# series'])
#print(df_book_stats)

In [42]:
# Load the leading reviews (load_data's default `head` limit applies) and show
# one of them. Drawing from a RandomState seeded with RANDOM keeps the displayed
# record the same on every run without touching the global NumPy RNG.
reviews = load_data(os.path.join(DIR, 'goodreads_reviews_dedup.json.gz'))
np.random.RandomState(RANDOM).choice(reviews)

{'user_id': '01ec1a320ffded6b2dd47833f2c8e4fb',
 'book_id': '31450633',
 'review_id': 'e1b920bc35c0df165f97094281148eab',
 'rating': 4,
 'review_text': '4.5 Stars \n "If you can\'t trust yourself, who can you trust?" \n OMG!! Intelligent and thrilling page-turner that I read in almost one sitting! The rather abrupt ending was a little unsatisfying BUT the clever story kept me guessing. \n I think most people do something or fail to take action that may cause some guilt. Well this story begins with Cass taking a shortcut late at night and failing to stop and help a woman who is stopped on the side of the road. \n The motorist ends up dead. Cass feels guilty, but just as significant are the little things she begins to forget...how to run the washing machine, making dinner plans with friends, and even rather significant purchases. Her mother suffered from early-onset Alzheimer\'s. What is happening to Cass? \n Cass\'s mental well-being soon spirals out of control and she fears she will su

In [None]:
# Interactions are read from 'goodreads_interactions.csv' located in DIR.

PATH_IN = os.path.join(DIR, 'goodreads_interactions.csv')
df_interactions = pd.read_csv(filepath_or_buffer=PATH_IN)

In [None]:
# Overview of the loaded interactions: sample rows plus basic cardinalities.
print('=== first 5 records ===')
print(df_interactions.head())
# len() gives the number of rows; DataFrame.size would be rows x columns.
print(f'size {len(df_interactions)}')
print(f'unique users {df_interactions.user_id.unique().size}')
print(f'unique books {df_interactions.book_id.unique().size}')

In [None]:
# Per column: is there at least one missing value?
print('=== Any empty cells ===')
print(df_interactions.isna().any(axis=0))

In [None]:
# Show every row whose (user_id, book_id) pair occurs more than once;
# keep=False flags all members of a duplicate group, not only the repeats.
print('=== duplicated records ===')
is_repeated = df_interactions.duplicated(subset=['user_id', 'book_id'], keep=False)
print(df_interactions.loc[is_repeated])

In [None]:
# Long-tail plot: number of interactions per user, users ordered from the
# most to the least active.
rating_count_by_user = df_interactions.groupby('user_id').size().sort_values(ascending=False)
print(rating_count_by_user)  # contains unrealistic users (way too many ratings)
size = rating_count_by_user.size

# sorted_user_id is the 1-based rank of each user in the descending order.
df = pd.DataFrame(dict(sorted_user_id=range(1,size+1), rating_counts=rating_count_by_user.values))

sns.set(style='whitegrid', font_scale=1.5, rc={'xtick.labelsize':12, 'ytick.labelsize':12})
image = sns.relplot(data=df,
                    x="sorted_user_id",
                    y="rating_counts",
                    kind="line",
                    height=4,
                    aspect=1.5
                    )

image.set(xlabel="users sorted desc. by # ratings",
          ylabel="# ratings"
          )

# os.path.join copes with figure_path with or without a trailing separator,
# so the saved path never contains a doubled '/'.
image.savefig(os.path.join(figure_path, 'goodreads-long-tail-distribution.png'), dpi=300, bbox_inches='tight')

In [None]:
# rating distribution before cleanup

sns.set(style='whitegrid', font_scale=1.3)
# catplot is the current name of the former factorplot; the column is passed
# as the keyword argument x because newer seaborn rejects it positionally.
g = sns.catplot(x="rating", data=df_interactions, kind='count')
g.set_ylabels("Total number of ratings")
# os.path.join avoids a doubled '/' when figure_path ends with a separator.
g.savefig(os.path.join(figure_path, 'goodreads-rating-distribution-with-null.png'), dpi=300, bbox_inches='tight')

In [None]:
# Sample rows, row count (length of the rating column) and cardinalities.
print('=== first 5 records ===')
print(df_interactions.head())
print(f'size {len(df_interactions["rating"])}')
print(f'unique users {len(df_interactions["user_id"].unique())}')
print(f'unique books {len(df_interactions["book_id"].unique())}')

In [None]:
# Drop every interaction whose rating equals 0, then display the result.
has_rating = df_interactions['rating'].ne(0)
df_interactions = df_interactions[has_rating]
df_interactions

In [None]:
# rating distribution after cleanup
sns.set(style='whitegrid', font_scale=1.3)
# catplot is the current name of the former factorplot; the column is passed
# as the keyword argument x because newer seaborn rejects it positionally.
g = sns.catplot(x="rating", data=df_interactions, kind='count')
g.set_ylabels("# ratings")
# os.path.join avoids a doubled '/' when figure_path ends with a separator.
g.savefig(os.path.join(figure_path, 'goodreads-rating-distribution.png'), dpi=300, bbox_inches='tight')

In [None]:
# Keep entries of users who have rated at least n and at most m items
# (both bounds are inclusive).

n = 50
m = 1000

# counts: number of rows per user_id; mask: True for users inside [n, m].
counts = df_interactions['user_id'].value_counts()
mask = (counts >= n) & (counts <= m)
print(mask.value_counts())
# mask is indexed by user_id, so mask[mask].index lists the users to keep.
df_interactions = df_interactions[df_interactions['user_id'].isin(mask[mask].index)]

In [None]:
# Sample rows, row count (length of the rating column) and cardinalities.
print('=== first 5 records ===')
print(df_interactions.head())
print(f'size {df_interactions["rating"].shape[0]}')
print(f'unique users {len(df_interactions["user_id"].unique())}')
print(f'unique books {len(df_interactions["book_id"].unique())}')

In [None]:
# Long-tail plot of the current df_interactions: number of interactions per
# user, users ordered from the most to the least active.
rating_count_by_user = df_interactions.groupby('user_id').size().sort_values(ascending=False)
print(rating_count_by_user)
print(type(rating_count_by_user))
size = rating_count_by_user.size
print(size)

# sorted_user_id is the 1-based rank of each user in the descending order.
df = pd.DataFrame(dict(sorted_user_id=range(1,size+1), rating_counts=rating_count_by_user.values))

sns.set(style='whitegrid', font_scale=1.3, rc={'xtick.labelsize':12, 'ytick.labelsize':12})
image = sns.relplot(data=df,
                    x="sorted_user_id",
                    y="rating_counts",
                    kind="line",
                    )

image.set(xlabel="users sorted desc. by # ratings",
          ylabel="# ratings"
          )


# os.path.join copes with figure_path with or without a trailing separator,
# so the saved path never contains a doubled '/'.
image.savefig(os.path.join(figure_path, 'goodreads-long-tail-distribution-cut.png'), dpi=300, bbox_inches='tight')


In [None]:
# Arithmetic mean of the rating column in the current df_interactions.
df_interactions.rating.mean()

In [None]:
# Median of the rating column in the current df_interactions.
df_interactions.rating.median()