# Goodreads descriptive analysis

In [4]:
import gzip
import json
import numpy as np
import pandas as pd
import os
# Show floats with a thousands separator whenever pandas renders them.
pd.options.display.float_format = '{:,}'.format

import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "whitegrid" theme for the figures created below.
sns.set_style("whitegrid")

# Presumably a random seed; it is not referenced in the visible cells.
RANDOM = 2021

# Directory the generated figures are saved into.
figure_path = '/home/weiss/git/thesis/doc/figures/'

# Directory holding the Goodreads data files that are read below.
DIR = '/home/weiss/rs_data/goodreads/'

In [5]:
def count_lines(file_name):
    """Return how many lines the gzip-compressed file ``file_name`` holds.

    A progress message is printed before the scan and another once it ends.
    """
    print('counting file:', file_name)
    with gzip.open(file_name) as handle:
        # Stream the file; only the running total is kept in memory.
        total = sum(1 for _ in handle)
    print('done!')
    return total

In [None]:
# Count the records of each metadata dump (books, works, authors, series),
# in that order, by counting the lines of the corresponding file under DIR.
n_book, n_work, n_author, n_series = [
    count_lines(os.path.join(DIR, dump_name))
    for dump_name in (
        'goodreads_books.json.gz',
        'goodreads_book_works.json.gz',
        'goodreads_book_authors.json.gz',
        'goodreads_book_series.json.gz',
    )
]

counting file: /home/weiss/rs_data/goodreads/goodreads_books.json.gz


In [None]:
def load_data(file_name, head=500):
    """Load records from a gzip-compressed, line-delimited JSON file.

    Each line of the file is parsed with ``json.loads``.

    Parameters
    ----------
    file_name : path of the ``.json.gz`` file to read.
    head : maximum number of records to return, taken from the start of the
        file.  ``None`` loads every record; ``0`` returns an empty list.

    Returns
    -------
    list
        The parsed records in file order, at most ``head`` of them.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line in fin:
            # Check the limit before parsing so exactly `head` records are
            # returned (the previous check ran after the append and let one
            # extra record through).
            if head is not None and len(data) >= head:
                break
            data.append(json.loads(line))
    return data

In [None]:
# Gather the four line counts into a one-column summary table and print it.
stat_labels = ['# book', '# work', '# author', '# series']
stat_values = [n_book, n_work, n_author, n_series]
df_book_stats = pd.DataFrame(
    stat_values,
    index=stat_labels,
    columns=['count'],
    dtype=float,
)
print(df_book_stats)

In [None]:
# Load the first review records (load_data's default limit applies) and
# pick one of them at random for inspection.
reviews_file = os.path.join(DIR, 'goodreads_reviews_dedup.json.gz')
reviews = load_data(reviews_file)
np.random.choice(reviews)

In [None]:
# The interaction data analysed below is read from 'goodreads_interactions.csv'.

PATH_IN = os.path.join(DIR, 'goodreads_interactions.csv')
# Reads the whole CSV into one in-memory DataFrame (no chunking).
df_interactions = pd.read_csv(PATH_IN)

In [None]:
# Overview of the interactions table: a preview, its element count
# (DataFrame.size, i.e. rows x columns) and the distinct user/book totals.
print('=== first 5 records ===')
preview = df_interactions.head()
print(preview)
print(f'size {df_interactions.size}')
for label, column in (('users', 'user_id'), ('books', 'book_id')):
    distinct = df_interactions[column].unique()
    print(f'unique {label} {len(distinct)}')

In [None]:
print('=== Any empty cells ===')
# One boolean per column: True when that column has at least one missing value.
columns_with_nulls = df_interactions.isnull().any()
print(columns_with_nulls)

In [None]:
print('=== duplicated records ===')
# keep=False marks every row of a repeated (user_id, book_id) pair,
# not just the later occurrences.
is_duplicated = df_interactions.duplicated(subset=['user_id', 'book_id'], keep=False)
print(df_interactions[is_duplicated])

In [None]:
# Number of interaction rows per user, largest first.
rating_count_by_user = (
    df_interactions
    .groupby('user_id')
    .size()
    .sort_values(ascending=False)
)
print(rating_count_by_user)  # contains unrealistic users (way too many ratings)
size = rating_count_by_user.size

# Replace the user ids by their rank (1..size) so the curve is drawn
# against the position in the sorted order.
df = pd.DataFrame({
    'sorted_user_id': range(1, size + 1),
    'rating_counts': rating_count_by_user.values,
})

image = sns.relplot(x="sorted_user_id", y="rating_counts", data=df,
                    kind="line", height=4, aspect=1.5)

axis_labels = {
    "xlabel": "users sorted desc. by # ratings",
    "ylabel": "# ratings",
}
image.set(**axis_labels)

image.savefig(figure_path + '/goodreads-long-tail-distribution.png', dpi=300, bbox_inches='tight')

In [None]:
# Keep only the rows of users whose number of rows in df_interactions lies
# in the range [n, m]; both bounds are inclusive.

n = 50
m = 1000

# Rows per user, indexed by user_id.
counts = df_interactions['user_id'].value_counts()
mask = (counts >= n) & (counts <= m)
# How many users are kept (True) and how many are dropped (False).
print(mask.value_counts())
# `mask[mask]` selects the True entries directly; its index holds the kept user ids.
df_interactions = df_interactions[df_interactions['user_id'].isin(mask[mask].index)]

In [None]:
# Same overview as before, now for the current (filtered) interactions table:
# a preview, its element count (rows x columns) and distinct user/book totals.
print('=== first 5 records ===')
preview = df_interactions.head()
print(preview)
print(f'size {df_interactions.size}')
for label, column in (('users', 'user_id'), ('books', 'book_id')):
    distinct = df_interactions[column].unique()
    print(f'unique {label} {len(distinct)}')

In [None]:
# Number of interaction rows per user, largest first, for the current table.
rating_count_by_user = (
    df_interactions
    .groupby('user_id')
    .size()
    .sort_values(ascending=False)
)
print(rating_count_by_user)
print(type(rating_count_by_user))
size = rating_count_by_user.size
print(size)

# Replace the user ids by their rank (1..size) so the curve is drawn
# against the position in the sorted order.
df = pd.DataFrame({
    'sorted_user_id': range(1, size + 1),
    'rating_counts': rating_count_by_user.values,
})

image = sns.relplot(x="sorted_user_id", y="rating_counts", data=df,
                    kind="line", height=4, aspect=1.5)

axis_labels = {
    "xlabel": "users sorted desc. by # ratings",
    "ylabel": "# ratings",
}
image.set(**axis_labels)

image.savefig(figure_path + '/goodreads-long-tail-distribution-cut.png', dpi=300, bbox_inches='tight')

In [None]:
# rating distribution

# Bar chart of how many rows carry each rating value, drawn with the plain
# 'white' style instead of the notebook-wide theme.
with sns.axes_style('white'):
    # `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed;
    # the plotted column is passed by keyword because current seaborn no
    # longer accepts it positionally.
    g = sns.catplot(x="rating", data=df_interactions, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")
    g.savefig(figure_path + '/goodreads-rating-distribution.png', dpi=300, bbox_inches='tight')