In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import dask.dataframe as dd
import os
from dotenv import load_dotenv
# Populate the process environment via python-dotenv, then read the three
# Cloudflare settings. Each constant is None when its variable is not set.
load_dotenv()

CLOUDFLARE_ENDPOINT_URL = os.environ.get('CLOUDFLARE_ENDPOINT_URL')
CLOUDFLARE_ACCESS_KEY_ID = os.environ.get('CLOUDFLARE_ACCESS_KEY_ID')
CLOUDFLARE_SECRET_ACCESS_KEY = os.environ.get('CLOUDFLARE_SECRET_ACCESS_KEY')

# None removes pandas' display limit for rows and for columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


In [5]:
import boto3
from botocore.config import Config


## Load Data

## Books

In [None]:
# Load the books table from its parquet snapshot.
# Provenance, per earlier revisions of this cell: data/goodreads_books.json was
# read with pd.read_json(lines=True), cached as data/books_df.pkl, and then
# exported to parquet.
books_df = pd.read_parquet("../data/books_df.parquet")

# Heading line followed by the `authors` column of the first five rows.
print("\n\nAuthors for first 5 records:", books_df['authors'].head(), sep="\n")



## Interactions CSV

In [None]:
# Load the raw interactions CSV (ids here are the CSV-local integer ids).
# The path uses "../data", the same directory the id-map CSVs and the books
# parquet in this notebook are read from; the previous "data/" prefix resolved to
# a different directory under the same working directory.
interactions_df = pd.read_csv("../data/goodreads_interactions.csv")

In [None]:
# Load the two lookup tables used to translate CSV-local ids into real ids.
book_id_map_df, user_id_map_df = map(
    pd.read_csv,
    ("../data/book_id_map.csv", "../data/user_id_map.csv"),
)


In [None]:
# Build id -> real-id lookup Series (indexed by the CSV-local id).
user_id_lookup = user_id_map_df.set_index('user_id_csv')['user_id']
book_id_lookup = book_id_map_df.set_index('book_id_csv')['book_id']

# Overwrite both id columns with their translated values. Ids missing from a
# lookup become NaN.
# NOTE(review): this overwrites the columns it reads, so running the cell a
# second time maps already-translated ids again; re-load interactions_df first.
interactions_df['user_id'] = interactions_df['user_id'].map(user_id_lookup)
interactions_df['book_id'] = interactions_df['book_id'].map(book_id_lookup)

In [None]:
# Persist the id-mapped interactions next to the other datasets. "../data" is the
# directory this notebook's loaders read from (e.g. "../data/book_id_map.csv");
# the previous "data/" prefix pointed at a different directory.
interactions_df.to_parquet("../data/interactions_df.parquet")

In [None]:
# Preview the first five interaction rows.
interactions_df.head(n=5)

In [None]:
# Number of interaction rows.
interactions_df.shape[0]

## Interactions Dedup JSON

In [None]:
# Convert the line-delimited interactions JSON to parquet chunk by chunk,
# recording progress so an interrupted run can pick up where it stopped.
#
# Each chunk is written as its own part file inside `output_path`, which is a
# directory. A single pq.ParquetWriter cannot support resuming: it creates its
# target file afresh every time it is opened, so a restarted run would discard
# all chunks written before the interruption, and an exception mid-run would
# leave a file without a parquet footer.
import pyarrow.parquet as pq
import pyarrow as pa

chunk_size = 150000
# "../data" is the directory the reader cell uses
# (dd.read_parquet("../data/interactions_dedup.parquet")).
source_path = "../data/goodreads_interactions_dedup.json"
output_path = "../data/interactions_dedup.parquet"
progress_path = "../data/chunk_progress.txt"


def _part_path(index):
    """Return the part-file path for the chunk with the given 0-based index."""
    return os.path.join(output_path, f"part-{index:06d}.parquet")


# Refuse to clobber a single-file output left by an earlier version of this cell.
if os.path.isfile(output_path):
    raise FileExistsError(
        f"{output_path} is a regular file; move or delete it so the part files "
        "can be written into a directory of that name."
    )
os.makedirs(output_path, exist_ok=True)

# Get starting chunk from progress file if it exists (an empty file counts as 0).
start_chunk = 0
if os.path.exists(progress_path):
    with open(progress_path) as f:
        start_chunk = int(f.read().strip() or 0)

# A progress value with no matching part on disk is stale; start over rather
# than skip chunks that were never written.
if start_chunk > 0 and not os.path.exists(_part_path(start_chunk - 1)):
    start_chunk = 0

# Every part must share one schema. On resume it comes from the first part on
# disk, otherwise from the first chunk converted in this run.
expected_schema = None
if start_chunk > 0 and os.path.exists(_part_path(0)):
    expected_schema = pq.read_schema(_part_path(0))

# The reader is a context manager; `with` closes the JSON file even on error.
with pd.read_json(source_path, lines=True, chunksize=chunk_size) as reader:
    for chunk_count, chunk in enumerate(reader):
        if chunk_count < start_chunk:
            continue

        table = pa.Table.from_pandas(chunk)

        # Fail fast on schema drift, as writing into one shared file would.
        if expected_schema is None:
            expected_schema = table.schema
        elif not table.schema.equals(expected_schema):
            raise ValueError(
                f"Chunk {chunk_count} has a different schema than earlier chunks"
            )

        # write_table produces a complete, closed file; an interrupted write is
        # simply overwritten on resume because progress is only advanced below.
        pq.write_table(table, _part_path(chunk_count))

        # Save progress
        with open(progress_path, 'w') as f:
            f.write(str(chunk_count + 1))

In [2]:
# Open the converted interactions parquet as a dask DataFrame.
dedup_parquet_path = "../data/interactions_dedup.parquet"
df = dd.read_parquet(dedup_parquet_path)

In [3]:
# Total row count. len() on the dask frame counts rows without first pulling the
# entire index into local memory, which df.index.compute() did.
len(df)

228648342

In [4]:
# Keep only the interactions whose is_read flag equals True, then count them.
is_read_mask = df['is_read'] == True
df_read = df[is_read_mask]
len(df_read)

112131203

In [5]:
# Preview the first five read interactions.
df_read.head(n=5)

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
13,8842281e1d1347389f2ab93d60773d4d,25735618,ea74f2b6645b7d16f3ede2aca10226f0,True,0,,Fri Aug 25 13:55:10 -0700 2017,Tue Oct 17 23:53:44 -0700 2017,,Tue Oct 17 09:23:10 -0700 2017
14,8842281e1d1347389f2ab93d60773d4d,24375664,5cd416f3efc3f944fce4ce2db2290d5e,True,5,Mind blowingly cool. Best science fiction I've...,Fri Aug 25 13:55:02 -0700 2017,Mon Oct 09 08:55:59 -0700 2017,Sat Oct 07 00:00:00 -0700 2017,Sat Aug 26 00:00:00 -0700 2017
21,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,True,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017
22,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,True,3,I haven't read a fun mystery book in a while a...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017
23,8842281e1d1347389f2ab93d60773d4d,22078596,fdd13cad0695656be99828cd75d6eb73,True,4,"Fun, fast paced, and disturbing tale of murder...",Mon Jul 24 02:33:09 -0700 2017,Sun Jul 30 10:23:54 -0700 2017,Sun Jul 30 15:42:05 -0700 2017,Tue Jul 25 00:00:00 -0700 2017


## Book Works

In [None]:
# Convert the book-works JSON-lines file to parquet and preview it.
# Paths use "../data", the directory the other loaders in this notebook read from
# (e.g. pd.read_parquet("../data/books_df.parquet")); the previous "data/" prefix
# pointed at a different directory.
books_works_df = pd.read_json("../data/goodreads_book_works.json", lines=True)
books_works_df.to_parquet("../data/books_works_df.parquet")
books_works_df.head()


## Authors


In [None]:
# Read the authors JSON-lines file (a plain .json path, not a gzipped one) and
# cache it as a pickle. Paths use "../data", the directory the other loaders in
# this notebook read from; the previous "data/" prefix pointed elsewhere.
authors_df = pd.read_json("../data/authors.json", lines=True)
authors_df.to_pickle("../data/authors_df.pkl")
# Display the first few rows
authors_df.head()

In [None]:
#authors_df.shape