# Reducing the data
There are 238 million interactions and 2.3 million books in this dataset. For our purpose — training and tuning Neural Collaborative Filtering and Transformer models — a dataset of this size is impractical to handle in terms of memory and compute.

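Therefore, we reduce the dataset to only include interactions with explicit ratings of 4 or 5, users with 100+ such interactions, and books with 500+ such interactions. The user and book counts are computed on the rating-filtered interactions, before the user and book filters are applied.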

## 1. Configuration and Setup

In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pyarrow.parquet as pq

# Lift pandas' row and column display limits so frames are never truncated
# when they are printed or displayed in this notebook.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### Load the original datasets

In [None]:
# ID mapping tables, read eagerly into pandas DataFrames.
user_id_map = pd.read_csv("../data/user_id_map.csv")
book_id_map = pd.read_csv("../data/book_id_map.csv")

# Parquet tables opened with Dask: these stay lazy until `.compute()` is called.
interactions_dedup_df = dd.read_parquet("../data/interactions_dedup.parquet")
books_df = dd.read_parquet("../data/new_books.parquet")
books_works_df = dd.read_parquet("../data/books_works_df.parquet")

## 2. Filtering based on interactions

In [4]:
# Reduction thresholds. Both counts are taken over the rating-filtered
# interactions, before the user/book filters themselves are applied.
MIN_RATING = 4
MIN_USER_INTERACTIONS = 100
MIN_BOOK_INTERACTIONS = 500

# Output locations for the surviving IDs (re-read by later sections).
REDUCED_USER_IDS_PATH = "../data/reduced_user_ids.csv"
REDUCED_BOOK_IDS_PATH = "../data/reduced_book_ids.csv"

# Keep only interactions rated MIN_RATING or higher (still a lazy Dask frame here).
interactions_filtered_rating_df = interactions_dedup_df[interactions_dedup_df['rating'] >= MIN_RATING]

# Number of qualifying interactions per user; keep users at or above the threshold.
user_counts = interactions_filtered_rating_df.groupby('user_id').size().compute()
valid_users = user_counts[user_counts >= MIN_USER_INTERACTIONS].index

# Number of qualifying interactions per book; keep books at or above the threshold.
book_counts = interactions_filtered_rating_df.groupby('book_id').size().compute()
valid_books = book_counts[book_counts >= MIN_BOOK_INTERACTIONS].index

# Persist the surviving IDs; the message reports the path actually written.
pd.Series(valid_users, name='user_id').to_csv(REDUCED_USER_IDS_PATH, index=False)
print(f"Valid user IDs saved to {REDUCED_USER_IDS_PATH}")

pd.Series(valid_books, name='book_id').to_csv(REDUCED_BOOK_IDS_PATH, index=False)
print(f"Valid book IDs saved to {REDUCED_BOOK_IDS_PATH}")

# Materialize the rating-filtered interactions as a pandas DataFrame; the next
# section applies the user/book filters to this in-memory frame.
interactions_filtered_rating_df = interactions_filtered_rating_df.compute()

Valid user IDs saved to data/reduced_user_ids.csv
Valid book IDs saved to data/reduced_book_ids.csv


## 3. Reducing Interactions Dataframe

In [5]:
# A row survives only if both its user and its book passed the count thresholds.
user_is_valid = interactions_filtered_rating_df['user_id'].isin(valid_users)
book_is_valid = interactions_filtered_rating_df['book_id'].isin(valid_books)
interactions_filtered_df = interactions_filtered_rating_df[user_is_valid & book_is_valid]

In [6]:
# Write the reduced interactions to Parquet without the index.
# NOTE(review): inputs are read from `../data/` but this writes under `data/`
# relative to the working directory - confirm this is the intended location.
reduced_interactions_path = "data/reduced_interactions.parquet"
interactions_filtered_df.to_parquet(reduced_interactions_path, index=False)

## 4. Reducing Books Dataframe

In [7]:
# Reload the kept book IDs from the CSV written in section 2.
reduced_book_ids = pd.read_csv("../data/reduced_book_ids.csv")
book_ids_to_keep = reduced_book_ids['book_id'].unique()

# Select the matching books lazily, then materialize the result with `.compute()`.
is_kept_book = books_df['book_id'].isin(book_ids_to_keep)
filtered_books_df = books_df[is_kept_book].compute()

# Number of books that survive the reduction.
len(filtered_books_df)

17663

### Filtering the `similar_books` attribute

In [8]:
# Set of kept book IDs, cast to integers, for fast membership checks when
# pruning the `similar_books` lists below.
book_ids_to_keep_set = {*book_ids_to_keep.astype(int)}

# Define a function to filter the similar_books array
def filter_array(arr, keep_ids=None):
    """Return the entries of ``arr`` whose integer value is a kept book ID.

    Parameters
    ----------
    arr : numpy.ndarray, list, tuple or None
        Book IDs of one row's ``similar_books`` value. Entries are compared
        by their ``int(...)`` value but are returned unchanged.
    keep_ids : set of int, optional
        IDs to retain. Defaults to the notebook-level ``book_ids_to_keep_set``.

    Returns
    -------
    numpy.ndarray
        1-D object array with the retained entries in their original order.
        Empty when ``arr`` is missing, empty, or not an array/list/tuple.
    """
    if keep_ids is None:
        # Fall back to the set built earlier in the notebook.
        keep_ids = book_ids_to_keep_set

    # Missing values and unsupported types yield an empty result.
    if not isinstance(arr, (np.ndarray, list, tuple)) or len(arr) == 0:
        return np.array([], dtype=object)

    kept = []
    for book_id in arr:
        try:
            numeric_id = int(book_id)
        except (TypeError, ValueError):
            # Skip entries that are not valid integer IDs (e.g. '' or None)
            # instead of aborting the whole column-wide apply.
            continue
        if numeric_id in keep_ids:
            kept.append(book_id)
    return np.array(kept, dtype=object)

# Prune each row's `similar_books` so it only references books that were kept.
pruned_similar_books = filtered_books_df['similar_books'].apply(filter_array)
filtered_books_df['similar_books'] = pruned_similar_books

# Preview the first rows to check the result.
preview_columns = ['book_id', 'similar_books']
print(filtered_books_df[preview_columns].head())

      book_id                                      similar_books
3     6066819                        [2285777, 5941079, 3134684]
15      89375                     [53817, 254389, 8964, 8139321]
479  11731782  [10950666, 11948797, 12711899, 11187203, 12901...
583     54270                                          [3831344]
807     38568                     [225669, 50789, 46481, 780878]


In [9]:
# Write the reduced books table to Parquet.
# NOTE(review): inputs are read from `../data/` but this writes under `data/`
# relative to the working directory - confirm this is the intended location.
reduced_books_path = "data/reduced_books.parquet"
filtered_books_df.to_parquet(reduced_books_path)

## 5. Reducing Reviews Dataframe

In [10]:
# Reload the kept user IDs from the CSV written in section 2.
reduced_user_ids = pd.read_csv("../data/reduced_user_ids.csv")
user_ids_to_keep = reduced_user_ids['user_id'].unique()

# Open the deduplicated reviews with Dask (lazy until `.compute()`).
reviews_df = dd.read_parquet("../data/reviews_dedup.parquet")

In [11]:
# Keep reviews whose user AND book both survived the reduction.
# Only user_id and book_id are checked here; no rating filter is applied to reviews.
review_user_kept = reviews_df['user_id'].isin(user_ids_to_keep)
review_book_kept = reviews_df['book_id'].isin(book_ids_to_keep)
filtered_reviews_df = reviews_df[review_user_kept & review_book_kept]

# Materialize the filtered reviews and write them to Parquet.
# NOTE(review): inputs are read from `../data/` but this writes under `data/`
# relative to the working directory - confirm this is the intended location.
pd_reviews_df = filtered_reviews_df.compute()
pd_reviews_df.to_parquet("data/reduced_reviews.parquet")

# Report the row count and preview the first rows.
print(len(pd_reviews_df))
print(pd_reviews_df.head())

4602214
                             user_id   book_id  \
10  8842281e1d1347389f2ab93d60773d4d     16981   
12  8842281e1d1347389f2ab93d60773d4d  28684704   
13  8842281e1d1347389f2ab93d60773d4d  27161156   
15  8842281e1d1347389f2ab93d60773d4d  32283133   
17  8842281e1d1347389f2ab93d60773d4d  28119237   

                           review_id  rating  \
10  a5d2c3628987712d0e05c4f90798eb67       3   
12  2ede853b14dc4583f96cf5d120af636f       3   
13  ced5675e55cd9d38a524743f5c40996e       0   
15  8e4d61801907e591018bdc3442a9cf2b       0   
17  7a8dc8ab7f3c0084be8150d7e5bd40cb       4   

                                          review_text  \
10  Recommended by Don Katz. Avail for free in Dec...   
12  A fun, fast paced science fiction thriller. I ...   
13  Recommended reading to understand what is goin...   
15           http://www.telegraph.co.uk/culture/10...   
17  A fascinating book about community and belongi...   

                        date_added                    date_