In [2]:
import pandas as pd
import os
import sys

In [3]:
# Directory layout, relative to the notebook's working directory.
raw_data_dir = "../data/raw/"
processed_data_dir = "../data/processed/"

# Full path of the reviews parquet file that the load cell reads.
reviews_input_file = os.path.join(
    raw_data_dir,
    "reviews_dedup.parquet",
)

## 1. Load Raw Data

In [4]:
# Load the reviews parquet file into `reviews_df`.
# Later cells read `reviews_df` directly, so a failed load must stop the run
# here: the error is logged to stderr and then re-raised instead of being
# swallowed (which would only surface later as a NameError).
print(f"\nLoading reviews data from: {reviews_input_file}")
try:
    reviews_df = pd.read_parquet(reviews_input_file)
except FileNotFoundError:
    print(f"ERROR: Reviews file not found at {reviews_input_file}", file=sys.stderr)
    raise
except Exception as e:
    print(f"ERROR: Failed to load reviews file: {e}", file=sys.stderr)
    raise
else:
    # Only report success (and the schema / deep memory footprint) once the
    # read itself has completed without error.
    print("Reviews DataFrame loaded successfully.")
    print("\nReviews Info:")
    reviews_df.info(memory_usage='deep')


Loading reviews data from: ../data/raw/reviews_dedup.parquet
Reviews DataFrame loaded successfully.

Reviews Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15739967 entries, 0 to 15739966
Data columns (total 11 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   user_id       object
 1   book_id       int64 
 2   review_id     object
 3   rating        int64 
 4   review_text   object
 5   date_added    object
 6   date_updated  object
 7   read_at       object
 8   started_at    object
 9   n_votes       int64 
 10  n_comments    int64 
dtypes: int64(4), object(7)
memory usage: 18.2 GB


In [5]:
# Preview a single row to check the raw column layout.
reviews_df.head(n=1)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,24375664,5cd416f3efc3f944fce4ce2db2290d5e,5,Mind blowingly cool. Best science fiction I've...,Fri Aug 25 13:55:02 -0700 2017,Mon Oct 09 08:55:59 -0700 2017,Sat Oct 07 00:00:00 -0700 2017,Sat Aug 26 00:00:00 -0700 2017,16,0


## 2. Drop Unneeded Columns

In [6]:
# Only the user, the book and the rating are carried forward; the remaining
# review columns are left behind in `reviews_df`.
columns_to_keep = ['user_id', 'book_id', 'rating']

# Take an independent copy so later column additions do not touch `reviews_df`.
interactions_df = reviews_df.loc[:, columns_to_keep].copy()

interactions_df.head(n=1)

Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,24375664,5


## 3. Map IDs to Integer Indices

In [7]:
def _build_index_mapping(id_series):
    """Map the values of an ID column to contiguous integer indices.

    Parameters
    ----------
    id_series : pd.Series
        Column of original identifiers (e.g. ``user_id`` or ``book_id``).

    Returns
    -------
    tuple
        ``(codes, unique_ids, id_to_idx, idx_to_id)`` where ``codes`` holds one
        integer index per row, ``unique_ids`` holds the distinct IDs in order of
        first appearance, and the two dicts map in each direction.

    Raises
    ------
    ValueError
        If the column contains missing IDs. ``pd.factorize`` encodes those as
        ``-1``, which would otherwise be stored silently as an index.
    """
    codes, unique_ids = pd.factorize(id_series)
    if (codes < 0).any():
        raise ValueError(
            f"Column '{id_series.name}' contains missing IDs; cannot map them to indices."
        )
    id_to_idx = {original_id: idx for idx, original_id in enumerate(unique_ids)}
    idx_to_id = dict(enumerate(unique_ids))
    return codes, unique_ids, id_to_idx, idx_to_id


# --- Users: original user_id -> user_idx -----------------------------------
user_codes, unique_user_ids, user_id_to_idx, idx_to_user_id = _build_index_mapping(
    interactions_df['user_id']
)

# Add the new user index column to the DataFrame
interactions_df['user_idx'] = user_codes

# Get the total number of unique users
n_users = len(unique_user_ids)

print(f"Mapped {n_users} unique users to indices 0-{n_users-1}.")

# --- Items: original book_id -> item_idx -----------------------------------
item_codes, unique_item_ids, item_id_to_idx, idx_to_item_id = _build_index_mapping(
    interactions_df['book_id']
)

# Add the new item index column
interactions_df['item_idx'] = item_codes

# Get the total number of unique items
n_items = len(unique_item_ids)

print(f"Mapped {n_items} unique items (books) to indices 0-{n_items-1}.")
print(f"\nTotal unique users: {n_users}")
print(f"Total unique items: {n_items}")
# Count the frame that was actually indexed, not the raw reviews frame.
print(f"Total interactions: {len(interactions_df)}")
interactions_df.head()

Mapped 465323 unique users to indices 0-465322.
Mapped 2080190 unique items (books) to indices 0-2080189.

Total unique users: 465323
Total unique items: 2080190
Total interactions: 15739967


Unnamed: 0,user_id,book_id,rating,user_idx,item_idx
0,8842281e1d1347389f2ab93d60773d4d,24375664,5,0,0
1,8842281e1d1347389f2ab93d60773d4d,18245960,5,0,1
2,8842281e1d1347389f2ab93d60773d4d,6392944,3,0,2
3,8842281e1d1347389f2ab93d60773d4d,22078596,4,0,3
4,8842281e1d1347389f2ab93d60773d4d,6644782,4,0,4


## 4. Create Positive & Negative Only Rating DataFrames

In [8]:
# Rating cut-offs: ratings >= 4 form the positive subset, ratings <= 2 the
# negative subset. A rating of 3 falls between the thresholds and lands in neither.
# NOTE(review): the negative subset includes rating == 0 rows (visible in the
# counts this cell prints) -- confirm 0 is a genuine rating and not a
# "no rating given" placeholder before treating those rows as negative feedback.
positive_threshold = 4
negative_threshold = 2

positive_interactions_df = interactions_df.loc[
    interactions_df['rating'].ge(positive_threshold)
].copy()
negative_interactions_df = interactions_df.loc[
    interactions_df['rating'].le(negative_threshold)
].copy()

# Per-rating row counts, ordered by rating value.
positive_rating_counts = positive_interactions_df['rating'].value_counts().sort_index()
negative_rating_counts = negative_interactions_df['rating'].value_counts().sort_index()

print("Total Positive Interactions: ", len(positive_interactions_df))
print(positive_rating_counts)

print("\nTotal Negative Interactions: ", len(negative_interactions_df))
print(negative_rating_counts)

Total Positive Interactions:  10515445
rating
4    5250205
5    5265240
Name: count, dtype: int64

Total Negative Interactions:  2106997
rating
0     551885
1     448016
2    1107096
Name: count, dtype: int64


## 5. Save Interactions as .parquet

In [12]:
def save_df_to_parquet(df, df_name, output_path):
    """Write a DataFrame to a parquet file, reporting progress on stdout/stderr.

    Parameters
    ----------
    df : pd.DataFrame
        The frame to persist.
    df_name : str
        Human-readable label used only in the log messages.
    output_path : str
        Destination file path. Its parent directory is created if missing.

    Returns
    -------
    bool
        ``True`` if the file was written, or if ``df`` is empty and the save
        was skipped. ``False`` if ``df`` is not a DataFrame or the write failed.
    """
    print(f"Attempting to save '{df_name}' to: {output_path}")
    try:
        # Validate the object that was passed in. The label is not looked up in
        # globals(): that would reject valid frames whose label does not happen
        # to be a global variable name.
        if not isinstance(df, pd.DataFrame):
            print(f"ERROR: '{df_name}' is not a pandas DataFrame.", file=sys.stderr)
            return False
        if df.empty:
            print(f"Warning: DataFrame '{df_name}' is empty. Skipping save.")
            return True

        # Make sure the destination directory exists; dirname is '' when
        # output_path is a bare file name, in which case nothing is created.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        df.to_parquet(output_path, index=False)
        print(f"Successfully saved {os.path.basename(output_path)}")
        return True
    except Exception as e:
        print(f"ERROR: Failed to save '{os.path.basename(output_path)}': {e}", file=sys.stderr)
        return False

In [13]:
# File names for the three outputs.
interactions_output_filename = 'interactions.parquet'
positive_interactions_output_filename = 'positive_interactions.parquet'
negative_interactions_output_filename = 'negative_interactions.parquet'

# Destination paths inside the processed-data directory.
interactions_output_path = os.path.join(processed_data_dir, interactions_output_filename)
positive_interactions_output_path = os.path.join(processed_data_dir, positive_interactions_output_filename)
negative_interactions_output_path = os.path.join(processed_data_dir, negative_interactions_output_filename)

# (frame, label, destination) for each save, in the order they are written:
# full interactions (with mapped IDs), then positive, then negative.
save_jobs = [
    (interactions_df, 'interactions_df', interactions_output_path),
    (positive_interactions_df, 'positive_interactions_df', positive_interactions_output_path),
    (negative_interactions_df, 'negative_interactions_df', negative_interactions_output_path),
]

save_successful = True

for job_number, (frame, frame_name, target_path) in enumerate(save_jobs, start=1):
    if not save_df_to_parquet(frame, frame_name, target_path):
        save_successful = False
    # Short separator between saves, a longer one after the last save.
    print("-" * (10 if job_number < len(save_jobs) else 30))

if save_successful:
    print("All defined DataFrames saved successfully (or skipped if empty).")
else:
    print("Some DataFrames failed to save. Please check errors above.")

Attempting to save 'interactions_df' to: ../data/processed/interactions.parquet
Successfully saved interactions.parquet
----------
Attempting to save 'positive_interactions_df' to: ../data/processed/positive_interactions.parquet
Successfully saved positive_interactions.parquet
----------
Attempting to save 'negative_interactions_df' to: ../data/processed/negative_interactions.parquet
Successfully saved negative_interactions.parquet
------------------------------
All defined DataFrames saved successfully (or skipped if empty).


## 6. Load Saved Interactions to Verify

In [14]:
def load_and_display(file_path, df_name):
    """Read a parquet file back and print a short summary for verification.

    Prints the row count, the per-rating counts ordered by rating value, and
    the first rows of the loaded frame.

    Parameters
    ----------
    file_path : str
        Path of the parquet file to read.
    df_name : str
        Label used only in the success message.

    Returns
    -------
    pd.DataFrame or None
        The loaded frame, or ``None`` if reading or summarising raised; the
        error is reported on stderr in that case.
    """
    file_label = os.path.basename(file_path)
    print(f"\n--------------------- Loading '{file_label}' ---------------------")
    try:
        loaded = pd.read_parquet(file_path)
        print(f"Successfully loaded {len(loaded)} rows into '{df_name}'.")
        print(loaded['rating'].value_counts().sort_index())
        print(loaded.head())
    except Exception as err:
        # A missing file gets its own message; everything else is reported generically.
        if isinstance(err, FileNotFoundError):
            print(f"ERROR: File not found at {file_path}", file=sys.stderr)
        else:
            print(f"ERROR: Failed to load {file_label}: {err}", file=sys.stderr)
        return None
    return loaded

In [15]:
# Reload each saved file, in the order it was written, and print its summary.
(
    loaded_interactions_df,
    loaded_positive_interactions_df,
    loaded_negative_interactions_df,
) = [
    load_and_display(saved_path, loaded_name)
    for saved_path, loaded_name in (
        (interactions_output_path, 'loaded_interactions_df'),
        (positive_interactions_output_path, 'loaded_positive_interactions_df'),
        (negative_interactions_output_path, 'loaded_negative_interactions_df'),
    )
]


--------------------- Loading 'interactions.parquet' ---------------------
Successfully loaded 15739967 rows into 'loaded_interactions_df'.
rating
0     551885
1     448016
2    1107096
3    3117525
4    5250205
5    5265240
Name: count, dtype: int64
                            user_id   book_id  rating  user_idx  item_idx
0  8842281e1d1347389f2ab93d60773d4d  24375664       5         0         0
1  8842281e1d1347389f2ab93d60773d4d  18245960       5         0         1
2  8842281e1d1347389f2ab93d60773d4d   6392944       3         0         2
3  8842281e1d1347389f2ab93d60773d4d  22078596       4         0         3
4  8842281e1d1347389f2ab93d60773d4d   6644782       4         0         4

--------------------- Loading 'positive_interactions.parquet' ---------------------
Successfully loaded 10515445 rows into 'loaded_positive_interactions_df'.
rating
4    5250205
5    5265240
Name: count, dtype: int64
                            user_id   book_id  rating  user_idx  item_idx
0  8842281e1