In [1]:
import os
import sys
sys.path.append("..")
import pandas as pd

from src.utils.preprocessing import DataPreprocessor, handle_missing_values, create_implicit_ratings

In [2]:
# ====================== Load Data ======================
orders = pd.read_csv("../data/raw/orders.csv")
prior = pd.read_csv("../data/raw/order_products__prior.csv")
products = pd.read_csv("../data/raw/products.csv")

In [3]:
# ====================== Handle Missing Values ======================
print("Checking for missing values...")
print(orders.isnull().sum())

orders_clean = handle_missing_values(orders)
print("\n✓ Missing values handled")

Checking for missing values...
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

✓ Missing values handled


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['days_since_prior_order'].fillna(30, inplace=True)


In [4]:
# ====================== Preprocess Data ======================
preprocessor = DataPreprocessor(
    min_user_orders=5,
    min_product_orders=20,
    max_basket_size=100
)

orders_filtered, prior_filtered = preprocessor.fit_transform(orders_clean, prior)

print("\nPreprocessing Statistics:")
for key, value in preprocessor.stats.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2%}")
    else:
        print(f"{key}: {value:,}")

INFO:src.utils.preprocessing:Fitting preprocessor...
INFO:src.utils.preprocessing:Active users: 162,633 / 206,209 (78.9%)
INFO:src.utils.preprocessing:Popular products: 35,922 / 49,677 (72.3%)
INFO:src.utils.preprocessing:Transforming data...
INFO:src.utils.preprocessing:Filtered orders: 2,916,243
INFO:src.utils.preprocessing:Filtered transactions: 30,718,986



Preprocessing Statistics:
total_users: 206,209
active_users: 162,633
user_retention_rate: 78.87%
total_products: 49,677
popular_products: 35,922
product_retention_rate: 72.31%


In [5]:
# ====================== Preprocess Data ======================
preprocessor = DataPreprocessor(
    min_user_orders=5,
    min_product_orders=20,
    max_basket_size=100
)

orders_filtered, prior_filtered = preprocessor.fit_transform(orders_clean, prior)

print("\nPreprocessing Statistics:")
for key, value in preprocessor.stats.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2%}")
    else:
        print(f"{key}: {value:,}")

INFO:src.utils.preprocessing:Fitting preprocessor...
INFO:src.utils.preprocessing:Active users: 162,633 / 206,209 (78.9%)
INFO:src.utils.preprocessing:Popular products: 35,922 / 49,677 (72.3%)
INFO:src.utils.preprocessing:Transforming data...
INFO:src.utils.preprocessing:Filtered orders: 2,916,243
INFO:src.utils.preprocessing:Filtered transactions: 30,718,986



Preprocessing Statistics:
total_users: 206,209
active_users: 162,633
user_retention_rate: 78.87%
total_products: 49,677
popular_products: 35,922
product_retention_rate: 72.31%


In [6]:
# ====================== Create Implicit Ratings ======================
implicit_ratings = create_implicit_ratings(orders_filtered, prior_filtered)

print("\nImplicit Ratings Sample:")
print(implicit_ratings.head(10))

INFO:src.utils.preprocessing:Creating implicit ratings from reorder behavior...
INFO:src.utils.preprocessing:Created 12,083,736 implicit ratings
INFO:src.utils.preprocessing:Rating distribution:
count    1.208374e+07
mean     2.371667e-01
std      2.707734e-01
min      3.030303e-03
25%      1.500000e-02
50%      4.285714e-02
75%      4.765568e-01
max      1.000000e+00
Name: final_rating, dtype: float64



Implicit Ratings Sample:
   user_id  product_id  final_rating
0        1         196      0.930000
1        1       10258      0.892222
2        1       10326      0.030000
3        1       12427      0.930000
4        1       13032      0.556667
5        1       13176      0.410000
6        1       14084      0.030000
7        1       17122      0.030000
8        1       25133      0.852500
9        1       26088      0.410000


In [7]:
# ====================== Save Processed Data ======================
import os
os.makedirs("../data/processed", exist_ok=True)

orders_filtered.to_csv("../data/processed/orders_filtered.csv", index=False)
prior_filtered.to_csv("../data/processed/prior_filtered.csv", index=False)
implicit_ratings.to_csv("../data/processed/implicit_ratings.csv", index=False)

print("\n✓ Processed data saved to data/processed/")


✓ Processed data saved to data/processed/


In [9]:
# Track with DVC
import subprocess
subprocess.run(["dvc", "add", "data/processed/"])
subprocess.run(["git", "add", "data/processed.dvc", ".gitignore"])
subprocess.run(["git", "commit", "-m", "Track processed data with DVC"])

print("✓ Data tracked with DVC")

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../data/processed/
	./
	../src/

nothing added to commit but untracked files present (use "git add" to track)
✓ Data tracked with DVC


ERROR: stage working dir '/Users/alex_z/Desktop/product-recommendation-engine/notebooks/data' does not exist
fatal: pathspec 'data/processed.dvc' did not match any files
