In [3]:
import pandas as pd
import numpy as np

In [4]:
transaction = pd.read_csv("C:\\Users\\kyrie\\Downloads\\sampled_transactions_local.csv")

In [5]:
baby_products = pd.read_csv("C:\\Users\\kyrie\\Desktop\\ML_AI_scale\\Group_project\\group_project2\\baby_products.csv")

In [6]:
print(transaction.head())

             trans_id    trans_dt  store_id      cust_id      prod_id  \
0  180130010200406503  2018-01-30      1020   1123940312  20070132001   
1  181115010200505064  2018-11-15      1020  33221937660  20721046001   
2  180117010202108471  2018-01-17      1020   1135643825     20097083   
3  171011010200402432  2017-10-11      1020   1125634180     20776239   
4  180810010200500575  2018-08-10      1020   1113651393     20313182   

   sales_amt  sales_qty  sales_wgt  
0       2.49          1        0.0  
1       2.49          1        0.0  
2       2.49          1        0.0  
3       2.49          1        0.0  
4       2.49          1        0.0  


In [7]:
print(baby_products.head())

    prod_id                       prod_desc prod_section prod_category  \
0  21302291        KIDS SWING 2 SET COVERME         Baby          Baby   
1  21168731  NOSH MUNCHABLES BROC PEAR KALE         Baby          Baby   
2  20979671                     SG 6X WIPES         Baby          Baby   
3  21106746               SG DIAPERS SBP S6         Baby          Baby   
4  20979628               SG DIAPERS SBP S5         Baby          Baby   

     prod_subcategory               prod_type prod_mfc_brand_cd  \
0    Baby Accessories  Baby Swings & Bouncers              0KDS   
1      Infant Feeding                    Baby              0NOS   
2          Baby Wipes               5x Refill              7GEN   
3  Disposable Diapers                Big Pack              7GEN   
4  Disposable Diapers                Big Pack              7GEN   

   prod_unit_qty_count prod_count_uom  prod_uom_value  
0                    1             EA             1.0  
1                    1              G   

In [8]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder


# Merge product metadata into transaction data
df = transaction.merge(baby_products, on="prod_id", how="inner")
print(df.head())


             trans_id    trans_dt  store_id     cust_id      prod_id  \
0  180327010210100894  2018-03-27      1021  1133144909     21027588   
1  180317010211100147  2018-03-17      1021  1142318362  20888821003   
2  190315010210305505  2019-03-15      1021  1131650845  20888821002   
3  190629010280506213  2019-06-29      1028  1145479936     21167647   
4  180728010280609799  2018-07-28      1028  1071868432     21054223   

   sales_amt  sales_qty  sales_wgt                      prod_desc  \
0       2.49          1        0.0  LVCH BABY FOOD MANGO CHKN ORG   
1       2.49          1        0.0             ACSE APL STRW PUFF   
2       2.49          1        0.0        ACSE PURP CAR BLUB PUFF   
3       2.49          1        0.0   LOVE CHILD HEARTY BLGNSE 6M+   
4       2.49          1        0.0   LCO SNACKS LV DUCKS CHS HRBS   

  prod_section prod_category prod_subcategory      prod_type  \
0         Baby          Baby   Infant Feeding  Baby Strained   
1         Baby          

In [9]:
print(df.head())

             trans_id    trans_dt  store_id     cust_id      prod_id  \
0  180327010210100894  2018-03-27      1021  1133144909     21027588   
1  180317010211100147  2018-03-17      1021  1142318362  20888821003   
2  190315010210305505  2019-03-15      1021  1131650845  20888821002   
3  190629010280506213  2019-06-29      1028  1145479936     21167647   
4  180728010280609799  2018-07-28      1028  1071868432     21054223   

   sales_amt  sales_qty  sales_wgt                      prod_desc  \
0       2.49          1        0.0  LVCH BABY FOOD MANGO CHKN ORG   
1       2.49          1        0.0             ACSE APL STRW PUFF   
2       2.49          1        0.0        ACSE PURP CAR BLUB PUFF   
3       2.49          1        0.0   LOVE CHILD HEARTY BLGNSE 6M+   
4       2.49          1        0.0   LCO SNACKS LV DUCKS CHS HRBS   

  prod_section prod_category prod_subcategory      prod_type  \
0         Baby          Baby   Infant Feeding  Baby Strained   
1         Baby          

In [11]:
df = transaction.merge(baby_products, on="prod_id", how="inner")


df_baby = df[df["prod_section"].str.upper() == "BABY"].copy()

# Step 3: Brand standardization
df_baby["brand_code"] = df_baby["prod_mfc_brand_cd"].str.upper()

# Step 4: Define important groups
HUGGIES_CODE = "HUGG"  # Confirm Huggies code in your data
ACSE_CODE = "ACSE"

# 4a: Customers who bought ACSE-branded baby products
acse_buyers = set(df_baby[df_baby["brand_code"] == ACSE_CODE]["cust_id"])

# 4b: Customers who bought competitor brands (not Huggies or ACSE)
competitor_buyers = set(
    df_baby[
        (df_baby["brand_code"] != HUGGIES_CODE) &
        (df_baby["brand_code"] != ACSE_CODE)
    ]["cust_id"]
)

# 4c: Final target = competitors only, excluding ACSE buyers
target_customers = list(competitor_buyers - acse_buyers)


In [12]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df_baby["user_idx"] = user_encoder.fit_transform(df_baby["cust_id"])
df_baby["item_idx"] = item_encoder.fit_transform(df_baby["prod_id"])

# Step 6: Build binary interaction matrix
interaction_matrix = csr_matrix(
    (np.ones(len(df_baby)), (df_baby["user_idx"], df_baby["item_idx"])),
    shape=(df_baby["user_idx"].nunique(), df_baby["item_idx"].nunique())
)

# Step 7: Train SVD
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(interaction_matrix)
item_factors = svd.components_.T

# Step 8: Prepare mapping
cust_id_to_idx = dict(zip(df_baby["cust_id"], df_baby["user_idx"]))
prod_id_to_desc = dict(zip(baby_products["prod_id"], baby_products["prod_desc"]))

# Filter Huggies products only
huggies_items = df_baby[df_baby["brand_code"] == HUGGIES_CODE][["prod_id", "item_idx"]].drop_duplicates()



In [13]:
recommendations = []

for cust_id in target_customers:
    if cust_id not in cust_id_to_idx:
        continue

    user_idx = cust_id_to_idx[cust_id]
    user_vec = user_factors[user_idx]

    scored_items = []
    for _, row in huggies_items.iterrows():
        item_idx = row["item_idx"]
        prod_id = row["prod_id"]
        score = np.dot(user_vec, item_factors[item_idx])
        scored_items.append((prod_id, score))

    # Top 3 Huggies products
    top_items = sorted(scored_items, key=lambda x: -x[1])[:3]
    for prod_id, score in top_items:
        recommendations.append({
            "cust_id": cust_id,
            "recommended_prod_id": prod_id,
            "prod_desc": prod_id_to_desc.get(prod_id, ""),
            "score": score
        })

# Step 10: Output sorted by score
recommendation_df = pd.DataFrame(recommendations)
recommendation_df = recommendation_df.sort_values(by="score", ascending=False)


In [14]:
# Look at a few customer IDs to test with
print(recommendation_df.head(10))


          cust_id  recommended_prod_id                       prod_desc  \
6741   1025384488             20557596  HUGG NATURAL CARE RFT 1X WIPES   
89040  1014215043             20557596  HUGG NATURAL CARE RFT 1X WIPES   
84147  1130993976             20799372  HUGG NATURAL CARE 3X FIT WIPES   
89041  1014215043             20799372  HUGG NATURAL CARE 3X FIT WIPES   
82692  1124438621             20799372  HUGG NATURAL CARE 3X FIT WIPES   
84148  1130993976          20428787004           GOODNITES MEGA SM BOY   
84149  1130993976             20758373  HUGG NATURAL CARE TUB 1X WIPES   
56166  1131745305             20799372  HUGG NATURAL CARE 3X FIT WIPES   
48231  1137109340             21172530         HUGG LIL MOVERS SBP SZ5   
48232  1137109340             21173592      HUGG LIL SNUGGLERS SBP SZ5   

          score  
6741   1.665875  
89040  1.486708  
84147  1.449204  
89041  1.398494  
82692  1.307609  
84148  1.295016  
84149  1.017747  
56166  0.992337  
48231  0.866521  
48232