In [12]:
## Import libraries & files

import pandas as pd
from ipyaggrid import Grid
from mlxtend.frequent_patterns import apriori, association_rules

excel_path = r"C:/Users/LuanHy/Desktop/FocusSpace/1. Interview/Test/UrBox_Test/Test_Answers/Test_Data.xlsx"


In [13]:
## Function

def data_framework(x):
    df = Grid(
        grid_data=x,
        grid_options={           # <-- bắt buộc có cái này
            "enableSorting": True,
            "enableFilter": True,
            "editable": False,
        },
        width='100%',
        height=600,
    )
    return df

def summarize_user(g):
    g = g.sort_values(["Redeemed_time_rank", "Min_redeemed_time"])

    rows_first = g.loc[g["Redeemed_time_rank"] == 1]
    if not rows_first.empty:
        first_brand_id = rows_first["brand_id"].iloc[0]
        first_brand_date = rows_first["Min_redeemed_time"].iloc[0]
    else:
        first_brand_id = None
        first_brand_date = None

    rows_second = g.loc[g["Redeemed_time_rank"] == 2]
    if not rows_second.empty:
        second_brand_id = rows_second["brand_id"].iloc[0]
        second_brand_date = rows_second["Min_redeemed_time"].iloc[0]
    else:
        second_brand_id = None
        second_brand_date = None

    last_row = g.iloc[-1]
    last_brand_id = last_row["brand_id"]
    last_redeemed_date = last_row["Max_redeemed_time"]
    number_of_brands = len(g)

    return pd.Series(
        {
            "first_brand_id": first_brand_id,
            "second_brand_id": second_brand_id,
            "first_brand_redeemed_date": first_brand_date,
            "second_brand_redeemed_date": second_brand_date,
            "last_brand_id": last_brand_id,
            "last_redeemed_date": last_redeemed_date,
            "number_of_brands": number_of_brands,
        }
    )



In [14]:
## Transaction_table

Transactions_Data = pd.read_excel(excel_path,sheet_name="Data for Part 1")

In [15]:
## User_table

User_Data = (
    Transactions_Data
        .groupby(["user_id","brand_id"],as_index=False)
        .agg(
            Min_redeemed_time = ("voucher_redeemed_at","min"),
            Max_redeemed_time = ("voucher_redeemed_at","max"),
        )
        .sort_values(["user_id", "Min_redeemed_time"])
        .assign(
            Redeemed_time_rank = lambda d:d
                .groupby("user_id")["Min_redeemed_time"]
                .rank(method="dense",ascending=True)
        )
)

User_Data = (
    User_Data
      .groupby("user_id")
      .apply(summarize_user, include_groups=False)
      .reset_index()
)

User_Data = (
    User_Data
        .assign(
            user_age = (
                pd.to_datetime(User_Data["last_redeemed_date"])
                - pd.to_datetime(User_Data["first_brand_redeemed_date"])
            ).dt.days,
        age_group = lambda d: pd.cut(
                (
                    pd.to_datetime(d["last_redeemed_date"])
                    - pd.to_datetime(d["first_brand_redeemed_date"])
                ).dt.days,
                bins=[-1, 7, 30, 90,9999],
                labels=["New User", "Early User", "Growth User","Loyal User"]
            )
        )
    ) 
User_Data = User_Data.sort_values("user_age",ascending=True)
data_framework(User_Data)


Grid(columns_fit='size_to_fit', compress_data=True, export_mode='disabled', height='600px', menu={'buttons': […

In [16]:
# User_and_Transaction_Numbers_Pivot

User_and_Transaction_Numbers = (
    Transactions_Data
        .merge(
            User_Data[["user_id", "age_group"]],
            on="user_id",
            how="left"
        )
        .groupby("age_group")
        .agg(
            user_count = ("user_id", "nunique"),
            transaction_count = ("transaction_id", "count"),
        )
        .assign(
            avg_transactions_per_user = lambda d: (d["transaction_count"] / d["user_count"]).round(1)
        )
        .reset_index()
)

User_and_Transaction_Numbers


  .groupby("age_group")


Unnamed: 0,age_group,user_count,transaction_count,avg_transactions_per_user
0,New User,481,898,1.9
1,Early User,80,416,5.2
2,Growth User,185,1303,7.0
3,Loyal User,254,3035,11.9


In [17]:
## Join 2 tables

Merge_Table = (
    Transactions_Data
        .merge(
            User_Data[["user_id", "age_group"]],
            on="user_id",
            how="left"
        )
    )

data_framework(Merge_Table)


Grid(columns_fit='size_to_fit', compress_data=True, export_mode='disabled', height='600px', menu={'buttons': […

In [18]:
## Brand count by age_group

Data_table = (
    Merge_Table
        .groupby(["age_group"],observed=False)
        .agg(
            brand_count=("brand_id", "nunique"),
        )
        .reset_index()
)
Data_table


Unnamed: 0,age_group,brand_count
0,New User,48
1,Early User,43
2,Growth User,47
3,Loyal User,59


In [19]:
## Top5_By_AgeBrand_Data

Data_table = (
    Merge_Table
        .groupby(["age_group", "brand_id"],observed=False)
        .agg(
            transaction_count=("transaction_id", "count"),
        )
        .reset_index()
)

Top5_By_AgeBrand = (
    Data_table
        .sort_values(["age_group", "transaction_count"], ascending=[True, False])
        .groupby("age_group",observed=False)
        .head(5)   # lấy 5 dòng đầu của mỗi age_group sau khi sort
        .reset_index(drop=True)
)

Top5_By_AgeBrand



Unnamed: 0,age_group,brand_id,transaction_count
0,New User,1511,276
1,New User,395,99
2,New User,82,93
3,New User,882,52
4,New User,910,38
5,Early User,82,71
6,Early User,1511,48
7,Early User,882,38
8,Early User,395,36
9,Early User,552,34


In [None]:
## Apply Apriori methods for Basket Analysis

Transactions_Data = Transactions_Data.dropna(subset=['user_id', 'brand_id'])

Transactions_Data_unique = Transactions_Data.drop_duplicates(subset=['user_id', 'brand_id']).copy()

## User with list brand check
print("------ USER WITH LIST BRAND CHECK ------")
print(Transactions_Data_unique.groupby('user_id')['brand_id'].apply(list).head(10))

## Create Maxtrix for Apriori
Transactions_Data_unique['value'] = 1
basket = Transactions_Data_unique.pivot_table(
    index='user_id',
    columns='brand_id',
    values='value',
    fill_value=0
)
print("\n\n------ MATRIX CHECK ------")
print(basket.head())

## Run Apriori and setup min_support = 0.02 to find itemset >= 2% users
freq_items = apriori(
    basket,
    min_support=0.02,
    use_colnames=True
)
freq_items = freq_items.sort_values(by='support', ascending=False)
print("\n\n=== FREQUENT ITEMSETS CHECK===")
print(freq_items.head(20))

## 7. Create ASSOCIATION RULES (SUPPORT, CONFIDENCE, LIFT)
rules = association_rules(
    freq_items,
    metric="lift",
    min_threshold=1.0
)

rules = rules.sort_values(by='lift', ascending=False)

print("\n\n------ ASSOCIATION RULES CHECK ------")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(30))

## STRONG RULES
strong_rules = rules[
    (rules['support'] >= 0.02) &
    (rules['confidence'] >= 0.3) &
    (rules['lift'] >= 1.2)
].copy()

strong_rules = strong_rules.sort_values(by='lift', ascending=False)

print("\n\n------ STRONG RULES CHECK ------")
print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


------ USER WITH LIST BRAND CHECK ------
user_id
108465           [397]
135042           [890]
218587            [23]
410610            [82]
427212     [992, 1511]
539864          [1511]
735746           [396]
890259           [593]
1385395      [34, 315]
1537715          [395]
Name: brand_id, dtype: object


------ MATRIX CHECK ------
brand_id  0     1     7     23    24    27    30    34    35    36    ...  \
user_id                                                               ...   
108465     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
135042     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
218587     0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
410610     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
427212     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

brand_id  1054  1068  1070  1110  1170  1256  1292  1511  1529  1841  
user_id                                            

