In [261]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# 정규화
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the raw consumer-behavior CSV and keep only age, gender and purchase
# category; repeated rows are dropped and 'Purchase_Category' is renamed to
# 'Category'.
df = (
    pd.read_csv('Ecommerce_Consumer_Behavior_Analysis_Data.csv')
    .loc[:, ['Age', 'Gender', 'Purchase_Category']]
    .drop_duplicates()
    .rename(columns={'Purchase_Category': 'Category'})
)

In [262]:
# Check the categories: show each distinct value once (first occurrence,
# original row index preserved).
df['Category'].loc[lambda col: ~col.duplicated()]

0          Gardening & Outdoors
1              Food & Beverages
2               Office Supplies
3               Home Appliances
4                     Furniture
7                         Books
9             Sports & Outdoors
10           Mobile Accessories
12                 Luxury Goods
13                  Animal Feed
14                  Health Care
15                       Hotels
18                    Packages)
19                  Electronics
20              Software & Apps
21                Baby Products
22                 Toys & Games
26                Arts & Crafts
29           Health Supplements
36                    Groceries
48                     Clothing
50       Beauty & Personal Care
52    Travel & Leisure (Flights
77        Jewelry & Accessories
Name: Category, dtype: object

In [263]:
# Sentence-embedding model used to group the Category values
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per row of df, in row order
categories = list(df['Category'])
embeddings = bert_model.encode(categories)

# Cluster the embeddings with KMeans; labels_ holds the cluster id assigned
# to each row during fitting
k = 5
kmeans = KMeans(n_clusters=k, random_state=42).fit(embeddings)
labels = kmeans.labels_

# Attach the cluster id to every row
df = df.assign(Cluster=labels)

In [264]:
# Human-readable names for the KMeans cluster ids.
# NOTE(review): the ids come from one particular KMeans fit; re-check these
# names whenever the clustering cell or its inputs change.
cluster_name_mapping = {
                            0: "Food & Daily Life",
                            1: "Home / Lifestyle / Luxury",
                            2: "Outdoor & Leisure",
                            3: "Health & Beauty",
                            4: "Electronics & Appliances"
                        }

# Replace each numeric cluster id with its group name
df['Category_Grouped'] = df['Cluster'].map(cluster_name_mapping)

# Keep only the model inputs. The explicit .copy() makes this an independent
# frame, so later column assignments on df do not raise pandas'
# SettingWithCopyWarning.
df = df[['Age', 'Gender', 'Category_Grouped']].copy()

In [266]:
# Numeric normalization: standardize the Age column.
# NOTE: df['Age'] is overwritten in place, so running this cell a second time
# refits the scaler on already-standardized values.
scaler = StandardScaler()
df[['Age']] = scaler.fit(df[['Age']]).transform(df[['Age']])

In [267]:
# One-hot encoding of the two categorical columns; the remaining column (Age)
# is passed through unchanged after the encoded columns.
# - Columns are selected by name, so the encoder does not depend on the
#   positional order of the frame's columns.
# - handle_unknown='ignore' encodes a category that was not seen during fit
#   as all zeros instead of raising, which matters because this transformer
#   is later applied to new users.
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'Category_Grouped'])],
    remainder='passthrough',
)
X = ct.fit_transform(df)
# ColumnTransformer only returns a sparse matrix when the stacked output is
# sparse enough (see its sparse_threshold parameter); densify only then.
if hasattr(X, 'toarray'):
    X = X.toarray()


In [247]:
from sklearn.mixture import GaussianMixture

# Fit a 3-component Gaussian mixture on the encoded feature matrix X.
# NOTE(review): in the saved notebook this cell's execution count (247) is
# lower than the preprocessing cells above (266, 267); re-run top to bottom so
# that gmm is fitted on the current X.
gmm_params = {"n_components": 3, "random_state": 42}
gmm = GaussianMixture(**gmm_params)
gmm.fit(X)

GaussianMixture(n_components=3, random_state=42)

In [None]:
# Display the prepared frame (columns: Age, Gender, Category_Grouped)
df

Unnamed: 0,Age,Gender,Category_Grouped
0,-1.334426,Female,Health & Beauty
1,1.580206,Male,Outdoor & Leisure
2,-1.118527,Female,Food & Daily Life
3,-0.578781,Female,Food & Daily Life
4,-0.146983,Female,Outdoor & Leisure
...,...,...,...
991,0.608662,Bigender,Outdoor & Leisure
993,-1.010578,Female,Outdoor & Leisure
996,1.688155,Female,Health & Beauty
998,-1.442375,Female,Outdoor & Leisure


In [273]:
import joblib

# Persist the fitted scaler, the column transformer and the mixture model so
# they can be reloaded for inference. The final dump is kept as the cell's
# last expression so its return value is still displayed.
for fitted_obj, pkl_path in ((scaler, "age_scaler.pkl"), (ct, "ct_encoder.pkl")):
    joblib.dump(fitted_obj, pkl_path)
joblib.dump(gmm, "gmm.pkl")


['gmm.pkl']

In [None]:
# Reload the persisted scaler, encoder and mixture model from disk.
# joblib.load unpickles its input, so only load files you created yourself.
age_scaler, ct_encoder, gmm_loaded = (
    joblib.load(pkl_path)
    for pkl_path in ("age_scaler.pkl", "ct_encoder.pkl", "gmm.pkl")
)

In [279]:
# Describe one new user with the same three columns used for training.
new_user = pd.DataFrame({'Age':[25], 'Gender':['Female'], 'Category_Grouped':['Health & Beauty']})
# Standardize Age with the reloaded scaler (transform only, no refit).
new_user[['Age']] = age_scaler.transform(new_user[['Age']])
# Encode with the reloaded column transformer. Its output is sparse or dense
# depending on how it was fitted, so densify only when it is sparse instead
# of calling .toarray() unconditionally.
X_new = ct_encoder.transform(new_user)
if hasattr(X_new, 'toarray'):
    X_new = X_new.toarray()
# Assign the user to a mixture component.
cluster_label = gmm_loaded.predict(X_new)
print(cluster_label)

[1]
