In [2]:
# cài đặt thư viện cần thiết
!pip install sklearn fuzzy-c-means



In [10]:
# load thư viện cần thiết

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from fcmeans import FCM

In [65]:
# random state
rs = 12

In [4]:
# đọc dữ liệu
df = pd.read_csv('../data/rfm_data.csv')
df

Unnamed: 0,CustomerID,Recency,Frequency,Moneytary
0,101000281,0.0,0.008787,0.198388
1,101000282,0.0,0.005272,0.014864
2,101000283,0.0,0.007030,0.018580
3,101000320,0.0,0.000000,0.007432
4,101000325,0.0,0.000000,0.037294
...,...,...,...,...
12359,107000620,1.0,0.000000,0.021311
12360,107000623,1.0,0.000000,0.003716
12361,107000624,1.0,0.000000,0.021311
12362,107000625,1.0,0.000000,0.021311


In [5]:
# load trọng số lên
rfm_weight = pd.read_csv('../data/rfm_weight.csv')
rfm_weight

Unnamed: 0,variable,weight
0,Recency,0.137
1,Frequency,0.328
2,Moneytary,0.535


In [6]:
# lấy trọng số ra
w = rfm_weight.set_index('variable').to_dict()['weight']
w

{'Recency': 0.137, 'Frequency': 0.328, 'Moneytary': 0.535}

In [7]:
# nhân trọng số vào mỗi cột
df.iloc[:, 1:] = df.iloc[:, 1:].apply(lambda x: x * w[x.name])

In [8]:
df

Unnamed: 0,CustomerID,Recency,Frequency,Moneytary
0,101000281,0.000,0.002882,0.106138
1,101000282,0.000,0.001729,0.007952
2,101000283,0.000,0.002306,0.009940
3,101000320,0.000,0.000000,0.003976
4,101000325,0.000,0.000000,0.019952
...,...,...,...,...
12359,107000620,0.137,0.000000,0.011401
12360,107000623,0.137,0.000000,0.001988
12361,107000624,0.137,0.000000,0.011401
12362,107000625,0.137,0.000000,0.011401


## Chuẩn bị dữ liệu

In [12]:
cust_id = df.iloc[:, 0].values
X = df.iloc[:, 1:].values
cust_id, X

(array([101000281, 101000282, 101000283, ..., 107000624, 107000625,
        107000626], dtype=int64),
 array([[0.        , 0.00288225, 0.10613779],
        [0.        , 0.00172935, 0.00795232],
        [0.        , 0.0023058 , 0.0099404 ],
        ...,
        [0.137     , 0.        , 0.01140117],
        [0.137     , 0.        , 0.01140117],
        [0.137     , 0.        , 0.01140117]]))

# Phân cụm

## KMeans

In [18]:
# dùng thang đo silhouette để chọn số cụm
# để số cụm từ 3 -> 10

range_k = np.arange(2, 11)
range_k

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])

In [75]:
def make_kmean(k: int):
    model = KMeans(n_clusters=k, random_state=rs)
    model.fit(X)
    cluster_labels = model.predict(X)
    sil_score = silhouette_score(X, cluster_labels)
    print(f"K: {k} done")
    return dict(
        k=k,
        cluster_labels=cluster_labels,
        sil_score=sil_score
    )

In [76]:
kmean_final_cluster = [make_kmean(k) for k in range_k]

K: 2 done
K: 3 done
K: 4 done
K: 5 done
K: 6 done
K: 7 done
K: 8 done
K: 9 done
K: 10 done


In [77]:
# sắp xếp các mô hình KMean theo chỉ số sihouette (từ cao xuống thấp)
sorted_kmean_final_cluster = sorted(kmean_final_cluster, key=lambda x: x['sil_score'], reverse=True)

In [84]:
sorted_kmean_final_cluster

[{'k': 3,
  'cluster_labels': array([2, 2, 2, ..., 1, 1, 1]),
  'sil_score': 0.6116619074690446},
 {'k': 5,
  'cluster_labels': array([2, 3, 3, ..., 0, 0, 0]),
  'sil_score': 0.5928988170286317},
 {'k': 4,
  'cluster_labels': array([3, 3, 3, ..., 2, 2, 2]),
  'sil_score': 0.58774732047867},
 {'k': 2,
  'cluster_labels': array([1, 1, 1, ..., 0, 0, 0]),
  'sil_score': 0.5783160307359263},
 {'k': 9,
  'cluster_labels': array([2, 1, 1, ..., 3, 3, 3]),
  'sil_score': 0.5360868267549826},
 {'k': 6,
  'cluster_labels': array([5, 0, 0, ..., 4, 4, 4]),
  'sil_score': 0.5289474893998432},
 {'k': 10,
  'cluster_labels': array([7, 2, 2, ..., 1, 1, 1]),
  'sil_score': 0.4995192937946345},
 {'k': 8,
  'cluster_labels': array([3, 1, 1, ..., 0, 0, 0]),
  'sil_score': 0.49403670067184413},
 {'k': 7,
  'cluster_labels': array([5, 1, 1, ..., 3, 3, 3]),
  'sil_score': 0.4743109485356541}]

In [78]:
# lấy mô hình có giá trị sihouette tốt nhất

best_k_kmean = sorted_kmean_final_cluster[0]['k']
kmean_cluster_labels = sorted_kmean_final_cluster[0]['cluster_labels']
kmean_sil_score = sorted_kmean_final_cluster[0]['sil_score']
print(f'Số cụm tốt nhất cho KMeans là {best_k_kmean} với giá trị sihouette: {kmean_sil_score}')

Số cụm tốt nhất cho KMeans là 3 với giá trị sihouette: 0.6116619074690446


In [79]:
kmean_cluster_df = pd.DataFrame(
    {'CustomerID': cust_id, 'Cluster Label': kmean_cluster_labels}
)
df.merge(kmean_cluster_df, on='CustomerID').to_csv(
    '../data/cluster_kmeans.csv', index=False)

### Fuzzy C-Mean

In [80]:
def make_fuzzycmean(k: int):
    model = FCM(n_clusters=k, random_state=rs)
    model.fit(X)
    cluster_labels = model.predict(X)
    sil_score = silhouette_score(X, cluster_labels)
    print(f"K: {k} done")
    return dict(
        k=k,
        cluster_labels=cluster_labels,
        sil_score=sil_score
    )

In [81]:
fuzzycmean_final_cluster = [make_fuzzycmean(k) for k in range_k]

K: 2 done
K: 3 done
K: 4 done
K: 5 done
K: 6 done
K: 7 done
K: 8 done
K: 9 done
K: 10 done


In [82]:
# sắp xếp các mô hình Fuzzy C Mean theo chỉ số sihouette (từ cao xuống thấp)
sorted_fuzzycmean_final_cluster = sorted(fuzzycmean_final_cluster, key=lambda x: x['sil_score'], reverse=True)

In [83]:
sorted_fuzzycmean_final_cluster

[{'k': 2,
  'cluster_labels': array([1, 1, 1, ..., 0, 0, 0], dtype=int64),
  'sil_score': 0.5786705987260033},
 {'k': 3,
  'cluster_labels': array([2, 2, 2, ..., 0, 0, 0], dtype=int64),
  'sil_score': 0.5327575117658847},
 {'k': 4,
  'cluster_labels': array([2, 2, 2, ..., 0, 0, 0], dtype=int64),
  'sil_score': 0.4677845169109287},
 {'k': 5,
  'cluster_labels': array([3, 3, 3, ..., 4, 4, 4], dtype=int64),
  'sil_score': 0.42309354954017947},
 {'k': 10,
  'cluster_labels': array([7, 7, 7, ..., 8, 8, 8], dtype=int64),
  'sil_score': 0.39506491646153696},
 {'k': 8,
  'cluster_labels': array([6, 6, 6, ..., 2, 2, 2], dtype=int64),
  'sil_score': 0.370863482095966},
 {'k': 6,
  'cluster_labels': array([1, 1, 1, ..., 3, 3, 3], dtype=int64),
  'sil_score': 0.3112871120843759},
 {'k': 7,
  'cluster_labels': array([2, 2, 2, ..., 6, 6, 6], dtype=int64),
  'sil_score': 0.2962601267763827},
 {'k': 9,
  'cluster_labels': array([7, 7, 7, ..., 6, 6, 6], dtype=int64),
  'sil_score': 0.2221153676618968}]

In [85]:
# lấy mô hình có giá trị sihouette tốt nhất

best_k_fuzzycmean = sorted_fuzzycmean_final_cluster[0]['k']
fuzzycmean_cluster_labels = sorted_fuzzycmean_final_cluster[0]['cluster_labels']
fuzzycmean_sil_score = sorted_fuzzycmean_final_cluster[0]['sil_score']
print(f'Số cụm tốt nhất cho KMeans là {best_k_fuzzycmean} với giá trị sihouette: {fuzzycmean_sil_score}')

Số cụm tốt nhất cho KMeans là 2 với giá trị sihouette: 0.5786705987260033


In [86]:
fuzzycmean_cluster_df = pd.DataFrame(
    {'CustomerID': cust_id, 'Cluster Label': fuzzycmean_cluster_labels}
)
df.merge(fuzzycmean_cluster_df, on='CustomerID').to_csv(
    '../data/cluster_fuzzycmean.csv', index=False)

In [93]:
total_match = (fuzzycmean_cluster_df.iloc[:, -1] == kmean_cluster_df.iloc[:, -1]).sum()
print("Số lượng khách hàng có giá trị cụm ở cả hai thuật toán bằng nhau:", total_match)

Số lượng khách hàng có giá trị cụm ở cả hai thuật toán bằng nhau: 201
