In [1]:
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
from itertools import combinations
import math
import warnings

In [2]:
df_students = pd.read_csv('/Users/althaaframadhan/Documents/Skripsi/Penelitian/preprocessing/trends_200.csv')

In [3]:
print(df_students.head())
print(df_students.info())

    Platform     Hashtag Content_Type   Region    Views   Likes  Shares  \
0  Instagram     #Comedy        Video   Canada  3239663  288436   94297   
1  Instagram  #Education         Post    Japan  3759036   49515   94487   
2  Instagram     #Gaming       Shorts   Brazil  3697216  209784   41633   
3     TikTok    #Fitness         Post   Brazil  2878436  149765    9840   
4     TikTok    #Fashion  Live Stream  Germany  1213710  468518    1746   

   Comments Engagement_Level  
0      5399             High  
1     19525           Medium  
2     19831           Medium  
3     42118             High  
4     24547             High  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Platform          200 non-null    object
 1   Hashtag           200 non-null    object
 2   Content_Type      200 non-null    object
 3   Region            200 no

In [4]:
df = df_students.copy()
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['Views', 'Likes', 'Shares', 'Comments']
Categorical columns: ['Platform', 'Hashtag', 'Content_Type', 'Region', 'Engagement_Level']


In [5]:
df_encoded = df.copy()
cat_mappings = {}
for col in cat_cols:
    df_encoded[col], mapping = df[col].astype('category').factorize()
    cat_mappings[col] = mapping

In [6]:
X = df_encoded.to_numpy()
num_idx = [df_encoded.columns.get_loc(col) for col in num_cols]
cat_idx = [df_encoded.columns.get_loc(col) for col in cat_cols]


In [7]:
print("num_idx:", num_idx)
print("cat_idx:", cat_idx)
print("X.shape:", X.shape)

num_idx: [4, 5, 6, 7]
cat_idx: [0, 1, 2, 3, 8]
X.shape: (200, 9)


In [8]:
if len(num_idx) > 0:
    gamma = float(np.nanmean(np.std(X[:, num_idx].astype(float), axis=0)))
else:
    gamma = 1.0
print("gamma chosen =", gamma)


gamma chosen = 427966.2535745349


In [9]:
def mixed_dissim(a, b, num_idx, cat_idx, gamma):
    # numeric: Euclidean
    if len(num_idx) > 0:
        diff_num = a[num_idx].astype(float) - b[num_idx].astype(float)
        num_dist = math.sqrt(np.sum(diff_num ** 2))
    else:
        num_dist = 0.0
    # categorical: simple matching (count mismatches)
    if len(cat_idx) > 0:
        cat_dist = np.sum(a[cat_idx] != b[cat_idx])
    else:
        cat_dist = 0.0
    return num_dist + gamma * cat_dist

In [10]:
def silhouette_mixed(X, labels, num_idx, cat_idx, gamma):
    labels = np.array(labels)
    n = X.shape[0]
    unique_labels = np.unique(labels)
    clusters = {lab: np.where(labels == lab)[0] for lab in unique_labels}
    sil_vals = np.zeros(n)
    for i in range(n):
        xi = X[i]
        own = labels[i]
        members = clusters[own]
        if len(members) == 1:
            a_i = 0.0
        else:
            others = members[members != i]
            a_i = np.mean([mixed_dissim(xi, X[j], num_idx, cat_idx, gamma) for j in others])
        b_vals = []
        for lab in unique_labels:
            if lab == own:
                continue
            mem = clusters[lab]
            if len(mem)==0: continue
            avg = np.mean([mixed_dissim(xi, X[j], num_idx, cat_idx, gamma) for j in mem])
            b_vals.append(avg)
        b_i = min(b_vals) if len(b_vals)>0 else 0.0
        denom = max(a_i, b_i)
        sil_vals[i] = 0.0 if denom==0 else (b_i - a_i) / denom
    return np.mean(sil_vals)

In [11]:
def dunn_index_mixed(X, labels, num_idx, cat_idx, gamma):
    labels = np.array(labels)
    unique_labels = np.unique(labels)
    # intra diameters
    diameters = []
    for lab in unique_labels:
        members = np.where(labels==lab)[0]
        if len(members) <= 1:
            diameters.append(0.0)
            continue
        maxd = 0.0
        for i in range(len(members)):
            for j in range(i+1, len(members)):
                d = mixed_dissim(X[members[i]], X[members[j]], num_idx, cat_idx, gamma)
                if d > maxd:
                    maxd = d
        diameters.append(maxd)
    max_diameter = max(diameters) if len(diameters)>0 else 0.0
    # inter-cluster min distance
    min_inter = float('inf')
    for a, b in combinations(unique_labels, 2):
        mem_a = np.where(labels==a)[0]
        mem_b = np.where(labels==b)[0]
        if len(mem_a)==0 or len(mem_b)==0:
            continue
        mind = float('inf')
        for i in mem_a:
            for j in mem_b:
                d = mixed_dissim(X[i], X[j], num_idx, cat_idx, gamma)
                if d < mind:
                    mind = d
        if mind < min_inter:
            min_inter = mind
    if max_diameter == 0 or min_inter==float('inf'):
        return 0.0
    return min_inter / max_diameter

In [12]:
results = []
labels_dict = {}
max_k = 10 if X.shape[0] >= 10 else max(2, X.shape[0]-1)

for k in range(2, max_k+1):
    print(f"Fitting k={k} ...")
    model = KPrototypes(n_clusters=k, init='random', max_iter=100, verbose=0, gamma=gamma)
    clusters = model.fit_predict(X, categorical=cat_idx)
    # cost (elbow)
    cost = getattr(model, 'cost_', None)
    if cost is None:
        centroids = model.cluster_centroids_
        cost = sum(mixed_dissim(X[i], centroids[clusters[i]], num_idx, cat_idx, gamma) for i in range(X.shape[0]))
    sil = silhouette_mixed(X, clusters, num_idx, cat_idx, gamma)
    dunn = dunn_index_mixed(X, clusters, num_idx, cat_idx, gamma)
    results.append({'k': k, 'cost': cost, 'silhouette': sil, 'dunn': dunn})
    labels_dict[k] = clusters.copy()


Fitting k=2 ...
Fitting k=3 ...
Fitting k=4 ...
Fitting k=5 ...
Fitting k=6 ...
Fitting k=7 ...
Fitting k=8 ...
Fitting k=9 ...
Fitting k=10 ...


In [13]:
results_df = pd.DataFrame(results)
print("\nResults summary:")
print(results_df)


Results summary:
    k          cost  silhouette      dunn
0   2  1.135129e+14    0.386246  0.216395
1   3  4.086369e+13    0.317734  0.251928
2   4  2.816868e+13    0.239058  0.282110
3   5  1.983790e+13    0.200129  0.207655
4   6  1.335221e+13    0.158905  0.233359
5   7  1.089230e+13    0.136171  0.242948
6   8  9.663487e+12    0.114447  0.193980
7   9  8.640119e+12    0.106438  0.195194
8  10  8.061319e+12    0.094124  0.195194


In [14]:
k_values = range(2, 9)

cluster_distributions = []  

for k in k_values:
    model = KPrototypes(n_clusters=k, random_state=42)
    clusters = model.fit_predict(X, categorical=cat_idx)
    
    counts = pd.Series(clusters).value_counts().sort_index().values
    cluster_distributions.append(counts)

for k, dist in zip(k_values, cluster_distributions):
    print(f"k = {k}: {dist}")


k = 2: [114  86]
k = 3: [69 64 67]
k = 4: [30 44 65 61]
k = 5: [36 61 33 27 43]
k = 6: [33 36 40 35 29 27]
k = 7: [28 35 24 26 19 32 36]
k = 8: [25 28 19 22 24 36 32 14]


In [15]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

k_optimal = 4
clusters = labels_dict[k_optimal]

# Pastikan DataFrame
df = pd.DataFrame(X).copy()
df['cluster'] = clusters.astype(int)

# Kolom numerik & kategorikal
numeric_cols = [df.columns[i] for i in num_idx]
categorical_cols = [df.columns[i] for i in cat_idx]

results = []

for c in sorted(df['cluster'].unique()):
    df_c = df[df['cluster'] == c]

    # ====== Numeric STD ======
    numeric_std = df_c[numeric_cols].std()

    # Normalisasi ke 0-1 (min-max std across ALL clusters)
    std_norm = (numeric_std - numeric_std.min()) / (numeric_std.max() - numeric_std.min())
    numeric_variation = std_norm.mean()  # rata rata variasi numerik cluster

    # ====== Categorical Variance (Entropy) ======
    entropies = []
    for col in categorical_cols:
        probs = df_c[col].value_counts(normalize=True)
        entropies.append(entropy(probs, base=2))  # log base-2 â†’ skala 0â€“1
    
    categorical_variation = np.mean(entropies)

    # ===== Total Variation per cluster =====
    total_variation = (numeric_variation + categorical_variation) / 2

    results.append({
        "cluster": c,
        "numeric_var": numeric_variation,
        "categorical_var": categorical_variation,
        "total_cluster_variation": total_variation
    })

variation_df = pd.DataFrame(results)

print("ðŸ“Œ Variasi Tiap Cluster:")
display(variation_df)

# ===== Model Variation Score =====
model_variation_score = variation_df["total_cluster_variation"].mean()
print(f"\nðŸŽ¯ Model Variation Score (k={k_optimal}): {model_variation_score:.4f}")


ðŸ“Œ Variasi Tiap Cluster:


Unnamed: 0,cluster,numeric_var,categorical_var,total_cluster_variation
0,0,0.377183,2.374005,1.375594
1,1,0.377107,2.393765,1.385436
2,2,0.35435,2.417435,1.385892
3,3,0.34373,2.423043,1.383387



ðŸŽ¯ Model Variation Score (k=4): 1.3826
