In [19]:
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
from itertools import combinations
import math
import warnings

In [20]:
df_students = pd.read_csv('/Users/althaaframadhan/Documents/Skripsi/Penelitian/preprocessing/students_200.csv')

In [21]:
print(df_students.head())
print(df_students.info())

   Age  Gender Academic_Level     Country  Avg_Daily_Usage_Hours  \
0   20  Female  Undergraduate      France                    3.0   
1   19  Female  Undergraduate  Azerbaijan                    4.8   
2   22  Female       Graduate  Kyrgyzstan                    2.9   
3   19  Female  Undergraduate  Bangladesh                    4.2   
4   22    Male       Graduate     Germany                    3.1   

  Most_Used_Platform Affects_Academic_Performance  Sleep_Hours_Per_Night  \
0          Instagram                           No                    9.0   
1             TikTok                          Yes                    5.7   
2           Facebook                           No                    7.0   
3          Instagram                          Yes                    7.8   
4           Facebook                           No                    8.4   

   Mental_Health_Score  Conflicts_Over_Social_Media  Addicted_Score  
0                    7                            2             

In [22]:
df = df_students.copy()
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['Age', 'Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night', 'Mental_Health_Score', 'Conflicts_Over_Social_Media', 'Addicted_Score']
Categorical columns: ['Gender', 'Academic_Level', 'Country', 'Most_Used_Platform', 'Affects_Academic_Performance']


In [23]:
df_encoded = df.copy()
cat_mappings = {}
for col in cat_cols:
    df_encoded[col], mapping = df[col].astype('category').factorize()
    cat_mappings[col] = mapping

In [24]:
X = df_encoded.to_numpy()
num_idx = [df_encoded.columns.get_loc(col) for col in num_cols]
cat_idx = [df_encoded.columns.get_loc(col) for col in cat_cols]


In [25]:
print("num_idx:", num_idx)
print("cat_idx:", cat_idx)
print("X.shape:", X.shape)

num_idx: [0, 4, 7, 8, 9, 10]
cat_idx: [1, 2, 3, 5, 6]
X.shape: (200, 11)


In [26]:
if len(num_idx) > 0:
    gamma = float(np.nanmean(np.std(X[:, num_idx].astype(float), axis=0)))
else:
    gamma = 1.0
print("gamma chosen =", gamma)


gamma chosen = 1.2221218607038884


In [27]:
def mixed_dissim(a, b, num_idx, cat_idx, gamma):
    # numeric: Euclidean
    if len(num_idx) > 0:
        diff_num = a[num_idx].astype(float) - b[num_idx].astype(float)
        num_dist = math.sqrt(np.sum(diff_num ** 2))
    else:
        num_dist = 0.0
    # categorical: simple matching (count mismatches)
    if len(cat_idx) > 0:
        cat_dist = np.sum(a[cat_idx] != b[cat_idx])
    else:
        cat_dist = 0.0
    return num_dist + gamma * cat_dist

In [28]:
def silhouette_mixed(X, labels, num_idx, cat_idx, gamma):
    labels = np.array(labels)
    n = X.shape[0]
    unique_labels = np.unique(labels)
    clusters = {lab: np.where(labels == lab)[0] for lab in unique_labels}
    sil_vals = np.zeros(n)
    for i in range(n):
        xi = X[i]
        own = labels[i]
        members = clusters[own]
        if len(members) == 1:
            a_i = 0.0
        else:
            others = members[members != i]
            a_i = np.mean([mixed_dissim(xi, X[j], num_idx, cat_idx, gamma) for j in others])
        b_vals = []
        for lab in unique_labels:
            if lab == own:
                continue
            mem = clusters[lab]
            if len(mem)==0: continue
            avg = np.mean([mixed_dissim(xi, X[j], num_idx, cat_idx, gamma) for j in mem])
            b_vals.append(avg)
        b_i = min(b_vals) if len(b_vals)>0 else 0.0
        denom = max(a_i, b_i)
        sil_vals[i] = 0.0 if denom==0 else (b_i - a_i) / denom
    return np.mean(sil_vals)

In [29]:
def dunn_index_mixed(X, labels, num_idx, cat_idx, gamma):
    labels = np.array(labels)
    unique_labels = np.unique(labels)
    # intra diameters
    diameters = []
    for lab in unique_labels:
        members = np.where(labels==lab)[0]
        if len(members) <= 1:
            diameters.append(0.0)
            continue
        maxd = 0.0
        for i in range(len(members)):
            for j in range(i+1, len(members)):
                d = mixed_dissim(X[members[i]], X[members[j]], num_idx, cat_idx, gamma)
                if d > maxd:
                    maxd = d
        diameters.append(maxd)
    max_diameter = max(diameters) if len(diameters)>0 else 0.0
    # inter-cluster min distance
    min_inter = float('inf')
    for a, b in combinations(unique_labels, 2):
        mem_a = np.where(labels==a)[0]
        mem_b = np.where(labels==b)[0]
        if len(mem_a)==0 or len(mem_b)==0:
            continue
        mind = float('inf')
        for i in mem_a:
            for j in mem_b:
                d = mixed_dissim(X[i], X[j], num_idx, cat_idx, gamma)
                if d < mind:
                    mind = d
        if mind < min_inter:
            min_inter = mind
    if max_diameter == 0 or min_inter==float('inf'):
        return 0.0
    return min_inter / max_diameter

In [30]:
results = []
labels_dict = {}
max_k = 10 if X.shape[0] >= 10 else max(2, X.shape[0]-1)

for k in range(2, max_k+1):
    print(f"Fitting k={k} ...")
    model = KPrototypes(n_clusters=k, init='random', max_iter=100, verbose=0, gamma=gamma)
    clusters = model.fit_predict(X, categorical=cat_idx)
    # cost (elbow)
    cost = getattr(model, 'cost_', None)
    if cost is None:
        centroids = model.cluster_centroids_
        cost = sum(mixed_dissim(X[i], centroids[clusters[i]], num_idx, cat_idx, gamma) for i in range(X.shape[0]))
    sil = silhouette_mixed(X, clusters, num_idx, cat_idx, gamma)
    dunn = dunn_index_mixed(X, clusters, num_idx, cat_idx, gamma)
    results.append({'k': k, 'cost': cost, 'silhouette': sil, 'dunn': dunn})
    labels_dict[k] = clusters.copy()


Fitting k=2 ...
Fitting k=3 ...
Fitting k=4 ...
Fitting k=5 ...
Fitting k=6 ...
Fitting k=7 ...
Fitting k=8 ...
Fitting k=9 ...
Fitting k=10 ...


In [31]:
results_df = pd.DataFrame(results)
print("\nResults summary:")
print(results_df)


Results summary:
    k         cost  silhouette      dunn
0   2  1508.287878    0.343990  0.042846
1   3  1192.658053    0.313780  0.064269
2   4  1001.248250    0.323578  0.142459
3   5   863.575150    0.309508  0.143294
4   6   789.279665    0.279893  0.058987
5   7   749.930065    0.251352  0.157874
6   8   713.839153    0.297305  0.232749
7   9   690.999616    0.268088  0.088480
8  10   657.416967    0.254090  0.168475


In [None]:
k_values = range(2, 9)

cluster_distributions = []  

for k in k_values:
    model = KPrototypes(n_clusters=k, random_state=42)
    clusters = model.fit_predict(X, categorical=cat_idx)
    
    counts = pd.Series(clusters).value_counts().sort_index().values
    cluster_distributions.append(counts)

for k, dist in zip(k_values, cluster_distributions):
    print(f"k = {k}: {dist}")


k = 2: [121  79]
k = 3: [74 57 69]
k = 4: [73 50 41 36]
k = 5: [51 35 44 29 41]
k = 6: [31 21 35 41 34 38]
k = 7: [35 29 12 38 20 35 31]
k = 8: [37 29 33 16 10 42 13 20]


In [50]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

k_optimal = 4
clusters = labels_dict[k_optimal]

# Pastikan DataFrame
df = pd.DataFrame(X).copy()
df['cluster'] = clusters.astype(int)

# Kolom numerik & kategorikal
numeric_cols = [df.columns[i] for i in num_idx]
categorical_cols = [df.columns[i] for i in cat_idx]

results = []

for c in sorted(df['cluster'].unique()):
    df_c = df[df['cluster'] == c]

    # ====== Numeric STD ======
    numeric_std = df_c[numeric_cols].std()

    # Normalisasi ke 0-1 (min-max std across ALL clusters)
    std_norm = (numeric_std - numeric_std.min()) / (numeric_std.max() - numeric_std.min())
    numeric_variation = std_norm.mean()  # rata rata variasi numerik cluster

    # ====== Categorical Variance (Entropy) ======
    entropies = []
    for col in categorical_cols:
        probs = df_c[col].value_counts(normalize=True)
        entropies.append(entropy(probs, base=2))  # log base-2 â†’ skala 0â€“1
    
    categorical_variation = np.mean(entropies)

    # ===== Total Variation per cluster =====
    total_variation = (numeric_variation + categorical_variation) / 2

    results.append({
        "cluster": c,
        "numeric_var": numeric_variation,
        "categorical_var": categorical_variation,
        "total_cluster_variation": total_variation
    })

variation_df = pd.DataFrame(results)

print("ðŸ“Œ Variasi Tiap Cluster:")
display(variation_df)

# ===== Model Variation Score =====
model_variation_score = variation_df["total_cluster_variation"].mean()
print(f"\nðŸŽ¯ Model Variation Score (k={k_optimal}): {model_variation_score:.4f}")


ðŸ“Œ Variasi Tiap Cluster:


Unnamed: 0,cluster,numeric_var,categorical_var,total_cluster_variation
0,0,0.533445,1.559209,1.046327
1,1,0.391454,1.368864,0.880159
2,2,0.500485,1.317892,0.909188
3,3,0.377912,1.617111,0.997511



ðŸŽ¯ Model Variation Score (k=4): 0.9583
