In [1]:
import pandas as pd
import numpy as np
from item_cluster_analysis.ICLUST import iclust_cluster
from item_cluster_analysis.correlation_computation import compute_mixed_corr_matrix

In [2]:
# Toy survey sample (8 respondents) mixing numeric, ordinal and nominal columns.
# Column roles, consumed by the correlation step further down.
numerical_vars = ['Income_Annual', 'Age_Years']
ordinal_vars = ['Service_Rating', 'Job_Satisfaction']
categorical_vars = ['Region_EastWest']

# Explicit level order for the ordered categorical: Low < Medium < High.
rating_order = ['Low', 'Medium', 'High']

data = dict(
    Income_Annual=[40000, 65000, 30000, 110000, 55000, 90000, 45000, 70000],
    Service_Rating=['Low', 'Medium', 'Low', 'High', 'Medium', 'High', 'Medium', 'Low'],
    Region_EastWest=['East', 'West', 'East', 'East', 'West', 'West', 'East', 'West'],
    Age_Years=[35, 52, 22, 60, 41, 48, 29, 55],
    Job_Satisfaction=[5, 3, 2, 5, 4, 3, 4, 2],
)

# Build the frame and convert Service_Rating to an ordered categorical in one
# expression; `assign` keeps the column in its original position.
df = pd.DataFrame(data).assign(
    Service_Rating=lambda frame: pd.Categorical(
        frame['Service_Rating'], categories=rating_order, ordered=True
    )
)

df

Unnamed: 0,Income_Annual,Service_Rating,Region_EastWest,Age_Years,Job_Satisfaction
0,40000,Low,East,35,5
1,65000,Medium,West,52,3
2,30000,Low,East,22,2
3,110000,High,East,60,5
4,55000,Medium,West,41,4
5,90000,High,West,48,3
6,45000,Medium,East,29,4
7,70000,Low,West,55,2


In [3]:
# Correlation matrix across the mixed-type columns of `df`; each column's role
# (numerical / ordinal / categorical) is passed explicitly by keyword.
# NOTE(review): the recorded output reports Income_Annual vs Service_Rating as
# about -0.49, although in the sample the two 'High' ratings belong to the two
# highest incomes. Confirm that compute_mixed_corr_matrix honours the ordered
# categorical level order (Low < Medium < High) rather than another coding.
correlation_matrix = compute_mixed_corr_matrix(
    df,
    numerical_vars=numerical_vars,
    ordinal_vars=ordinal_vars,
    categorical_vars=categorical_vars,
)

correlation_matrix

Unnamed: 0,Region_EastWest,Income_Annual,Age_Years,Service_Rating,Job_Satisfaction
Region_EastWest,1.0,0.275086,0.501305,0.160128,0.447214
Income_Annual,0.275086,1.0,0.875051,-0.491354,0.09759
Age_Years,0.501305,0.875051,1.0,-0.365366,0.09759
Service_Rating,0.160128,-0.491354,-0.365366,1.0,-0.05164
Job_Satisfaction,0.447214,0.09759,0.09759,-0.05164,1.0


In [4]:
# Simulate item responses from a three-factor model and compute the item
# correlation matrix `R` that is clustered below.
#
# A seeded Generator replaces the global, unseeded `np.random.randn`, so that
# Restart & Run All reproduces the same data (and therefore the same merge log).
SEED = 42
rng = np.random.default_rng(SEED)

p = 12    # number of items
n = 3000  # number of simulated respondents

# Correlation matrix of the three latent factors.
phi = np.array([
    [1.0, 0.35, 0.20],
    [0.35, 1.0, 0.30],
    [0.20, 0.30, 1.0]
])
# Cholesky factor of phi: multiplying independent normals by L_f.T gives
# factor scores whose population correlation is phi.
L_f = np.linalg.cholesky(phi)
F = rng.standard_normal((n, 3)) @ L_f.T

# Primary loadings: four consecutive items per factor.
loadings = np.zeros((p, 3))
loadings[0:4, 0] = [0.85, 0.80, 0.75, 0.60]
loadings[4:8, 1] = [0.88, 0.82, 0.78, 0.62]
loadings[8:12, 2] = [0.86, 0.77, 0.70, 0.58]
# Secondary (cross) loadings on I04, I08 and I10.
loadings[3, 1] = 0.35
loadings[7, 2] = 0.30
loadings[9, 0] = 0.25

# Per-item noise variances; noise is independent across items.
uniq = np.array([0.40, 0.45, 0.50, 0.55, 0.38, 0.44, 0.50, 0.60, 0.42, 0.52, 0.58, 0.64])
E = rng.standard_normal((n, p)) * np.sqrt(uniq)

X = F @ loadings.T + E

# Flip the sign of I03 and I09 so the data contain reverse-keyed items.
X[:, 2] *= -1
X[:, 8] *= -1

cols = [f"I{i+1:02d}" for i in range(p)]
# NOTE: this rebinds `df` (previously the small mixed-type example frame); the
# name is kept unchanged in case later cells rely on it.
df = pd.DataFrame(X, columns=cols)

R = df.corr()


# Cluster the items from their correlation matrix `R`, asking for three
# clusters. With verbose=True the call logs one "step k: merge ..." line per
# merge. NOTE(review): the option names look like those of ICLUST in R's
# `psych` package; presumably the semantics match, but confirm against the
# iclust_cluster documentation.
iclust_options = dict(
    n_clusters=3,
    alpha_rule=3,
    beta_rule=1,
    alpha_size=3,
    beta_size=4,
    correct=True,
    reverse=True,
    beta_min=0.5,
    verbose=True,
)
res = iclust_cluster(R, **iclust_options)

# Header line followed by the final cluster assignment.
print("cluster:", res["clusters"], sep="\n")

step 1: merge V6 + V5 -> C1 | r12=0.654, alpha=0.791, beta=0.791, size=2 | items=['I05', 'I06']
step 2: merge V10 + V9 -> C2 | r12=-0.602, alpha=0.752, beta=0.752, size=2 | items=['I09', 'I10']
step 3: merge V11 + C2 -> C3 | r12=-0.585, alpha=0.786, beta=0.738, size=3 | items=['I09', 'I10', 'I11']
step 4: merge V12 + C3 -> C4 | r12=-0.521, alpha=0.795, beta=0.685, size=4 | items=['I09', 'I10', 'I11', 'I12']
step 5: merge V8 + C4 -> C5 | r12=-0.411, alpha=0.787, beta=0.582, size=5 | items=['I08', 'I09', 'I10', 'I11', 'I12']
step 6: merge C5 + C1 -> C6 | r12=0.377, alpha=0.793, beta=0.547, size=7 | items=['I05', 'I06', 'I08', 'I09', 'I10', 'I11', 'I12']
step 7: merge V2 + V1 -> C7 | r12=0.602, alpha=0.751, beta=0.751, size=2 | items=['I01', 'I02']
step 8: merge V7 + C6 -> C8 | r12=0.498, alpha=0.810, beta=0.665, size=8 | items=['I05', 'I06', 'I07', 'I08', 'I09', 'I10', 'I11', 'I12']
step 9: merge V3 + C7 -> C9 | r12=-0.650, alpha=0.811, beta=0.788, size=3 | items=['I01', 'I02', 'I03']
cl