In [5]:
from clipp2 import *
# ---- Input data -------------------------------------------------------
# Tab-separated simulation file; each row is treated as one mutation
# further down (n is taken from the row count).
file_path = '/Users/yuding/Dropbox/GitHub/Multi_Region_CliPP/multi_clipp_simulation_data/simulation_data_cluster_3_region_1_read_depth_100_replica_2.tsv'
df = pd.read_csv(file_path, sep='\t')

# ---- Problem size -----------------------------------------------------
n = len(df)   # number of rows in the input table
m = 1         # width of each per-mutation block (presumably the number of regions — confirm)

# ---- Solver settings --------------------------------------------------
# rho and omega are handed to CliPP2; rho is also handed to the ADMM
# update routines in the step-by-step cell.
rho, omega = 0.8, 1
# Candidate gamma values (not consumed by the cells visible here; the
# gamma actually used is set in its own cell).
gamma_list = [0.01, 0.03, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
control_large = 5        # clipping bound applied to p and, through sigmoid(), to phi_hat
precision = 0.01         # the ADMM loop stops once its residual falls below this
max_iteration = 10_000   # hard cap on the number of ADMM iterations

In [6]:
# gamma used for this run: passed to CliPP2(...) and to update_v_SCAD(...)
# in the cells below. The value equals the first entry of gamma_list.
gamma = 0.01

In [7]:
# Run CliPP2 (presumably provided by the clipp2 star import — confirm) on the
# loaded table with the settings above. The return value is only displayed as
# the cell output (a dict containing at least a 'phi' array); it is not stored.
CliPP2(df, rho, gamma, omega, n, m)

{'phi': array([[0.17132847],
        [0.26446537],
        [0.66901667],
        [0.11807679],
        [0.66901667],
        [0.26446537],
        [0.19047619],
        [0.66901667],
        [0.19047619],
        [0.11807679],
        [0.66901667],
        [0.37486146],
        [0.52061357],
        [0.26446537],
        [0.32998243],
        [0.32998243],
        [0.52061357],
        [0.32998243],
        [0.19047619],
        [0.06849536],
        [0.37486146],
        [0.43674641],
        [0.22728675],
        [0.26446537],
        [0.11807679],
        [0.26446537],
        [0.14918669],
        [0.22728675],
        [0.22728675],
        [0.20584861],
        [0.17132847],
        [0.66901667],
        [0.43674641],
        [0.19047619],
        [0.37486146],
        [0.11807679],
        [0.32998243],
        [0.11807679],
        [0.11807679],
        [0.22728675],
        [0.20584861],
        [0.43674641],
        [0.37486146],
        [0.52061357],
        [0.06849536],
   

In [8]:
# ----------------------------------------------------------
#  Step-by-step ADMM run (initialisation + iteration loop)
#
#  Reads the notebook-level settings n, m, rho, gamma, control_large,
#  precision and max_iteration and leaves them untouched, so this cell
#  (and any cell that uses those settings) can be re-run with the same
#  result. Produces p, v, y, k, temp and phi_hat for the cells below.
# ----------------------------------------------------------

# Initialize combinations and mappings.
# The pairs are enumerated from an explicitly sorted sequence so that every
# key is (l1, l2) with l1 < l2 in lexicographic order; the clustering cell
# looks pairs up as (j, i) with j < i and depends on that.
sets = set(range(n))
combinations_2 = list(itertools.combinations(sorted(sets), 2))
pairs_mapping = {pair: index for index, pair in enumerate(combinations_2)}
pairs_mapping_inverse = dict(enumerate(combinations_2))

# Get matrices
read_mat = get_read_mat(df)
total_read_mat = get_total_read_mat(df)
c_mat = get_c_mat(df)
bb_mat = get_b_mat(df)
tumor_cn_mat = get_tumor_cn_mat(df)
linearApprox = get_linear_approximation(c_mat)
wcut = np.array(linearApprox[0])
coef = np.array(linearApprox[1])

# Initialize variables
# 12/12/2024 example step
# phi_hat: read ratio rescaled so its maximum is at most 1, then clamped to
# [sigmoid(-control_large), sigmoid(control_large)].
phi_hat = (read_mat / (total_read_mat * c_mat))
scale_parameter = np.max([1, np.max(phi_hat)])
phi_hat = phi_hat / scale_parameter
phi_hat[phi_hat > sigmoid(control_large)] = sigmoid(control_large)
phi_hat[phi_hat < sigmoid(-control_large)] = sigmoid(-control_large)

# p: inverse_sigmoid(phi_hat), clamped to [-control_large, control_large]
# and flattened to length n * m.
p = inverse_sigmoid(phi_hat)
p[p > control_large] = control_large
p[p < -control_large] = -control_large
p = p.reshape([n * m])

# v holds one length-m block per pair, initialised from a_mat @ p
# (presumably the pairwise difference of the two p blocks — confirm
# against a_mat_generator).
v = np.zeros([len(combinations_2) * m])
for index_v, (l1, l2) in enumerate(combinations_2):
    a_mat = a_mat_generator(l1, l2, n, m)
    v[index_v * m: (index_v + 1) * m] = matmul_by_torch(a_mat, p)

y = np.ones([len(combinations_2) * m])

# Per-pair weights, all ones. Kept under its own name so the scalar `omega`
# from the settings cell is not overwritten.
omega_weights = np.ones([len(combinations_2)])

# Working copy of the penalty: it grows by 2% per iteration, while the
# notebook-level `rho` keeps its configured value.
rho_k = rho

k = 0
temp = 100  # residual; starts above `precision` so the loop is entered

# ADMM
while k < max_iteration and precision < temp:
    # Update p
    p = update_p(
        p, v, y, n, m,
        read_mat, total_read_mat, bb_mat, tumor_cn_mat,
        coef, wcut, combinations_2, pairs_mapping,
        rho_k, control_large
    )
    # Residual = largest per-pair norm of (a_mat @ p - v) after the updates
    temp = 0
    # Update v, y
    for index_v, (l1, l2) in enumerate(combinations_2):
        start_v = index_v * m
        end_v = start_v + m

        # SCAD-based v update
        v[start_v: end_v] = update_v_SCAD(
            index_v, pairs_mapping_inverse, p, y,
            n, m, rho_k, omega_weights[index_v], gamma
        )
        # Dual variable update
        y[start_v: end_v] = update_y(
            y[start_v: end_v],
            v[start_v: end_v], index_v, pairs_mapping_inverse,
            p, n, m, rho_k
        )
        a_mat = a_mat_generator(l1, l2, n, m)
        # Check the difference norm
        temp = max(temp, np.linalg.norm(matmul_by_torch(a_mat, p) - v[start_v: end_v]))

    rho_k = 1.02 * rho_k
    k = k + 1
    # e.g. print progress if desired
    print('\r', k, ',', temp, end="")


 13 , 0.008431190170545801

In [9]:
# ----------------------------------------------------------
#  Clustering: turn the pairwise blocks of `v` into cluster labels
# ----------------------------------------------------------
diff = np.zeros((n, n))
class_label = -np.ones(n)       # -1 marks "not assigned yet"
class_label[0] = 0              # mutation 0 seeds cluster 0
group_size = [1]                # group_size[c] = current member count of cluster c
labl = 1                        # next unused cluster id
least_mut = np.ceil(0.05 * n)   # clusters smaller than this are dissolved in PART A


def _smallest_nonempty(sizes):
    """Return (size, np.where index tuple) for the smallest non-empty cluster(s)."""
    sizes = np.array(sizes)
    smallest = np.min(sizes[sizes > 0])
    return smallest, np.where(sizes == smallest)


# Pairwise distance = norm of the pair's block in v; anything at or below
# 0.05 is snapped to 0 so that "== 0" below means "fused".
for col in range(1, n):
    for row in range(col):
        lo = pairs_mapping[(row, col)] * m
        pair_norm = np.linalg.norm(v[lo: lo + m])
        diff[row, col] = pair_norm if pair_norm > 0.05 else 0
        diff[col, row] = diff[row, col]

# Initial cluster assignment: each mutation joins the cluster of the first
# earlier mutation it is fused with, otherwise it opens a new cluster.
for snv in range(1, n):
    fused_with = np.flatnonzero(diff[:snv, snv] == 0)
    if fused_with.size:
        target = int(class_label[fused_with[0]])
        class_label[snv] = target
        group_size[target] += 1
    else:
        class_label[snv] = labl
        group_size.append(1)
        labl += 1

# ----------------------------------------------------------
#  PART A: Refine small clusters by reassigning to closest
# ----------------------------------------------------------
tmp_size, tmp_grp = _smallest_nonempty(group_size)

while tmp_size < least_mut:
    # Members of the (first) smallest cluster; each one is moved elsewhere.
    members = np.where(class_label == tmp_grp[0][0])[0]

    for mut_idx in members:
        # Distances from mut_idx to every mutation, read from the upper
        # triangle of diff: column part above the diagonal, a placeholder
        # of 100 for itself, then the row part to the right. For the first
        # and last index one of the two slices is simply empty. All entries
        # are non-negative (norms, zeros, or earlier +100 penalties).
        dist = np.concatenate((
            diff[:mut_idx, mut_idx],
            [100],
            diff[mut_idx, (mut_idx + 1):],
        ))
        # Penalise current cluster-mates so they are never chosen ...
        dist[members] += 100
        # ... and store the penalised distances back into diff.
        diff[:mut_idx, mut_idx] = dist[:mut_idx]
        diff[mut_idx, (mut_idx + 1):] = dist[(mut_idx + 1):]

        # Reassign to the cluster of the nearest mutation
        nearest = dist.argmin()
        group_size[int(class_label[mut_idx])] -= 1
        destination = int(class_label[nearest])
        class_label[mut_idx] = destination
        group_size[destination] += 1

    tmp_size, tmp_grp = _smallest_nonempty(group_size)


In [10]:
# ----------------------------------------------------------
#  PART B: Recompute cluster centroids
# ----------------------------------------------------------
# Surviving cluster ids are renumbered 0..K-1 (in ascending order of their
# old id) and each centroid is the total_read_mat-weighted mean of phi_hat
# over the cluster's members, computed per column.
labels = np.unique(class_label)
phi_out = np.zeros((len(labels), m))

for new_id, old_id in enumerate(labels):
    members = np.flatnonzero(class_label == old_id)
    class_label[members] = new_id
    reads = total_read_mat[members, :]
    weighted_sum = np.sum(phi_hat[members, :] * reads, axis=0)
    phi_out[new_id, :] = weighted_sum / np.sum(reads, axis=0)