# 01 - Q bar calculation

In [19]:
from scipy.sparse import csc_matrix
import numpy as np
import dask.array as da
import math

In [68]:
# Toy example in COO triplet form: data[k] is stored at position (row[k], col[k]).
row = np.array([0, 2, 2, 0, 1, 2])
col = np.array([0, 0, 1, 2, 2, 2])
data = np.array([1, 2, 3, 4, 5, 6])

# Assemble the triplets into a 3x3 CSC matrix; dtype='float32' stores the
# integer values above as 32-bit floats.
m = csc_matrix((data, (row, col)), shape=(3, 3), dtype='float32')

In [107]:
def calculate_Q(X: csc_matrix, update_in_place=False) -> np.ndarray:
    """
    Compute the word-word correlation matrix Q from a feature-by-observation matrix.

    Every non-empty observation (column) ``x`` with total ``n = sum(x)`` contributes
    ``(x x^T - diag(x)) / (n * (n - 1))``. Q is the sum of these contributions
    divided by ``n_observations`` (empty columns contribute nothing but are still
    counted in that divisor).

    :param X: a CSC matrix of shape (rows=features, cols=observations).
    :param update_in_place: if False (the default) the computation runs on a copy and
        X is left untouched (a non-floating X is copied as float64). If True the copy
        is skipped and ``X.data`` is overwritten with the rescaled entries, so X no
        longer holds the original values afterwards. Q itself is always a new array.
    :return: the word-word correlation matrix Q as a dense Numpy ndarray of shape
        (n_features, n_features).
    :raises TypeError: if X is not in CSC format, or if ``update_in_place`` is True
        and X does not have a floating-point dtype (rescaled values cannot be stored).
    :raises ValueError: if a non-empty column has a total ``n`` for which
        ``n * (n - 1)`` is not strictly positive (e.g. a single-count observation).
        With ``update_in_place=True`` columns processed before the failure have
        already been rescaled.
    """

    # The loop below reads indptr as column pointers, which only holds for CSC.
    if getattr(X, "format", None) != "csc":
        raise TypeError("X must be a scipy.sparse matrix in CSC format")

    n_features, n_observations = X.shape

    if np.issubdtype(X.dtype, np.floating):
        if not update_in_place:
            X = X.copy()
    elif update_in_place:
        # Writing fractional values into integer storage would truncate them.
        raise TypeError(
            "update_in_place=True requires a floating-point matrix, got dtype {}".format(X.dtype)
        )
    else:
        # astype returns a new matrix, so the caller's X stays untouched.
        X = X.astype(np.float64)

    diagonal = np.zeros(n_features)

    for col_idx in range(n_observations):
        col_start = X.indptr[col_idx]
        col_end = X.indptr[col_idx + 1]
        if col_start == col_end:
            # No stored entries: nothing to rescale and no contribution to Q.
            continue

        col_entries = X.data[col_start:col_end]
        col_sum = np.sum(col_entries)
        norm = col_sum * (col_sum - 1)
        # `not norm > 0` also rejects NaN.
        if not norm > 0:
            raise ValueError(
                "column {} has total {}; n * (n - 1) must be positive".format(col_idx, col_sum)
            )

        row_indices = X.indices[col_start:col_end]

        # np.add.at accumulates correctly even if a row index is stored more than
        # once within the column; `diagonal[row_indices] += ...` would keep only
        # one of the duplicate contributions.
        np.add.at(diagonal, row_indices, col_entries / norm)
        # Scale by sqrt(norm) so that X @ X.T sums x x^T / norm over the columns.
        X.data[col_start:col_end] = col_entries / math.sqrt(norm)

    Q = (X @ X.T / n_observations).toarray()

    # Subtract the diagonal correction directly instead of building a dense
    # np.diag matrix; the result dtype matches what `Q - np.diag(diagonal)` gives.
    Q = Q.astype(np.result_type(Q.dtype, diagonal.dtype), copy=False)
    Q[np.diag_indices(n_features)] -= diagonal / n_observations

    return Q

In [108]:
# Run on the toy matrix with the default update_in_place=False, so `m` is not modified.
# NOTE(review): the variable is called Q_bar but holds the value calculate_Q
# documents as Q — confirm the intended naming.
Q_bar = calculate_Q(m)

In [109]:
# Display the resulting dense 3x3 array.
Q_bar

array([[ 0.01904761,  0.03174603,  0.14920634],
       [ 0.03174603,  0.03174603,  0.04761905],
       [ 0.14920634,  0.04761905,  0.49206347]])

In [110]:
# Show m as a dense array: it still holds the original values because
# calculate_Q was called without update_in_place and worked on a copy.
m.toarray()

array([[ 1.,  0.,  4.],
       [ 0.,  0.,  5.],
       [ 2.,  3.,  6.]], dtype=float32)