# **Fit GLM to IBL data**
---
We first fit a standard GLM to the dataset.

## **HPC setting**
Ashwood's original code is written as plain Python scripts. Here, we rewrite it as a Jupyter notebook to make it more user-friendly to run on an HPC cluster with `dask`. [This tutorial](https://github.com/pierreglaser/hpc-tutorial/tree/main) is a very useful resource for getting familiar with `dask`.

In [49]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Allocate the computing resources: describe the SLURM jobs that will host the
# dask workers, request workers, and connect a client to the cluster.
from dask_jobqueue import SLURMCluster
from distributed import Client
from joblib import Memory, Parallel, delayed, parallel_backend
from threadpoolctl import threadpool_limits

cluster = SLURMCluster(
    workers=0,      # intended: start with no workers; they are requested by cluster.scale() below
                    # NOTE(review): dask_jobqueue documents this option as `n_workers` -- confirm `workers` is accepted
    memory='32g',   # total amount of RAM per SLURM job
    processes=1,    # number of worker processes each job is split into
    cores=4,        # total number of cores per SLURM job (shared by its processes)
    worker_extra_args=["--resources GPU=2"], # extra arguments passed to the dask worker command
                                             # NOTE(review): `--resources` declares an abstract dask resource label; confirm it actually reserves GPUs from SLURM
    local_directory='/nfs/nhome/live/skuroda/jobs', # user-specific absolute path -- set your own (dask_jobqueue documents this as the worker's local/spill directory)
    log_directory='/nfs/nhome/live/skuroda/jobs' # user-specific absolute path -- set your own directory for the job logs
)   

# joblib on-disk cache location (user-specific absolute path)
memory = Memory('/nfs/nhome/live/skuroda/joblib-cache') # set your path

# Request 5 workers, then connect a client to the cluster.
cluster.scale(5)
client = Client(cluster)
# Last expression of the cell: displays the client summary.
client

  from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
  from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
  from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
  from distributed.utils import parse_bytes


0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://192.168.234.51:8787/status,

0,1
Dashboard: http://192.168.234.51:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://192.168.234.51:34421,Workers: 0
Dashboard: http://192.168.234.51:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## **Fit GLM to all animals**
---

In [54]:
# ------- load modules -------
import autograd.numpy as np
import autograd.numpy.random as npr
from glm_utils import load_session_fold_lookup, load_data, fit_glm, \
    plot_input_vectors, append_zeros
import os
from functools import partial

In [4]:
# ------- setup variables -------
# These names are read as globals by later cells (fit_GLM and the parallel run).
C = 2  # number of output types/categories
N_initializations = 10  # number of times fit_glm is run per fold inside fit_GLM
num_folds = 5  # number of folds the parallel run iterates over
# NOTE(review): this seeds autograd.numpy.random in the notebook process only;
# confirm whether the dask workers that run the fits are seeded as well.
npr.seed(65)  # set seed in case of randomization

In [9]:
# ------- setup path and load data -------
# Paths are relative to the notebook's working directory.
data_dir = '../../data/ibl/data_for_cluster/'
# Create directory for results. exist_ok=True makes the cell safe to re-run and
# avoids the race between checking for the directory and creating it.
results_dir = '../../results/ibl_global_fit_2/'
os.makedirs(results_dir, exist_ok=True)

# Load the concatenated data for all animals (inputs, choices, session ids) ...
animal_file = data_dir + 'all_animals_concat.npz'
inpt, y, session = load_data(animal_file)
# ... and the table that assigns each session to a fold.
session_fold_lookup_table = load_session_fold_lookup(
    data_dir + 'all_animals_concat_session_fold_lookup.npz')

In [10]:
os.getcwd()

'/nfs/nhome/live/skuroda/Workstation2023/glm-hmm/2_fit_models/fit_glm'

In [11]:
num_folds

5

In [12]:
def fit_GLM(inpt,y,session,session_fold_lookup_table,fold):
    """Fit the GLM on the training trials of one cross-validation fold.

    A trial is kept when its session is assigned to a fold other than `fold`
    in `session_fold_lookup_table` (so `fold` is the held-out fold) and its
    choice `y[trial, 0]` is not -1. `fit_glm` is then run `N_initializations`
    times on the kept trials. For every run, a weight plot and an ``.npz``
    file (log-likelihood and weights, stored positionally) are written under
    ``results_dir + "GLM/fold_<fold>/"``.

    Relies on the notebook-level globals `results_dir`, `N_initializations`
    and `C`.

    Parameters
    ----------
    inpt : array, indexed as ``inpt[trial, covariate]``
    y : array, indexed as ``y[trial, 0]``; cast to int before use
    session : sequence holding one session id per trial
    session_fold_lookup_table : 2-D array; column 0 holds session ids
        (compared against ``str(session id)``), column 1 holds the fold
    fold : the fold to hold out

    Returns
    -------
    list
        The training log-likelihood returned by `fit_glm` for each
        initialization, in run order. (Previously the list was built but
        discarded and the function returned None.)

    Raises
    ------
    AssertionError
        If the kept choices do not contain exactly 2 distinct values.
    """
    # Subset to relevant covariates for covar set of interest:
    labels_for_plot = ['stim', 'P_C', 'WSLS', 'bias']
    y = y.astype('int')
    figure_directory = results_dir + "GLM/fold_" + str(fold) + '/'
    # exist_ok=True: safe on re-runs and when several workers start at once.
    os.makedirs(figure_directory, exist_ok=True)

    # Subset to sessions of interest for fold
    sessions_to_keep = session_fold_lookup_table[np.where(
        session_fold_lookup_table[:, 1] != fold), 0]
    # A set gives constant-time membership tests instead of scanning the whole
    # array once per trial.
    keep_ids = set(np.ravel(sessions_to_keep).tolist())
    idx_this_fold = [
        str(sess) in keep_ids and y[trial_idx, 0] != -1
        for trial_idx, sess in enumerate(session)
    ]
    this_inpt, this_y = inpt[idx_this_fold, :], y[idx_this_fold, :]
    assert len(
        np.unique(this_y)
    ) == 2, "choice vector should only include 2 possible values"

    M = this_inpt.shape[1]  # number of input covariates
    loglikelihood_train_vector = []

    for init_idx in range(N_initializations):  # GLM fitting should be
        # independent of initialization, so fitting multiple
        # initializations is a good way to check that everything is
        # working correctly
        loglikelihood_train, recovered_weights = fit_glm([this_inpt],
                                                         [this_y], M, C)
        weights_for_plotting = append_zeros(recovered_weights)
        plot_input_vectors(weights_for_plotting,
                           figure_directory,
                           title="GLM fit; Final LL = " +
                           str(loglikelihood_train),
                           save_title='init' + str(init_idx),
                           labels_for_plot=labels_for_plot)
        loglikelihood_train_vector.append(loglikelihood_train)
        # Positional arrays: np.savez stores them as arr_0 and arr_1.
        np.savez(
            figure_directory + 'variables_of_interest_iter_' + str(init_idx) +
            '.npz', loglikelihood_train, recovered_weights)

    return loglikelihood_train_vector

# Bind the dataset so that each task only needs the fold number.
fit_GLM_eachfold = partial(fit_GLM, inpt, y, session, session_fold_lookup_table)
# NOTE(review): a cache hit skips the function body, so no plots or .npz files
# are written for that fold. Also confirm that the cache key accounts for the
# data bound in the partial; clear the joblib cache when the inputs change.
fit_GLM_eachfold_cached = memory.cache(fit_GLM_eachfold)

In [40]:
%%time

# Submit one cached fit_GLM task per fold through joblib's dask backend.
# The list returned by Parallel is not stored.
# NOTE(review): threadpool_limits is entered in this notebook process; confirm
# whether the BLAS thread limit also takes effect inside the dask worker processes.
with threadpool_limits(limits=1, user_api='blas'):
    with parallel_backend('dask'):
        Parallel(verbose=100)(delayed(fit_GLM_eachfold_cached)(fold) for fold in range(num_folds))

[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 20 concurrent workers.


AssertionError: 

In [16]:
C = 2
M= 3

In [17]:
input = npr.randn(1, C , M + 1)

In [20]:
input.shape[0]

1

In [22]:
input

array([[[-0.83366482, -0.54212088, -0.86017339, -0.50179652],
        [ 1.01336559,  0.74390291, -1.59309501,  0.08775184]]])

In [35]:
np.ones((1, C, input.shape[0]))

array([[[1.],
        [1.]]])

In [26]:
input.shape

(1, 2, 4)

In [37]:
input

array([[[-0.83366482, -0.54212088, -0.86017339, -0.50179652],
        [ 1.01336559,  0.74390291, -1.59309501,  0.08775184]]])

In [38]:
np.ones((1, C, input.shape[0]))

array([[[1.],
        [1.]]])

In [39]:
np.concatenate((input, np.ones((1, C, input.shape[0]))), axis=2)

array([[[-0.83366482, -0.54212088, -0.86017339, -0.50179652,
          1.        ],
        [ 1.01336559,  0.74390291, -1.59309501,  0.08775184,
          1.        ]]])

In [25]:
# NOTE(review): scratch cell left in a failing state. `np.appendk` is not a
# NumPy function name (presumably a typo for `np.append`), and the
# (input.shape[0], C) array of ones has fewer dimensions than the 3-D `input`.
# The np.concatenate(..., axis=2) cell is the variant that ran without error.
np.appendk(input, np.ones((input.shape[0], C)), axis=1)

TypeError: hstack() got an unexpected keyword argument 'axis'

In [41]:
fold = 0
iter = 0

In [42]:
# Flat, step-by-step copy of the first half of fit_GLM, run for the single
# `fold` set in the previous cell. It leaves this_inpt, this_y, M,
# figure_directory and labels_for_plot as notebook globals for the cells below.
labels_for_plot = ['stim', 'P_C', 'WSLS', 'bias']
# NOTE: rebinds the global `y` to its integer-cast version.
y = y.astype('int')
figure_directory = results_dir + "GLM/fold_" + str(fold) + '/'
if not os.path.exists(figure_directory):
    os.makedirs(figure_directory)

# Subset to sessions of interest for fold
# (sessions whose fold entry in column 1 differs from `fold`)
sessions_to_keep = session_fold_lookup_table[np.where(
    session_fold_lookup_table[:, 1] != fold), 0]
# One boolean per trial: session is kept and the choice y[id, 0] is not -1.
idx_this_fold = [
    str(sess) in sessions_to_keep and y[id, 0] != -1
    for id, sess in enumerate(session)
]
this_inpt, this_y, this_session = inpt[idx_this_fold, :], y[
    idx_this_fold, :], session[idx_this_fold]
assert len(
    np.unique(this_y)
) == 2, "choice vector should only include 2 possible values"
train_size = this_inpt.shape[0]  # number of kept trials

M = this_inpt.shape[1]  # number of input covariates
loglikelihood_train_vector = []


In [55]:
loglikelihood_train, recovered_weights = fit_glm([this_inpt],
                                                    [this_y], M, C)

Fitting with BFGS.
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
[[-0.49518822]
 [-0.71258392]
 [-0.02148294]
 ...
 [-0.276356  ]
 [-0.276356  ]
 [-0.06357796]]
-82277.49222868323
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
Autograd ArrayBox with value [[-0.49518822]
 [-0.71258392]
 [-0.02148294]
 ...
 [-0.276356  ]
 [-0.276356  ]
 [-0.06357796]]
Autograd ArrayBox with value -82277.49222868323
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
[[-0.16534663]
 [-0.26904464]
 [-0.00407925]
 ...
 [-0.76627257]
 [-0.76627257]
 [-0.01143076]]
-77355.73993121725
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
Autograd ArrayBox with value [[-0.16534663]
 [-0.26904464]
 [-0.00407925]
 ...
 [-0.76627257]
 [-0.76627257]
 [-0.01143076]]
Autograd ArrayBox with value -77355.73993121725
weights
(1, 2, 4)
(1,

Desired error not necessarily achieved due to precision loss.
  warn("{} failed with message:\n{}".format(method, result.message))
  result = getattr(asarray(obj), method)(*args, **kwds)


ValueError: axes don't match array

In [56]:
loglikelihood_train, recovered_weights = fit_glm([this_inpt],
                                                    [this_y], M, C)

Fitting with BFGS.
weights
(1, 1, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
[[-0.55347994]
 [-0.54979189]
 [-0.57981533]
 ...
 [-0.08150181]
 [-0.08150181]
 [-2.61191613]]
-129678.89694033861
weights
(1, 1, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
Autograd ArrayBox with value [[-0.55347994]
 [-0.54979189]
 [-0.57981533]
 ...
 [-0.08150181]
 [-0.08150181]
 [-2.61191613]]
Autograd ArrayBox with value -129678.89694033861
weights
(1, 1, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
[[-0.53761213]
 [-0.61716478]
 [-0.18208319]
 ...
 [-0.23889969]
 [-0.23889969]
 [-0.62430894]]
-89788.04634278134
weights
(1, 1, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
Autograd ArrayBox with value [[-0.53761213]
 [-0.61716478]
 [-0.18208319]
 ...
 [-0.23889969]
 [-0.23889969]
 [-0.62430894]]
Autograd ArrayBox with value -89788.04634278134
weights
(1, 1, 4)
(

ValueError: axes don't match array

In [60]:
AA = npr.randn(1, C, M + 1)

In [61]:
AA

array([[[-0.25609847, -0.17918829, -0.22901809,  0.17096207],
        [-0.96300882, -0.10848472, -2.46392604,  1.48829195]]])

In [62]:
AA.shape

(1, 2, 4)

In [69]:
B = np.array([-0.93754722, -0.21990237, -1.15707008, -0.16567326,  1.11724798,
        0.4216844 , -1.29619832, -0.24837141])

In [82]:
loglikelihood_train, recovered_weights = fit_glm([this_inpt],
                                                    [this_y], M, C)

Fitting with BFGS.
[[[-0.83366482 -0.54212088 -0.86017339 -0.50179652]
  [ 1.01336559  0.74390291 -1.59309501  0.08775184]]]
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
[[-0.49518822]
 [-0.71258392]
 [-0.02148294]
 ...
 [-0.276356  ]
 [-0.276356  ]
 [-0.06357796]]
-82277.49222868323
Autograd ArrayBox with value [[[-0.83366482 -0.54212088 -0.86017339 -0.50179652]
  [ 1.01336559  0.74390291 -1.59309501  0.08775184]]]
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
Autograd ArrayBox with value [[-0.49518822]
 [-0.71258392]
 [-0.02148294]
 ...
 [-0.276356  ]
 [-0.276356  ]
 [-0.06357796]]
Autograd ArrayBox with value -82277.49222868323
[[[-0.94258479 -0.30147545 -1.08278662  0.12327913]
  [ 1.12228555  0.50325748 -1.37048178 -0.53732381]]]
weights
(1, 2, 4)
(1, 2, 4)
(145491, 1, 2)
loglikelyfood
(145491, 1)
(145491, 1, 2)
(145491, 1)
[[-0.16534663]
 [-0.26904464]
 [-0.00407925]
 ...
 [-0.766

Desired error not necessarily achieved due to precision loss.
  warn("{} failed with message:\n{}".format(method, result.message))


In [83]:
recovered_weights

array([[[-0.93754722, -0.21990237, -1.15707008, -0.16567326],
        [ 1.11724798,  0.4216844 , -1.29619832, -0.24837141]]])

In [2]:
# NOTE(review): this rebinds `np` from autograd.numpy (imported near the top of
# the notebook) to plain NumPy, and the assignment to `y` below reuses the name
# that earlier cells use for the choice data returned by load_data.
import numpy as np

# Two independent standard-normal samples of length 100.
x = np.random.randn(100)
y = np.random.randn(100)

# 2x2 Pearson correlation matrix of x and y.
np.corrcoef(x, y)

array([[1.        , 0.02654473],
       [0.02654473, 1.        ]])

In [10]:
from scipy import stats
stats.spearmanr(x, y)

In [13]:
x = np.random.randn(100)
y = np.random.randn(100)

In [14]:
stats.spearmanr(x, y).correlation

0.16048004800480045

In [23]:
x= np.array([0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
y= np.array([0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0])
stats.spearmanr(x, y).correlation

-0.1111111111111111

In [18]:
x= np.array([0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0])
y= np.array([0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0])
stats.spearmanr(x, y).correlation

-0.28867513459481287

In [21]:
x= np.array([0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0])
y= np.array([0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0])
np.corrcoef(x, y)

array([[ 1.        , -0.21004201],
       [-0.21004201,  1.        ]])

In [38]:
x= np.array([1,0,0,1,1,1,0])
y= np.array([0,1,0,0,1,1,0])
stats.spearmanr(x, y).correlation

0.16666666666666666

In [39]:
# Agreement score: 1 - sum(|x - y|) / len(x).
# For 0/1 vectors |x - y| is 1 exactly where the entries differ, so this is the
# fraction of positions at which x and y agree.
row = 1- np.sum(np.absolute(x-y))/len(x)
row

0.5714285714285714

In [34]:
np.absolute(x-y)

array([1, 1, 0, 1, 0, 1, 0])

In [None]:
def binary_choice_(x,y):
    """Return the fraction of positions at which two binary vectors agree.

    For 0/1 vectors this equals ``1 - sum(|x - y|) / len(x)``, the agreement
    score computed by hand in the neighbouring cell. The original stub
    returned an undefined name and raised NameError on every call.

    Parameters
    ----------
    x, y : array_like
        Binary (0/1 or boolean) vectors of identical shape.

    Returns
    -------
    similarity : float
        1.0 when the vectors are identical, 0.0 when they differ everywhere.

    Raises
    ------
    ValueError
        If the inputs have different shapes or are empty.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    if x.size == 0:
        raise ValueError("x and y must not be empty")
    # Comparing with != (rather than subtracting) also works for boolean and
    # unsigned inputs, where x - y is undefined or would wrap around.
    similarity = 1 - np.mean(x != y)
    return similarity

In [25]:
x-y

array([ 1, -1,  1,  1,  1, -1, -1])

In [19]:
def jaccard_binary(x,y):
    """Jaccard similarity between two binary vectors.

    Computed as ``|x AND y| / |x OR y|``, where any non-zero entry counts as
    set.

    Parameters
    ----------
    x, y : array_like
        Binary vectors of compatible shape.

    Returns
    -------
    similarity : float
        Value in [0, 1]. Two vectors with no set entries at all are treated
        as identical and give 1.0 (the convention SciPy's Jaccard distance
        uses); the previous code divided 0 by 0 there and returned nan.
    """
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    union_count = union.sum()
    if union_count == 0:
        # Both vectors are all-zero: avoid the 0/0 division.
        return 1.0
    similarity = intersection.sum() / float(union_count)
    return similarity

In [22]:
jaccard_binary(x,y)

0.0

In [88]:
plot_input_vectors(recovered_weights,
                    figure_directory,
                    title="GLM fit; Final LL = " +
                    str(loglikelihood_train),
                    save_title='initnew' + str(iter),
                    labels_for_plot=labels_for_plot)

In [90]:
-0.21990237-0.4216844

-0.64158677

In [75]:
recovered_weights

array([[[-2.05479513, -0.64158676,  0.13912822,  0.08269815]]])

In [37]:
loglikelihood_train

-73031.97764662051

In [35]:
recovered_weights

array([[[-2.05479513, -0.64158676,  0.13912822,  0.08269815]]])

In [47]:
def one_hot(z, K):
    """Encode integer labels as one-hot vectors along a new trailing axis.

    Parameters
    ----------
    z : int or array_like of int
        Labels, each satisfying 0 <= label < K. A scalar is treated as a
        length-1 array.
    K : int
        Number of classes, i.e. the length of each one-hot vector.

    Returns
    -------
    Float array of shape ``z.shape + (K,)`` holding a single 1 per label.

    Raises
    ------
    AssertionError
        If any label lies outside ``[0, K)``.
    """
    labels = np.atleast_1d(z).astype(int)
    in_range = (labels >= 0) & (labels < K)
    assert np.all(in_range)
    # Row k of the K x K identity matrix is the one-hot code of label k;
    # indexing with the label array appends the class axis in one step.
    return np.eye(K)[labels]
    
def categorical_logpdf(data, logits, mask=None):
    """
    Compute the log probability density of a categorical distribution.
    This will broadcast as long as data and logits have the same
    (or at least compatible) leading dimensions.

    Parameters
    ----------
    data : array_like (..., D) int (0 <= data < C)
        The points at which to evaluate the log density

    logits : array_like (..., D, C)
        The (unnormalized) logits of the categorical distribution(s) with
        C classes; they are normalized with a log-sum-exp over the last axis

    mask : array_like (..., D) bool
        Optional mask indicating which entries in the data are observed.
        Defaults to all entries observed.

    Returns
    -------
    lps : array_like (...,)
        Log probabilities under the categorical distribution(s).

    Raises
    ------
    AssertionError
        If `data` is not of an accepted integer dtype, contains values
        outside [0, C), or the shapes of `data`, `logits` and `mask` are
        inconsistent.
    """
    # `logsumexp` is not imported anywhere else in this notebook, so the
    # original call raised NameError. Prefer autograd's wrapper (the notebook
    # already depends on autograd) and fall back to SciPy's implementation.
    try:
        from autograd.scipy.special import logsumexp
    except ImportError:
        from scipy.special import logsumexp

    D = data.shape[-1]
    C = logits.shape[-1]
    assert data.dtype in (int, np.int8, np.int16, np.int32, np.int64)
    assert np.all((data >= 0) & (data < C))
    assert logits.shape[-2] == D

    # Check mask
    mask = mask if mask is not None else np.ones_like(data, dtype=bool)
    assert mask.shape == data.shape

    # Normalize so that exp(logits) sums to 1 over the class axis.
    logits = logits - logsumexp(logits, axis=-1, keepdims=True)      # (..., D, C)
    x = one_hot(data, C)                                             # (..., D, C)
    # The one-hot product picks out the log-probability of the observed class.
    lls = np.sum(x * logits, axis=-1)                                # (..., D)
    return np.sum(lls * mask, axis=-1)                               # (...,)

In [50]:
one_hot([[1], [0], [2]], 3)  

array([[[0., 1., 0.]],

       [[1., 0., 0.]],

       [[0., 0., 1.]]])

In [9]:
# Once finished, shut down the client and the cluster.
# Close the client first, while the scheduler it is connected to is still
# running, then tear down the cluster (scheduler and SLURM worker jobs).
client.close()
cluster.close()

  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
