In [1]:
import theano
import theano.tensor as tt

import numpy as np
import pandas as pd
import scipy.sparse as sp

import pymc3 as pm

import sparsedot

In [2]:
import string

strings = list(string.ascii_lowercase) + list(string.ascii_uppercase)
len(strings)

52

In [3]:
class Matrix:
    def __init__(self, X):
        X_sp = sp.csr_matrix(X)
        self.offset = X_sp.indptr
        self.column = X_sp.indices
        self.value = X_sp.data
        self.length = X.shape[0]
    
    def dot(self, other):
        result = np.zeros(self.length, dtype=other.dtype)
        sparsedot.matrix_vector(self.offset, self.column, self.value, other, result)
        return result

np.int32

In [4]:
class MatrixDotVector(theano.graph.op.Op):
    itypes = [tt.wscalar, tt.ivector, tt.ivector, tt.ivector, tt.dvector]
    otypes = [tt.dvector]
           
    def perform(self, node, inputs, outputs):
        length, offset, column, value, other = inputs
        result = np.zeros(length, dtype=other.dtype)
        sparsedot.matrix_vector(offset, column, value, other, result)
        outputs[0][0] = result

matrix_dot_vector = MatrixDotVector()

In [5]:
data = pd.DataFrame(
    {
        "y": np.random.normal(size=10000),
        "x": np.random.choice(strings, size=10000)
    }
)

In [6]:
X = np.asarray(pd.get_dummies(data["x"]))
matrix = Matrix(X)

In [7]:
with pm.Model() as model:
    b = pm.Normal("b", shape=52)
    sigma = pm.Exponential("sigma", lam=1)
    mu = 0 
    mu = matrix_dot_vector(
        tt.as_tensor_variable(np.int16(matrix.length)), 
        tt.as_tensor_variable(np.int32(matrix.offset)), 
        tt.as_tensor_variable(np.int32(matrix.column)), 
        tt.as_tensor_variable(np.int32(matrix.value)), 
        b
    )
    pm.Normal("y", mu=mu, sigma=sigma, observed=data["y"])

In [8]:
with model:
    idata = pm.sample(return_inferencedata=True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS failed. Falling back to elementwise auto-assignment.
Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>Slice: [b]
>NUTS: [sigma]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 56 seconds.


In [9]:
with pm.Model() as model:
    b = pm.Normal("b", shape=52)
    sigma = pm.Exponential("sigma", lam=1)
    mu = pm.math.dot(X, b)
    pm.Normal("y", mu=mu, sigma=sigma, observed=data["y"])

In [10]:
with model:
    idata = pm.sample(return_inferencedata=True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, b]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 87 seconds.
