In [54]:
from pathlib import Path

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import pymc_bart as pmb

import cloudpickle as cpkl

print(f"Running on PyMC v{pm.__version__}")

print(f"Running on PyMC-BART v{pmb.__version__}")

Running on PyMC v5.9.0
Running on PyMC-BART v0.5.1


In [4]:

try:
    bikes = pd.read_csv(Path("..", "data", "bikes.csv"))
except FileNotFoundError:
    bikes = pd.read_csv(pm.get_data("bikes.csv"))

features = ["hour", "temperature", "humidity", "workingday"]

X = bikes[features]
Y = bikes["count"]

xt = X[0:10]
yt = Y[0:10]

In [81]:
with pm.Model() as model_bikes:
    xdata = pm.MutableData("xdata", X)
    a = pm.Exponential("a", 1)
    mu_ = pmb.BART("mu_", xdata, np.log(Y), m=20)
    mu = pm.Deterministic("mu", pm.math.exp(mu_))
    y = pm.NegativeBinomial("y", mu=mu, alpha=a, observed=Y, shape=xdata.shape[0])
    idata_bikes = pm.sample(random_seed=99, draws=100, tune=100, compute_convergence_checks=False)
idata_bikes

Only 100 samples in chain.
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>NUTS: [a]
>PGBART: [mu_]


Sampling 4 chains for 100 tune and 100 draw iterations (400 + 400 draws total) took 3 seconds.


In [82]:
# idata_bikes.to_netcdf("test3.nc")
# idata2 = az.from_netcdf("test3.nc")

# pickle
with open('test4.pkl', mode='wb') as file:
   cpkl.dump(idata_bikes, file)

with open("test4.pkl", mode="rb") as file:
    idata4 = cpkl.load(file)


In [88]:
with model_bikes:
    pm.set_data({"xdata": xt})
    post1 = pm.sample_posterior_predictive(idata_bikes, var_names=["mu", "y"])

with model_bikes:
    pm.set_data({"xdata": xt})
    post2 = pm.sample_posterior_predictive(idata4, var_names=["mu", "y"])

# WORKS WITH OLD MODEL
#s

Sampling: [mu_, y]


Sampling: [mu_, y]


In [89]:
print(post1.posterior_predictive["mu"].values.mean((0,1)))
print(post2.posterior_predictive["mu"].values.mean((0,1)))
# post2

[29.89727173 49.11120318 50.47067189 46.83670032 42.78398617 38.61454919
 35.57313739 29.31785712 29.31785712 43.2423914 ]
[30.85734391 47.89703106 51.42968096 47.04174149 42.81633842 38.50513225
 35.5118569  30.09280733 30.09280733 44.21621381]


In [109]:
# TRY with new model with the train data
with pm.Model() as model2:
    xdata2 = pm.MutableData("xdata", X)
    a2 = pm.Exponential("a", 1)
    mu_2 = pmb.BART("mu_", xdata2, np.log(Y), m=50)
    mu2 = pm.Deterministic("mu", pm.math.exp(mu_2))
    y2 = pm.NegativeBinomial("y", mu=mu2, alpha=a2, observed=Y, shape=xdata2.shape[0])

In [110]:
# try posterior sample with the model and the old and save idata
# works but predicts on the full size dataset
with model2:
    post3 = pm.sample_posterior_predictive(idata_bikes, var_names=["mu", "y"], )

with model2:
    # pm.set_data({"xdata": xt})
    post5 = pm.sample_posterior_predictive(idata4, var_names=["mu", "y"], )


Sampling: [y]


Sampling: [y]


In [114]:
print(post1.posterior_predictive["mu"].values.mean((0,1)))
print(post2.posterior_predictive["mu"].values.mean((0,1)))
print(post3.posterior_predictive["mu"].values.mean((0,1))[0:10])
print(post5.posterior_predictive["mu"].values.mean((0,1))[0:10])

# predictions are fairly similar to from the original model with the og idata and saved idata and the new model with original and saved idata

[29.89727173 49.11120318 50.47067189 46.83670032 42.78398617 38.61454919
 35.57313739 29.31785712 29.31785712 43.2423914 ]
[30.85734391 47.89703106 51.42968096 47.04174149 42.81633842 38.50513225
 35.5118569  30.09280733 30.09280733 44.21621381]
[31.03705431 48.69262632 51.54160761 47.57670433 43.29908304 38.78436537
 36.18903356 30.27229723 30.27229723 44.84348581]
[31.03705431 48.69262632 51.54160761 47.57670433 43.29908304 38.78436537
 36.18903356 30.27229723 30.27229723 44.84348581]


In [121]:
# Throws a shape error
with model2:
    pm.set_data({"xdata": xt})
    post4 = pm.sample_posterior_predictive(idata_bikes, var_names=["mu", "y"])

with model2:
    pm.set_data({"xdata": xt})
    post6 = pm.sample_posterior_predictive(idata4, var_names=["mu", "y"])

# The set data doesn't wan't to accept the new shape, possibly related to the shape of y not being correct anymore

Sampling: [mu_, y]


ValueError: size does not match the broadcast shape of the parameters. (10,), (10,), (348,)
Apply node that caused the error: nbinom_rv{0, (0, 0), int64, True}(RandomGeneratorSharedVariable(<Generator(PCG64) at 0x7FF34DED1540>), MakeVector{dtype='int64'}.0, 4, a, Composite{...}.1)
Toposort index: 5
Inputs types: [RandomGeneratorType, TensorType(int64, shape=(1,)), TensorType(int64, shape=()), TensorType(float64, shape=()), TensorType(float64, shape=(None,))]
Inputs shapes: ['No shapes', (1,), (), (), (348,)]
Inputs strides: ['No strides', (8,), (), (), (8,)]
Inputs values: [Generator(PCG64) at 0x7FF34DED1540, array([10]), array(4), array(1.77183826), 'not shown']
Outputs clients: [['output'], ['output']]

HINT: Re-running with most PyTensor optimizations disabled could provide a back-trace showing when this node was created. This can be done by setting the PyTensor flag 'optimizer=fast_compile'. If that does not work, PyTensor optimizations can be disabled with 'optimizer=None'.
HINT: Use the PyTensor flag `exception_verbosity=high` for a debug print-out and storage map footprint of this Apply node.

In [116]:
# Create model3 with the test data so the model is created with the correct shapes
with pm.Model() as model3:
    xdata2 = pm.MutableData("xdata", xt)
    a2 = pm.Exponential("a", 1)
    mu_2 = pmb.BART("mu_", xdata2, np.log(yt), m=50)
    mu2 = pm.Deterministic("mu", pm.math.exp(mu_2))
    y2 = pm.NegativeBinomial("y", mu=mu2, alpha=a2, observed=yt, shape=xdata2.shape[0])

In [122]:
with model3:
    # pm.set_data({"xdata": xt})
    post6 = pm.sample_posterior_predictive(idata4, var_names=["mu", "y"], )


with model3:
    # pm.set_data({"xdata": xt})
    post4 = pm.sample_posterior_predictive(idata_bikes, var_names=["mu", "y"], )

# this runs

Sampling: [mu_, y]


Sampling: [mu_, y]


In [126]:
# however the new outputs aren't correct. They don't seam to even making predicitons
# possibly assuming that the mu values are the baseline value of the bart variable with the data
print(post4.posterior_predictive["mu"].values.mean((0,1)))
print(post6.posterior_predictive["mu"].values.mean((0,1)))
print(post4.posterior_predictive["y"].values.mean((0,1)))
print(post6.posterior_predictive["y"].values.mean((0,1)))

[31.40818843 31.40818843 31.40818843 31.40818843 31.40818843 31.40818843
 31.40818843 31.40818843 31.40818843 31.40818843]
[31.40818843 31.40818843 31.40818843 31.40818843 31.40818843 31.40818843
 31.40818843 31.40818843 31.40818843 31.40818843]
[30.605  31.8    33.1575 31.2375 31.6    32.8975 32.6325 31.2025 33.125
 31.84  ]
[31.265  31.0225 31.645  31.155  30.9425 30.0925 30.33   30.925  31.9375
 33.255 ]


# Try Saving the whole model

In [138]:
with open("test_bikes.pkl", mode="wb") as file:
    cpkl.dump(model_bikes, file)
with open("test_bikes.pkl", mode='rb') as file:
    model_3 = cpkl.load(file)

FileNotFoundError: [Errno 2] No such file or directory

In [134]:
with model_3:
    post7 = pm.sample_posterior_predictive(idata4, var_names=["mu", "y"], )


Sampling: [mu_, y]


FileNotFoundError: [Errno 2] No such file or directory
Apply node that caused the error: BART_rv{1, (2, 1, 0, 0, 0, 1), floatX, True}(RandomGeneratorSharedVariable(<Generator(PCG64) at 0x7FF34E558900>), [], 11, xdata, [2.7725887 ... .49650756], 20, 0.95, 2.0, [])
Toposort index: 1
Inputs types: [RandomGeneratorType, TensorType(int64, shape=(0,)), TensorType(int64, shape=()), TensorType(float64, shape=(None, None)), TensorType(float64, shape=(348,)), TensorType(int8, shape=()), TensorType(float64, shape=()), TensorType(float32, shape=()), TensorType(float64, shape=(0,))]
Inputs shapes: ['No shapes', (0,), (), (10, 4), (348,), (), (), (), (0,)]
Inputs strides: ['No strides', (8,), (), (8, 80), (8,), (), (), (), (8,)]
Inputs values: [Generator(PCG64) at 0x7FF34E558900, array([], dtype=int64), array(11), 'not shown', 'not shown', array(20, dtype=int8), array(0.95), array(2., dtype=float32), array([], dtype=float64)]
Outputs clients: [['output'], [Composite{...}(mu_, ExpandDims{axis=0}.0)]]

HINT: Re-running with most PyTensor optimizations disabled could provide a back-trace showing when this node was created. This can be done by setting the PyTensor flag 'optimizer=fast_compile'. If that does not work, PyTensor optimizations can be disabled with 'optimizer=None'.
HINT: Use the PyTensor flag `exception_verbosity=high` for a debug print-out and storage map footprint of this Apply node.