## M&M MUDEL

Laadime teegid ning vaatlused

In [None]:
import numpy as np
import pandas as pd

import pymc as pm
import arviz as az

import altair as alt

# Observed counts, one entry per observation: number of blue candies and the
# total number of candies (presumably one pack each -- confirm with the data source).
n_sinised = [28, 21, 19, 19, 15, 18, 19, 14]
n_total = [83, 78, 82, 79, 74, 78, 75, 79]

# Pool every observation into a single blue count and a single candy count.
sinised = np.sum(n_sinised)
total = np.sum(n_total)

# Expand the pooled counts into one 0/1 outcome per candy (1 = blue, blue
# candies first) so the vector can be used as Bernoulli observations.
n_not_blue = total - sinised
sample = np.repeat([1, 0], [sinised, n_not_blue])

sample

#### Prior predictive check

Simuleerime eeljaotuse eeldusel, et värvide jaotus on ühtlane (kuue värvi korral on sinise kommi tõenäosus 1/6), ning mudeldame sinist värvi kommi valimise tõenäosusjaotuse. Selleks tõmbame binoomjaotusest 1000 korda meie vaatluste arvuga võrdse suurusega sämpli ning arvutame värv=sinine proportsiooni igas simuleeritud sämplis.

In [None]:

# Prior predictive simulation: draw 1000 binomial samples of the observed size
# with success probability 1/6 and turn each simulated blue count into a
# proportion.
# A dedicated, seeded generator makes the figure reproducible under
# Restart Kernel -> Run All (the original used the unseeded global state).
rng = np.random.default_rng(42)
prior = rng.binomial(total, 1/6, 1000) / total

data = pd.DataFrame(prior, columns=['value'])

# Kernel density estimate of the simulated proportions, drawn as a
# semi-transparent area so another density can be layered on top of it.
prior_plot = alt.Chart(data).transform_density(
    'value',
    as_=['value', 'density'],
).mark_area(opacity=0.5).encode(
    x="value:Q",
    y='density:Q',
)

prior_plot

Kirjeldame mudeli, mis sämplib Bernoulli jaotust meie vaatlusandmete põhjal. Sinise värvi esinemissagedusele määrame *flat priori* (`pm.Uniform()`), lubades sellel varieeruda vahemikus nullist üheni. Sellega väljendame, et meil ei ole mingeid eeldusi värvide jaotuse osas - peame eeldusena ühtviisi tõenäoliseks seda, et pakis ei ole ühtki sinist kommi, et kõik kommid pakis on sinised või et neid on seal 1/6 kõigist kommidest.

pm.Uniform() on "blondi prior", mis annab mudelile maksimaalse paindlikkuse. Reeglina on meil võimalik valida oma eelteadmiste põhjal paremaid prioreid (antud juhul oleks selleks tõenäosuse väljendamiseks beta-jaotus, või siniste kommide arvu väljendamiseks Poissoni jaotus), aga nii lihtsa mudeli puhul ei ole sellel praktilist mõju. Samuti ei püüa me siin lihtsuse nimel modelleerida erinevate värvide üheaegseid jaotusi, vaid ainult tõenäosust, et pakist juhuslikult valitud komm on sinine.

In [None]:

# Bernoulli model for the probability that a single candy is blue.
mm_model = pm.Model()

with mm_model:
    # Flat prior: p may take any value between 0 and 1.
    p = pm.Uniform("p", lower=0, upper=1)
    # Likelihood: the 0/1 vector `sample` is passed as the observed data.
    y = pm.Bernoulli("y", p=p, observed=sample)

# Show the model graph as the cell output.
pm.model_to_graphviz(mm_model)
    


Mudel sämplib meie poolt antud priori ning vaatluste alusel Bernoulli jaotusest neljas eraldi ahelas 1000 väärtust kalibreerimiseks ning seejärel 2000 väärtust posteriori.

In [None]:
with mm_model:
    # The accompanying text describes four chains, so request them explicitly:
    # the default number of chains depends on how many CPU cores are available.
    # A fixed seed makes the draws reproducible across kernel restarts.
    trace = pm.sample(
        draws=2000,
        tune=1000,
        chains=4,
        target_accept=0.9,
        random_seed=42,
    )

# Trace plot for convergence inspection; ArviZ is called directly, as in the
# later cells. The trailing semicolon hides the axes-array repr.
az.plot_trace(trace);

Kontrollime diagnostikat ning joonistame järeljaotuse (*posterior distribution*) vastavalt vaatlustele.

In [None]:
# Local import so this cell works on its own; `display` renders the summary
# DataFrame as a rich table instead of the plain-text dump produced by print().
from IPython.display import display

display(az.summary(trace))

# Posterior density of the model parameters with a reference value marked.
# NOTE(review): 0.22 is presumably an expected share of blue candies, while the
# simulation further down uses 0.24 -- confirm which value is intended.
az.plot_posterior(trace, ref_val=0.22);

Joonistame eel- ja järeljaotuste võrdluse.

In [None]:

# Posterior draws of p taken from the first chain only (index 0).
first_chain_p = trace.posterior.p[0]
post_frame = pd.DataFrame(first_chain_p, columns=['value'])

# Density of the posterior draws, styled as a pink semi-transparent area.
post_plot = (
    alt.Chart(post_frame, width=400)
    .transform_density('value', as_=['value', 'density'])
    .mark_area(opacity=0.5, color='pink')
    .encode(x="value:Q", y='density:Q')
)

# Layer the posterior density with the previously built prior density.
post_plot + prior_plot

In [None]:
import scipy.stats as stats
from matplotlib import pyplot as plt 

def posterior_grid(grid=10, a=2, b=2, heads=5, trials=20):
    """Plot prior, likelihood and posterior on an evenly spaced grid over [0, 1].

    Parameters
    ----------
    grid : int
        Number of grid points between 0 and 1 (inclusive).
    a, b : float
        Shape parameters of the Beta prior.
    heads : int
        Number of successes passed to the binomial likelihood.
    trials : int
        Number of trials passed to the binomial likelihood.

    The posterior is the pointwise product of likelihood and prior, rescaled
    to sum to one over the grid. The function draws three side-by-side panels
    and returns nothing.
    """
    points = np.linspace(0, 1, grid)
    prior = stats.beta(a, b).pdf(points)
    likelihood = stats.binom.pmf(heads, trials, points)
    unnormalised = likelihood * prior
    posterior = unnormalised / unnormalised.sum()

    _, axes = plt.subplots(1, 3, sharex=True, figsize=(16, 4))
    axes[0].set_title(f"sinine = {heads}\ntrials = {trials}")

    # One panel per curve, in the order prior -> likelihood -> posterior.
    curves = {"prior": prior, "likelihood": likelihood, "posterior": posterior}
    for panel, (label, values) in zip(axes, curves.items()):
        panel.set_yticks([])
        panel.plot(points, values, "o-", label=label)
        panel.legend(fontsize=14)

# The default 10-point grid has a spacing of about 0.11, which is far wider
# than the likelihood for the pooled sample of several hundred candies, so
# practically all posterior mass would land on one grid point. A finer grid
# resolves the actual shape of the likelihood and posterior.
posterior_grid(grid=200, heads=sinised, trials=total)

In [None]:
%matplotlib inline
from IPython.core.pylabtools import figsize

figsize(11, 7)

# Probability used to simulate the draws; also marked in every panel below.
TRUE_P_BLUE = 0.24

dist = stats.beta
n_trials = [0, 1, 2, 3, 4, 5, 20, 50, 500]
# Fixed random_state so the simulated candies (and the figure) are reproducible.
data = stats.bernoulli.rvs(TRUE_P_BLUE, size=n_trials[-1], random_state=42)
x = np.linspace(0, 1, 100)

# Lay the panels out on a grid with exactly as many rows as needed
# (ceiling division), so no empty row of subplots is reserved.
n_cols = 3
n_rows = (len(n_trials) + n_cols - 1) // n_cols

for k, N in enumerate(n_trials):
    sx = plt.subplot(n_rows, n_cols, k + 1)
    # Only the first and the last panel carry an x-axis label.
    if k in [0, len(n_trials) - 1]:
        plt.xlabel("$p$, tõenäosus, et komm on sinine")
    plt.setp(sx.get_yticklabels(), visible=False)
    # Number of blue candies among the first N simulated draws.
    heads = data[:N].sum()
    # Beta(1 + successes, 1 + failures) density after N draws.
    y = dist.pdf(x, 1 + heads, 1 + N - heads)
    plt.plot(x, y, label="võetud %d kommi,\n %d sinised" % (N, heads))
    plt.fill_between(x, 0, y, color="#348ABD", alpha=0.4)
    # Dashed reference line at the probability the data were simulated with
    # (the original drew it at 0.5, which does not match the simulation).
    plt.vlines(TRUE_P_BLUE, 0, 4, color="k", linestyles="--", lw=1)

    leg = plt.legend()
    leg.get_frame().set_alpha(0.4)
    plt.autoscale(tight=True)


plt.suptitle("Bayesian järeltõenäosuste uuendamine",
             y=1.02,
             fontsize=14)

plt.tight_layout()

In [None]:
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az

# Example data: Replace this with your actual dataset.
# One column per colour, one row per bag (see `num_bags, num_colors` below).
data = {
    "Red": [12, 15, 13, 10, 14],
    "Blue": [10, 9, 11, 12, 8],
    "Green": [8, 11, 10, 13, 10],
    "Yellow": [6, 7, 5, 8, 9],
    "Orange": [4, 3, 6, 4, 5]
}
df = pd.DataFrame(data)

# Prepare data for modeling
data_counts = df.values
num_bags, num_colors = data_counts.shape
color_names = df.columns.tolist()

# Intercept-only multinomial model with a softmax link (there are no
# predictors, so this is not a regression in the usual sense).
with pm.Model(coords={"colors": color_names}) as model:
    # One unconstrained intercept per colour.
    # NOTE(review): softmax is unchanged when the same constant is added to all
    # intercepts, so their overall level is determined only by the prior;
    # consider a sum-to-zero or reference-category parameterisation.
    intercept = pm.Normal("intercept", mu=0, sigma=2, dims="colors")

    # Softmax maps the intercepts to probabilities that sum to one.
    p = pm.Deterministic("p", pm.math.softmax(intercept), dims="colors")

    # Multinomial likelihood: each row of data_counts is one bag.
    likelihood = pm.Multinomial(
        "likelihood",
        n=data_counts.sum(axis=1),  # Total counts per bag
        p=p,
        observed=data_counts
    )

    # Sampling with a fixed seed so results are reproducible; InferenceData is
    # already the default return type, so it is not requested explicitly.
    trace = pm.sample(2000, tune=1000, target_accept=0.9, random_seed=42)

# Trace plot for visual convergence inspection (semicolon hides the axes repr).
az.plot_trace(trace);


In [None]:
# Summary table for every variable in the most recently sampled `trace`.
trace_summary = az.summary(trace)
trace_summary

In [None]:
# Empirical colour proportions: mean count per colour divided by the mean
# number of candies per bag.
mean_count_per_color = df.mean()
mean_bag_size = df.sum(axis=1).mean()
mean_count_per_color / mean_bag_size

In [None]:

# Dirichlet-multinomial model of the colour proportions (no predictors).
with pm.Model(coords={"colors": color_names}) as model:
    # Concentration parameters of the Dirichlet prior, one per colour. They use
    # the same "colors" dimension as `p`, so both are labelled by colour name.
    alpha = pm.HalfNormal("alpha", sigma=2, dims="colors")
    p = pm.Dirichlet("p", a=alpha, dims="colors")

    # Multinomial likelihood: each row of data_counts is one bag.
    likelihood = pm.Multinomial(
        "likelihood",
        n=data_counts.sum(axis=1),  # Total counts per bag
        p=p,
        observed=data_counts
    )

    # Sampling with a fixed seed so results are reproducible; InferenceData is
    # already the default return type, so it is not requested explicitly.
    trace = pm.sample(2000, tune=1000, target_accept=0.9, random_seed=42)

# Rank plots per chain instead of raw traces (semicolon hides the axes repr).
az.plot_trace(trace, kind='rank_bars');



In [None]:
# Summary table restricted to the colour probabilities `p`.
p_summary = az.summary(trace, var_names=["p"])
p_summary
