In [None]:
import pandas as pd
import numpy as np
import altair as alt

# Use the VegaFusion data transformer so that Altair does not complain about
# the dataset being too large.
alt.data_transformers.enable("vegafusion")

# Read and format the data
hw = pd.read_csv('./data/height_weight_gender.csv')

# Lowercase the column names first so every later step can address the
# columns by one consistent spelling.
hw.columns = hw.columns.str.lower()

# Rescale height by a factor of 100
# (presumably metres -> centimetres; confirm against the CSV).
hw['height'] *= 100

# Decode the gender flag. The mapping is applied to the gender column only:
# a frame-wide replace would also rewrite any measurement that happens to equal
# exactly 0 or 1 into a string and silently turn that column into object dtype.
hw['gender'] = hw['gender'].replace({0: 'Male', 1: 'Female'})

print(hw.head())
print(hw.shape)


Let's zoom into height and weight. 

They are both normally distributed, so let's use Gaussians as our priors -- and let's keep things simple and get the parameters from the data (and yes, we did say in the seminar that this is what you should not do in general). But the next step is also silly anyway :)

We then first sample from both distributions *independently*, combine results into a dataframe and visualise it as scatterplot.

In [None]:
# Summarise each variable by the mean and standard deviation observed in the data
height_col, weight_col = hw['height'], hw['weight']
mean_h = height_col.mean()
sigma_h = height_col.std()
mean_w = weight_col.mean()
sigma_w = weight_col.std()

# number of draws
n = 5000

# Draw heights, then weights, each from its own Gaussian; the two draws are
# made separately, so the simulated columns carry no relationship.
h_values = np.random.normal(loc=mean_h, scale=sigma_h, size=n)
w_values = np.random.normal(loc=mean_w, scale=sigma_w, size=n)

# Collect the simulated draws in one dataframe
df_sim = pd.DataFrame({'height': h_values, 'weight': w_values})

# Scatter plot of the simulated pairs on fixed axis ranges
sim_x = alt.X('weight').scale(domain=[20, 130])
sim_y = alt.Y('height').scale(domain=[130, 210])
sim_marks = alt.Chart(df_sim).mark_circle(size=7, opacity=0.5)

sim_marks.encode(x=sim_x, y=sim_y).properties(height=400, width=400)

So this is what the data would look like if height and weight were not related. But of course we know that they in fact are. If we look at the actual data we see that it looks like we would expect it to -- there is a strong relationship, and in order to visualise and quantify it we can fit a regression, like this:

In [None]:
# Shared 400x400 chart bound to the observed data; later plots derive from it
base = alt.Chart(hw).properties(width=400, height=400)

# Scatter plot of the observed weights (x) against heights (y)
scatter_x = alt.X('weight:Q').scale(domain=[25, 125])
scatter_y = alt.Y('height:Q').scale(domain=[120, 220])
points = (
    base.mark_circle(size=5, opacity=0.3)
    .encode(scatter_x, scatter_y, tooltip=['height', 'weight'])
    .properties(height=400, width=400)
)

# Regression of height on weight, drawn as an orange line over the scatter
regression_line = points.transform_regression('weight', 'height').mark_line(color='orange', size=1.5)

points + regression_line

So let's take a step back and first isolate height and plot the actual distribution:

In [None]:

# Histogram of heights in bins of width 5; titles and the count axis are hidden
height_bins_x = alt.X('height').bin(step=5).scale(domain=[120, 220]).title(None)
height_count_y = alt.Y('count()').stack(None).title(None).axis(None)

p = (
    base.mark_bar(opacity=0.75, thickness=100)
    .encode(height_bins_x, height_count_y)
    .properties(height=300, width=400)
)

p.show()

# Report the sample standard deviation and mean of height
height_std, height_mean = hw.height.std(), hw.height.mean()
print(f'std: {height_std:.2f}, mean: {height_mean:.2f}')

As expected, the height distribution is normal. So we can create a Gaussian PDF function that will return a dataframe 
of values for a given mean and standard deviation in order to generate synthetic data.

In [None]:

def gaussian_pdf(mean, sigma, n_points=500, n_sigmas=4):
    """Tabulate a Gaussian probability density on an evenly spaced grid.

    Parameters
    ----------
    mean : float
        Centre of the distribution.
    sigma : float
        Standard deviation; must be strictly positive.
    n_points : int, optional
        Number of grid points to evaluate (default 500).
    n_sigmas : float, optional
        Half-width of the grid in standard deviations, i.e. the grid spans
        ``mean - n_sigmas*sigma`` to ``mean + n_sigmas*sigma`` (default 4).

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` (grid values) and ``y`` (density at each grid value).

    Raises
    ------
    ValueError
        If ``sigma`` is not strictly positive (this also rejects NaN).
    """
    # A zero sigma divides by zero and a negative one yields negative
    # "densities"; fail loudly instead of returning a misleading table.
    if not sigma > 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")

    # Create a range of x values centred on the mean
    x_values = np.linspace(mean - n_sigmas*sigma, mean + n_sigmas*sigma, n_points)

    # Calculate the corresponding y values for the Gaussian PDF
    y_values = (1/(sigma * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((x_values - mean)/sigma)**2)

    # Return a dataframe of PDF values
    return pd.DataFrame({
        'x': x_values,
        'y': y_values
    })

# Take the Gaussian parameters from the observed heights instead of typing in
# rounded copies, so the curve stays in sync with the data if the CSV changes.
mean = hw.height.mean()
sigma = hw.height.std()

# Gaussian density curve for the height parameters; the y-domain is fixed so
# the curve lines up with the histogram when the two are layered.
chart = alt.Chart(gaussian_pdf(mean, sigma)).mark_line(color='red').encode(
    x=alt.X('x', title='x'),
    y=alt.Y('y', title='Probability Density')
    .scale(domain=[0, 0.04])
).properties(
    # Parameters are rounded to one decimal for the title only.
    title=f"Gaussian PDF (μ={mean:.1f}, σ={sigma:.1f})",
    width=400
)

# Check the fit: layer the curve over the histogram `p`, each on its own y-scale
(p + chart).resolve_scale(y='independent')


It's a pretty decent fit, so let's go on and plot the actual data along with the marginal distributions of both height and weight.


In [None]:

# Top marginal: weight counts, drawn without axes
marginal_weight_x = (
    alt.X('weight')
    .bin(step=5)  # step keeps bin size the same
    .scale(domain=[25, 125])
    .axis(None)
)
marginal_count_y = alt.Y('count()').stack(None).axis(None)
hist_x = (
    base.mark_bar(opacity=0.75, thickness=100, orient='horizontal')
    .encode(marginal_weight_x, marginal_count_y)
    .properties(height=70, width=400)
)

# Right marginal: height counts, drawn without axes
marginal_height_y = alt.Y('height').bin(step=5).scale(domain=[120, 220]).axis(None)
marginal_count_x = alt.X('count()').stack(None).axis(None)
hist_y = (
    base.mark_bar(opacity=0.75, thickness=100)
    .encode(marginal_height_y, marginal_count_x)
    .properties(width=70, height=400)
)

# Scatter plus regression line in the centre, weight histogram stacked on top,
# height histogram placed to the right of the scatter.
regression_line = points.transform_regression('weight', 'height').mark_line(color='orange', size=1.5)
scatter_with_fit = points + regression_line
joint_plot = hist_x & (scatter_with_fit | hist_y)

joint_plot.configure_axis(title=None).configure_view(strokeWidth=0)

The weight distribution looks suspicious, let's isolate and plot it separately.

In [None]:

# Histogram of weights in bins of width 5; the count axis is hidden
weight_bins_x = alt.X('weight').bin(step=5).scale(domain=[25, 125])
weight_count_y = alt.Y('count()').stack(None).axis(None)

weight_bars = base.mark_bar(opacity=0.75, thickness=100, orient='horizontal')
p = weight_bars.encode(weight_bins_x, weight_count_y)

p.show()

# Report the sample standard deviation and mean of weight
weight_std, weight_mean = hw.weight.std(), hw.weight.mean()
print(f'std: {weight_std:.2f}, mean: {weight_mean:.2f}')

This is clearly not normal, but let's try to fit a Gaussian distribution to it.


In [None]:

# Parameters of a single Gaussian taken from the pooled weights
mean, sigma = hw.weight.mean(), hw.weight.std()

pdf_x = alt.X('x', title='weight')
pdf_y = alt.Y('y', title='Probability Density')
pdf_title = f"Gaussian PDF (μ={mean:.2f}, σ={sigma:.2f})"

chart = (
    alt.Chart(gaussian_pdf(mean, sigma))
    .mark_line(color='red')
    .encode(x=pdf_x, y=pdf_y)
    .properties(title=pdf_title)
)

# Layer the curve over the weight histogram, each on its own y-scale
overlay = p + chart
overlay.resolve_scale(y='independent')

While not atrocious, the fit is not very good -- and of course in this particular case we can tell very easily that the actual distribution of values is bimodal, and we can also easily tell what the issue is. So let's stratify the data by gender:

In [None]:

# Per-gender mean and standard deviation of weight, separated by a blank line
weight_by_gender = hw.groupby('gender').weight
print(weight_by_gender.mean(), '\n')
print(weight_by_gender.std())


... and fit a separate Gaussian distribution to each gender.


In [None]:

# Per-gender weight parameters, looked up by label so the assignment does not
# depend on the order in which groupby returns the groups.
weight_stats = hw.groupby('gender').weight.agg(['mean', 'std'])
mean_f, sigma_f = weight_stats.loc['Female', 'mean'], weight_stats.loc['Female', 'std']
mean_m, sigma_m = weight_stats.loc['Male', 'mean'], weight_stats.loc['Male', 'std']

# The layers below are resolved onto independent y-scales, so both density
# curves get the same explicit domain; otherwise the visible 'stratified pdf'
# axis would describe only one of the two curves.
pdf_domain = [0, 0.05]

chart_f = alt.Chart(gaussian_pdf(mean_f, sigma_f)).mark_line(color='red').encode(
    x=alt.X('x').title('weight'),
    y=alt.Y('y').scale(domain=pdf_domain).title('stratified pdf')
)

chart_m = alt.Chart(gaussian_pdf(mean_m, sigma_m)).mark_line(color='lightblue').encode(
    x=alt.X('x'),
    # Axis hidden: it would duplicate the one drawn for the female curve.
    y=alt.Y('y').scale(domain=pdf_domain).axis(None)
)

# Overlay both curves on the weight histogram `p`
(p + chart_f + chart_m).resolve_scale(y='independent')

So there we are, that did it. In order to fit a single distribution we next create a mixture of two Gaussians.


In [None]:

from scipy.stats import norm

# Mixing proportions: the share of each gender in the sample
weights = hw.gender.value_counts(normalize=True)
w1 = weights.Female
w2 = weights.Male

# Evaluation grid: 500 x values between 30 and 120
x = np.linspace(30, 120, 500)

# Component densities (female, male) and their weighted sum
pdf1 = norm.pdf(x, loc=mean_f, scale=sigma_f)
pdf2 = norm.pdf(x, loc=mean_m, scale=sigma_m)
pdf_mixture = w1 * pdf1 + w2 * pdf2

# Long-format table: one block of rows per curve, labelled for the legend
labelled_curves = [
    (f'Gaussian: μ={mean_f:.1f}, σ={sigma_f:.1f}', pdf1),
    (f'Gaussian: μ={mean_m:.1f}, σ={sigma_m:.1f}', pdf2),
    ('Mixture', pdf_mixture),
]
gmm_df = pd.concat(
    [
        pd.DataFrame({'weight': x, 'pdf': density, 'distribution': label})
        for label, density in labelled_curves
    ],
    ignore_index=True,
)

# Plot both gaussians and mixture, one coloured line per curve
gmm = (
    alt.Chart(gmm_df)
    .mark_line()
    .encode(x='weight', y='pdf', color='distribution:N')
    .properties(title="Mixture of Two Gaussians", width=400, height=400)
)

gmm.show()


... and next fit it to the data.

In [None]:

# Keep only the rows of the mixture curve
mixture_only = gmm_df[gmm_df.distribution == 'Mixture']

mixture_x = alt.X('weight:Q').title('weight')
mixture_y = alt.Y('pdf:Q').scale(domain=[0, 0.025])

gmm = (
    alt.Chart(mixture_only, title='Fit with Gaussian mixture')
    .mark_line(color='red')
    .encode(x=mixture_x, y=mixture_y)
)

# Layer the mixture over the weight histogram `p`, each on its own y-scale
(p + gmm).resolve_scale(y='independent')

So voilà, we have a mixture model that fits the data well.
Let's plot the data again, this time with both marginal distributions stratified by gender and with separate regression lines fitted for each gender.

In [None]:

# Point selection keyed on the gender field; it drives the colouring of the
# scatter and filters both marginal histograms.
selector = alt.selection_point(fields=['gender'])

# Fixed gender -> colour mapping shared by every panel
color_scale = alt.Scale(domain=['Male', 'Female'], range=['black', 'red'])

base = alt.Chart(hw).properties(height=400, width=400).add_params(selector)

# Scatter: selected points take their gender colour, the rest turn light gray
gender_or_gray = alt.condition(
    selector,
    'gender:N',
    alt.value('lightgray'),
    scale=color_scale,
)
scatter_x = alt.X('weight:Q').scale(domain=[25, 125])
scatter_y = alt.Y('height:Q').scale(domain=[120, 220])
points = base.mark_point(filled=True, size=5, opacity=0.3).encode(
    scatter_x,
    scatter_y,
    color=gender_or_gray,
)

# Top marginal: weight counts per gender, restricted to the selection
top_bins = (
    alt.X('weight')
    .bin(step=5)  # step keeps bin size the same
    .scale(domain=[25, 125])
    .axis(None)
)
top_counts = alt.Y('count()').stack(None).axis(None)
top_color = alt.Color('gender:N').scale(color_scale).legend(title=None, orient='bottom')
hist_x = (
    base.mark_bar(opacity=0.5, thickness=100, orient='horizontal')
    .encode(top_bins, top_counts, top_color)
    .transform_filter(selector)
    .properties(height=65, width=400)
)

# Right marginal: height counts per gender, restricted to the selection
side_bins = (
    alt.Y('height')
    .bin(step=5)  # step keeps bin size the same
    .scale(domain=[120, 220])
    .axis(None)
)
side_counts = alt.X('count()').stack(None).axis(None)
side_color = alt.Color('gender:N').scale(color_scale)
hist_y = (
    base.mark_bar(opacity=0.5, thickness=100)
    .encode(side_bins, side_counts, side_color)
    .transform_filter(selector)
    .properties(width=65, height=400)
)

# One regression line per gender over the scatter, framed by the two marginals
gender_fits = points.transform_regression('weight', 'height', groupby=['gender']).mark_line(size=1)
joint_plot = hist_x & ((points + gender_fits) | hist_y)

joint_plot.configure_axis(title=None).configure_view(strokeWidth=0)

And there we go, we have created a really simple (not actually Bayesian) conceptual generative model that would produce the data that we observed -- and that we can sample for any number of observations. So next we will turn to how to actually make it work.