In [98]:
%load_ext jupyter_black

In [63]:
import polars as pl
import math


# Create sample data, use it to populate a dataframe and define variables for the new data sample

In [94]:
# Training sample: four "male" rows followed by four "female" rows.
data = {
    "gender": ["male"] * 4 + ["female"] * 4,
    "height": [6, 5.92, 5.58, 5.92, 5, 5.5, 5.42, 5.75],
    "weight": [180, 190, 170, 165, 100, 150, 130, 150],
    "foot_size": [12, 11, 12, 10, 6, 8, 7, 9],
}

# The explicit schema declares every numeric feature as Float64, even though
# several of the literals above are written as ints.
df = pl.DataFrame(
    data,
    schema={
        "gender": pl.String,
        **{feature: pl.Float64 for feature in ("height", "weight", "foot_size")},
    },
)

# Feature values of the new, unlabelled sample that the notebook classifies.
new_height, new_weight, new_foot_size = 6, 130, 8


# Output the sample data dataframe

In [88]:
# Bare last expression: the notebook renders the sample dataframe as the cell output.
df

gender,height,weight,foot_size
str,f64,f64,f64
"""male""",6.0,180.0,12.0
"""male""",5.92,190.0,11.0
"""male""",5.58,170.0,12.0
"""male""",5.92,165.0,10.0
"""female""",5.0,100.0,6.0
"""female""",5.5,150.0,8.0
"""female""",5.42,130.0,7.0
"""female""",5.75,150.0,9.0


# Calculate the means for males and females and output the result

In [99]:
# Per-gender mean of every feature column.
#
# A single group_by replaces the previous "filter one gender, take the mean of
# the whole frame, then patch the null gender label back in" pattern. That
# pattern hard-coded the gender values and depended on the mean of the string
# column coming back as null.
# maintain_order=True keeps the groups in first-appearance order (male, then
# female for this sample), matching the previous row order.
df_mean = df.group_by("gender", maintain_order=True).agg(pl.all().mean())

df_mean

gender,height,weight,foot_size
str,f64,f64,f64
"""male""",5.855,176.25,11.25
"""female""",5.4175,132.5,7.5


# Join the sample data dataframe to the averages dataframe, rename columns and calculate the square of the difference

In [100]:
# Attach each row's gender means as <feature>_avg columns. Renaming the mean
# columns before the join means the join produces no clashing column names.
df_2 = df.join(
    df_mean.rename(
        {
            "height": "height_avg",
            "weight": "weight_avg",
            "foot_size": "foot_size_avg",
        }
    ),
    on="gender",
).with_columns(
    # Squared deviation of every observation from its gender mean.
    [
        ((pl.col(feature) - pl.col(f"{feature}_avg")) ** 2).alias(
            f"{feature}_sq_diff"
        )
        for feature in ("height", "weight", "foot_size")
    ]
)
df_2

gender,height,weight,foot_size,height_avg,weight_avg,foot_size_avg,height_sq_diff,weight_sq_diff,foot_size_sq_diff
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""male""",6.0,180.0,12.0,5.855,176.25,11.25,0.021025,14.0625,0.5625
"""male""",5.92,190.0,11.0,5.855,176.25,11.25,0.004225,189.0625,0.0625
"""male""",5.58,170.0,12.0,5.855,176.25,11.25,0.075625,39.0625,0.5625
"""male""",5.92,165.0,10.0,5.855,176.25,11.25,0.004225,126.5625,1.5625
"""female""",5.0,100.0,6.0,5.4175,132.5,7.5,0.174306,1056.25,2.25
"""female""",5.5,150.0,8.0,5.4175,132.5,7.5,0.006806,306.25,0.25
"""female""",5.42,130.0,7.0,5.4175,132.5,7.5,6e-06,6.25,0.25
"""female""",5.75,150.0,9.0,5.4175,132.5,7.5,0.110556,306.25,2.25


# Sum the square of the differences for height, weight and foot size

In [101]:
# Per-gender sum of the squared differences for each feature.
#
# One group_by aggregation replaces the two duplicated per-gender
# filter/sum/drop/fill_null pipelines and the trailing rename. Only the
# *_sq_diff columns are aggregated, so nothing needs to be dropped afterwards,
# and the gender label comes from the group key instead of being patched in.
# maintain_order=True keeps the groups in first-appearance order (male, then
# female for this sample).
sq_diff_sum = df_2.group_by("gender", maintain_order=True).agg(
    [
        pl.col(f"{feature}_sq_diff").sum().alias(f"{feature}_sq_diff_sum")
        for feature in ("height", "weight", "foot_size")
    ]
)
sq_diff_sum

gender,height_sq_diff_sum,weight_sq_diff_sum,foot_size_sq_diff_sum
str,f64,f64,f64
"""male""",0.1051,368.75,2.75
"""female""",0.291675,1675.0,5.0


# Get the sample sizes minus one (n - 1), the divisor used for the sample variance

In [102]:
# NOTE: despite the names, both values are the number of rows for that gender
# minus one, i.e. the n - 1 divisor used later for the variances.
count_males = len(df.filter(pl.col("gender") == "male")) - 1
count_females = len(df.filter(pl.col("gender") == "female")) - 1

print(count_males, count_females, sep="\n")

3
3


# Calculate the variances for height, weight and foot size, joining the sq_diff_sum dataframe to the means so that all the data we need is in one dataframe

In [103]:
# Divisor (n - 1) that matches each row's gender.
# Previously every row, including the female one, was divided by count_males.
# That only gave the right answer because both groups have the same size in
# this sample.
dof_by_gender = (
    pl.when(pl.col("gender") == "male")
    .then(pl.lit(count_males))
    .otherwise(pl.lit(count_females))
)

# One row per gender, holding the feature means and the sample variances
# (sum of squared differences divided by n - 1).
variance_df = sq_diff_sum.join(df_mean, on="gender").select(
    pl.col("gender"),
    height_avg=pl.col("height"),
    weight_avg=pl.col("weight"),
    foot_size_avg=pl.col("foot_size"),
    height_variance=pl.col("height_sq_diff_sum") / dof_by_gender,
    weight_variance=pl.col("weight_sq_diff_sum") / dof_by_gender,
    foot_size_variance=pl.col("foot_size_sq_diff_sum") / dof_by_gender,
)
variance_df

gender,height_avg,weight_avg,foot_size_avg,height_variance,weight_variance,foot_size_variance
str,f64,f64,f64,f64,f64,f64
"""male""",5.855,176.25,11.25,0.035033,122.916667,0.916667
"""female""",5.4175,132.5,7.5,0.097225,558.333333,1.666667


# Calculate the Gaussian probability density of each of the new sample's values (height, weight, foot size) for each gender

In [104]:
def gaussian_density(value: float, feature: str) -> pl.Expr:
    """Build the normal-density expression for one feature.

    The expression evaluates, for every row of the frame it is applied to,

        1 / sqrt(2 * pi * variance) * e ** (-(value - mean) ** 2 / (2 * variance))

    where ``mean`` is read from the ``<feature>_avg`` column and ``variance``
    from the ``<feature>_variance`` column.

    Args:
        value: Observed feature value of the sample being classified.
        feature: Feature name used as the column prefix (e.g. ``"height"``).

    Returns:
        An unnamed polars expression; the caller is expected to alias it.
    """
    mean = pl.col(f"{feature}_avg")
    variance = pl.col(f"{feature}_variance")
    exponent = -((value - mean) ** 2) / (2 * variance)
    # Arithmetic is kept operation-for-operation as in the original cell; only
    # the eager variance_df[...] Series lookups became column expressions.
    return 1 / ((2 * math.pi * variance) ** 0.5) * pl.lit(math.e).pow(exponent)


# Density of each of the new sample's feature values under each gender's
# fitted normal distribution (one row per gender).
prob_dist_df = variance_df.select(
    pl.col("gender"),
    gaussian_density(new_height, "height").alias("prob_distribution_height"),
    gaussian_density(new_weight, "weight").alias("prob_distribution_weight"),
    gaussian_density(new_foot_size, "foot_size").alias(
        "prob_distribution_foot_size"
    ),
)
prob_dist_df

gender,prob_distribution_height,prob_distribution_weight,prob_distribution_foot_size
str,f64,f64,f64
"""male""",1.578883,6e-06,0.001311
"""female""",0.223459,0.016789,0.286691


# Calculate the posterior numerators

In [105]:
# Posterior numerator per gender: the hard-coded factor 0.5 multiplied by the
# three per-feature densities.
posterior_numerator_df = prob_dist_df.select(
    "gender",
    posterior_numerator=(
        0.5
        * pl.col("prob_distribution_height")
        * pl.col("prob_distribution_weight")
        * pl.col("prob_distribution_foot_size")
    ),
)
posterior_numerator_df

gender,posterior_numerator
str,f64
"""male""",6.1971e-09
"""female""",0.000538


# Classify the new data sample

In [106]:
# Pull each gender's posterior numerator out of the frame as a Python scalar
# (first matching row).
posterior_by_gender = {
    gender: posterior_numerator_df.filter(pl.col("gender") == gender)[
        "posterior_numerator"
    ][0]
    for gender in ("male", "female")
}
male_pn = posterior_by_gender["male"]
female_pn = posterior_by_gender["female"]

print(f"{male_pn} vs {female_pn}")

# The larger posterior numerator wins; a tie is reported as female.
predicted = "male" if male_pn > female_pn else "female"
print(f"Most likely {predicted}")

6.197071843878087e-09 vs 0.0005377909183630024
Most likely female
