# Wikipedia example using sklearn

In [7]:
%load_ext jupyter_black

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [8]:
import polars as pl
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

### Create sample data, use it to populate a dataframe and define variables for the new data sample
This example is from Wikipedia. The example can be found here: https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Examples

In [9]:
data = {
    "gender": ["male", "male", "male", "male", "female", "female", "female", "female"],
    "height": [6, 5.92, 5.58, 5.92, 5, 5.5, 5.42, 5.75],
    "weight": [180, 190, 170, 165, 100, 150, 130, 150],
    "foot_size": [12, 11, 12, 10, 6, 8, 7, 9],
}
df = pl.DataFrame(
    data,
    schema={
        "gender": pl.String,
        "height": pl.Float64,
        "weight": pl.Float64,
        "foot_size": pl.Float64,
    },
)

# Convert the string column (Gender) to a numeric value
We need to assign a numeric value to the gender values for sklearn.

In [22]:
# Use a LabelEncoder
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = LabelEncoder()

# List of columns that need to be assigned numeric values
columns_to_encode = ["gender"]

# Convert values in the columns that should be encoded into encoded values
pl_series = [
    pl.Series(feature + "_num", le.fit_transform(df[feature]))
    for feature in columns_to_encode
]

# Create a new dataframe with the encoded data and the sample data
df = df.with_columns(pl_series)
df

gender,height,weight,foot_size,gender_num
str,f64,f64,f64,i64
"""male""",6.0,180.0,12.0,1
"""male""",5.92,190.0,11.0,1
"""male""",5.58,170.0,12.0,1
"""male""",5.92,165.0,10.0,1
"""female""",5.0,100.0,6.0,0
"""female""",5.5,150.0,8.0,0
"""female""",5.42,130.0,7.0,0
"""female""",5.75,150.0,9.0,0


# Get a mapping between the string and numeric values

In [23]:
gender_mapping = (
    df.select(
        [
            "gender",
            "gender_num",
        ]
    )
    .group_by("gender")
    .agg(pl.col("gender_num").first())
)
gender_mapping

gender,gender_num
str,i64
"""female""",0
"""male""",1


# Use sklearn to train the classifier and test it
1. Initialize, train & test
2.  Print the accuracy

In [38]:
# Extract features and labels
X = df.drop(
    [
        "gender",
        "gender_num",
    ]
)
y = df["gender_num"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the model
gnb.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = gnb.predict(X_test)
print(f"Predicted values: {y_pred}")

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Predicted values: [0 0]
Accuracy: 0.0


### Classify the new data sample

In [39]:
# Create a record to classify
p1 = {
    "height": [6],
    "weight": [130],
    "foot_size": [8],
}
df_p1 = pl.DataFrame(
    p1,
    schema={
        # "gender": pl.String,
        "height": pl.Float64,
        "weight": pl.Float64,
        "foot_size": pl.Float64,
    },
)

# Classify the record
predicted = gnb.predict(df_p1)

# Convert the encoded label to the string
predicted = gender_mapping.filter(pl.col("gender_num") == predicted).select("gender")
print(f"predicted: {predicted}")

predicted: shape: (1, 1)
┌────────┐
│ gender │
│ ---    │
│ str    │
╞════════╡
│ female │
└────────┘
