In [27]:
import pandas as pd
import numpy as np
from scipy.stats import norm

# Training samples, stored column-wise; within each column the values are
# grouped by class in the order ok (4 rows), settler (5 rows), solids (4 rows)
data = {
    "ID": list(range(1, 14)),
    "SS-IN": [
        168, 156, 176, 256,
        230, 116, 242, 242, 174,
        1004, 1228, 964, 2008,
    ],
    "SED-IN": [
        3, 3, 3.5, 3,
        5, 3, 7, 4.5, 2.5,
        35, 46, 17, 32,
    ],
    "COND-IN": [
        1814, 1358, 2200, 2070,
        1410, 1238, 1315, 1183, 1110,
        1218, 1889, 2120, 1257,
    ],
    "SS-OUT": [
        15, 14, 16, 27,
        131, 104, 104, 78, 73,
        81, 82.4, 20, 13,
    ],
    "SED-OUT": [
        0.001, 0.01, 0.005, 0.2,
        3.5, 0.06, 0.01, 0.02, 1.5,
        1172, 1932, 1030, 1038,
    ],
    "COND-OUT": [
        1879, 1425, 2140, 2700,
        1575, 1221, 1434, 1374, 1256,
        33.3, 43.1, 1966, 1289,
    ],
    "STATUS": ["ok"] * 4 + ["settler"] * 5 + ["solids"] * 4,
}

df = pd.DataFrame(data)

# Columns used as predictors (ID is not one of them)
features = ["SS-IN", "SED-IN", "COND-IN", "SS-OUT", "SED-OUT", "COND-OUT"]

# Per-class mean and standard deviation of every predictor, laid out as
# all the mean columns first, followed by all the std columns
class_stats = df.groupby("STATUS")[features].agg(["mean", "std"])
class_stats = class_stats.loc[
    :, [(name, stat) for stat in ("mean", "std") for name in features]
]

# Flatten the (feature, stat) column pairs into "<stat>_<feature>" labels
class_stats.columns = [f"{stat}_{name}" for name, stat in class_stats.columns]

# Prior probability of each class, as given by the answer key
priors = dict(ok=4/13, settler=5/13, solids=4/13)

In [28]:
# Gaussian Naive Bayes scoring: class prior times the product of per-feature normal densities
def calculate_class_probabilities_with_prior(input_data, class_stats, priors, features=None):
    """
    Score a sample against every class with Gaussian Naive Bayes.

    For each class the score is the class prior multiplied by the normal
    probability density of every feature value, evaluated with that class's
    mean and standard deviation. The scores are not normalized, so they do
    not sum to 1 across classes.

    Parameters
    ----------
    input_data : mapping
        Feature name -> observed value for the sample being classified.
    class_stats : pandas.DataFrame
        One row per class, with columns named "mean_<feature>" and
        "std_<feature>".
    priors : mapping
        Class label -> prior probability; must contain every label found in
        ``class_stats.index``.
    features : iterable of str, optional
        Features to include in the product. When omitted, the names are taken
        from the "mean_<feature>" columns of ``class_stats`` (in column order),
        so the function does not rely on a module-level ``features`` variable.

    Returns
    -------
    dict
        Class label -> unnormalized score (prior * product of densities).

    Raises
    ------
    KeyError
        If a class has no prior, or a feature is missing from ``input_data``
        or from ``class_stats``.
    """
    if features is None:
        prefix = "mean_"
        features = [
            column[len(prefix):]
            for column in class_stats.columns
            if isinstance(column, str) and column.startswith(prefix)
        ]
    else:
        # Materialize once so a generator can be reused for every class
        features = list(features)

    probabilities = {}
    for cls in class_stats.index:
        score = priors[cls]  # start from the class prior
        for feature in features:
            mean = class_stats.loc[cls, f"mean_{feature}"]
            std = class_stats.loc[cls, f"std_{feature}"]
            # Multiply in the density of this feature under the class Gaussian
            score *= norm.pdf(input_data[feature], mean, std)
        probabilities[cls] = score
    return probabilities

In [29]:
# Sample to classify, keyed by feature name
test_data_query = dict(
    zip(
        ("SS-IN", "SED-IN", "COND-IN", "SS-OUT", "SED-OUT", "COND-OUT"),
        (222, 4.5, 1518, 74, 0.25, 1642),
    )
)

# Score the sample against every class, weighting by the class priors
class_probabilities_with_prior = calculate_class_probabilities_with_prior(
    input_data=test_data_query,
    class_stats=class_stats,
    priors=priors,
)

# Show the class -> score mapping
print(class_probabilities_with_prior)

{'ok': 3.4157745375710564e-36, 'settler': 1.538371789394484e-13, 'solids': 1.0066835222724292e-21}


In [17]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd

# Recreate the dataset
data = {
    "ID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    "SS-IN": [168, 156, 176, 256, 230, 116, 242, 242, 174, 1004, 1228, 964, 2008],
    "SED-IN": [3, 3, 3.5, 3, 5, 3, 7, 4.5, 2.5, 35, 46, 17, 32],
    "COND-IN": [1814, 1358, 2200, 2070, 1410, 1238, 1315, 1183, 1110, 1218, 1889, 2120, 1257],
    "SS-OUT": [15, 14, 16, 27, 131, 104, 104, 78, 73, 81, 82.4, 20, 13],
    "SED-OUT": [0.001, 0.01, 0.005, 0.2, 3.5, 0.06, 0.01, 0.02, 1.5, 1172, 1932, 1030, 1038],
    "COND-OUT": [1879, 1425, 2140, 2700, 1575, 1221, 1434, 1374, 1256, 33.3, 43.1, 1966, 1289],
    "STATUS": ["ok", "ok", "ok", "ok", "settler", "settler", "settler", "settler", "settler", "solids", "solids", "solids", "solids"]
}

df = pd.DataFrame(data)

# Prepare the features (X) and target (y)
X = df[["SS-IN", "SED-IN", "COND-IN", "SS-OUT", "SED-OUT", "COND-OUT"]]
y = df["STATUS"]

# Class priors from the answer key, written as exact fractions so they sum to 1.
# GaussianNB reads them in the order of model.classes_, i.e. the sorted labels:
# ok, settler, solids.
priors = [4 / 13, 5 / 13, 4 / 13]

# Initialize the Gaussian Naive Bayes model with the specified priors
# (var_smoothing is left at its default)
model = GaussianNB(priors=priors)

# Fit the model on the entire dataset
model.fit(X, y)

# Test data based on the user's query. It is kept as a DataFrame with the
# training column names so scikit-learn does not warn about missing feature
# names when predicting with a model that was fitted on a DataFrame.
test_data_query = pd.DataFrame([[222, 4.5, 1518, 74, 0.25, 1642]], columns=X.columns)

# Predict the class for the test data
predicted_class = model.predict(test_data_query)
predicted_proba = model.predict_proba(test_data_query)

print("Predicted class:", predicted_class[0])
print("Predicted probabilities:", predicted_proba)


Predicted class: settler
Predicted probabilities: [[2.15053042e-31 9.99999999e-01 9.27264832e-10]]




In [25]:
import numpy as np
from scipy.stats import norm

# Hand-entered Gaussian parameters for each class: the class prior plus a
# mean/std pair for every feature. The query sample is defined further down in
# this cell, right before it is scored, so it is not repeated here.
# NOTE(review): the values look like rounded versions of the pandas groupby
# statistics computed elsewhere in this notebook - confirm against the answer key.
class_stats_dict = {
    "ok": {
        "prior": 4/13,
        "SS-IN": {"mean": 189, "std": 45.42},
        "SED-IN": {"mean": 3.125, "std": 0.25},
        "COND-IN": {"mean": 1860.5, "std": 371.4},
        "SS-OUT": {"mean": 18, "std": 6.06},
        "SED-OUT": {"mean": 0.054, "std": 0.1},
        "COND-OUT": {"mean": 2036, "std": 532.19}
    },
    "settler": {
        "prior": 5/13,
        "SS-IN": {"mean": 200.8, "std": 55.13},
        "SED-IN": {"mean": 4.4, "std": 1.78},
        "COND-IN": {"mean": 1251.2, "std": 116.24},
        "SS-OUT": {"mean": 98, "std": 23.38},
        "SED-OUT": {"mean": 1.018, "std": 1.53},
        "COND-OUT": {"mean": 1372, "std": 142.58}
    },
    "solids": {
        "prior": 4/13,
        "SS-IN": {"mean": 1301, "std": 485.44},
        "SED-IN": {"mean": 32.5, "std": 11.96},
        "COND-IN": {"mean": 1621, "std": 453.04},
        "SS-OUT": {"mean": 49.1, "std": 37.76},
        "SED-OUT": {"mean": 1293, "std": 430.95},
        "COND-OUT": {"mean": 832.85, "std": 958.31}
    }
}

# Normal probability density helper
def calculate_probability_density(x, mean, std):
    """Return the normal probability density at ``x`` for the given mean and standard deviation."""
    density = norm.pdf(x, loc=mean, scale=std)
    return density


# Calculate the unnormalized score of every class for one sample
def calculate_class_probabilities(test_data, class_params):
    """
    Score ``test_data`` against every class in ``class_params``.

    ``class_params`` maps a class label to a dict holding a "prior" entry and,
    for each feature name in ``test_data``, a {"mean": ..., "std": ...} entry.
    The score of a class is its prior multiplied by the normal density of each
    feature value, taken in the iteration order of ``test_data``.

    Returns a dict mapping class label -> score; scores are not normalized.
    """
    def _class_score(params):
        # Start from the prior and fold in one density per feature
        score = params["prior"]
        for name in test_data:
            moments = params[name]
            score = score * calculate_probability_density(
                test_data[name], moments["mean"], moments["std"]
            )
        return score

    return {label: _class_score(params) for label, params in class_params.items()}

# Sample to classify, listed as (feature name, observed value) pairs
test_data_query = {
    name: value
    for name, value in [
        ("SS-IN", 222),
        ("SED-IN", 4.5),
        ("COND-IN", 1518),
        ("SS-OUT", 74),
        ("SED-OUT", 0.25),
        ("COND-OUT", 1642),
    ]
}

# Find the class scores for the test query using the hand-entered parameters
class_probabilities = calculate_class_probabilities(
    test_data=test_data_query,
    class_params=class_stats_dict,
)

print(class_probabilities)


{'ok': 3.9407563076782e-36, 'settler': 1.5372421577404095e-13, 'solids': 1.007257413722416e-21}


In [None]:
import pandas as pd

# Dataset entered one sample per tuple and transposed into the column-wise
# mapping (column name -> list of values) that the DataFrame is built from
data = {
    column: list(values)
    for column, values in zip(
        ("ID", "SS-IN", "SED-IN", "COND-IN", "SS-OUT", "SED-OUT", "COND-OUT", "STATUS"),
        zip(
            (1, 168, 3, 1814, 15, 0.001, 1879, "ok"),
            (2, 156, 3, 1358, 14, 0.01, 1425, "ok"),
            (3, 176, 3.5, 2200, 16, 0.005, 2140, "ok"),
            (4, 256, 3, 2070, 27, 0.2, 2700, "ok"),
            (5, 230, 5, 1410, 131, 3.5, 1575, "settler"),
            (6, 116, 3, 1238, 104, 0.06, 1221, "settler"),
            (7, 242, 7, 1315, 104, 0.01, 1434, "settler"),
            (8, 242, 4.5, 1183, 78, 0.02, 1374, "settler"),
            (9, 174, 2.5, 1110, 73, 1.5, 1256, "settler"),
            (10, 1004, 35, 1218, 81, 1172, 33.3, "solids"),
            (11, 1228, 46, 1889, 82.4, 1932, 43.1, "solids"),
            (12, 964, 17, 2120, 20, 1030, 1966, "solids"),
            (13, 2008, 32, 1257, 13, 1038, 1289, "solids"),
        ),
    )
}

df = pd.DataFrame(data)

# Mean and standard deviation of every non-grouping column, per STATUS class
class_params = df.groupby(by="STATUS").aggregate(["mean", "std"])

In [21]:
import pandas as pd

# Dataset as (column name, values) pairs; rows are aligned by position
data = dict(
    [
        ("ID", list(range(1, 14))),
        ("SS-IN", [168, 156, 176, 256, 230, 116, 242, 242, 174, 1004, 1228, 964, 2008]),
        ("SED-IN", [3, 3, 3.5, 3, 5, 3, 7, 4.5, 2.5, 35, 46, 17, 32]),
        ("COND-IN", [1814, 1358, 2200, 2070, 1410, 1238, 1315, 1183, 1110, 1218, 1889, 2120, 1257]),
        ("SS-OUT", [15, 14, 16, 27, 131, 104, 104, 78, 73, 81, 82.4, 20, 13]),
        ("SED-OUT", [0.001, 0.01, 0.005, 0.2, 3.5, 0.06, 0.01, 0.02, 1.5, 1172, 1932, 1030, 1038]),
        ("COND-OUT", [1879, 1425, 2140, 2700, 1575, 1221, 1434, 1374, 1256, 33.3, 43.1, 1966, 1289]),
        ("STATUS", ["ok"] * 4 + ["settler"] * 5 + ["solids"] * 4),
    ]
)

df = pd.DataFrame.from_dict(data)

# Per-class mean and standard deviation of every non-grouping column
class_stats = df.groupby(by="STATUS").aggregate(["mean", "std"])

# Nested mapping: (column, statistic) -> {class label -> value}
class_stats_dict = class_stats.to_dict()

{('ID', 'mean'): {'ok': 2.5, 'settler': 7.0, 'solids': 11.5},
 ('ID', 'std'): {'ok': 1.2909944487358056,
  'settler': 1.5811388300841898,
  'solids': 1.2909944487358056},
 ('SS-IN', 'mean'): {'ok': 189.0, 'settler': 200.8, 'solids': 1301.0},
 ('SS-IN', 'std'): {'ok': 45.41659021400293,
  'settler': 55.12893976850997,
  'solids': 485.44000659195774},
 ('SED-IN', 'mean'): {'ok': 3.125, 'settler': 4.4, 'solids': 32.5},
 ('SED-IN', 'std'): {'ok': 0.25,
  'settler': 1.7818529681205462,
  'solids': 11.958260743101397},
 ('COND-IN', 'mean'): {'ok': 1860.5, 'settler': 1251.2, 'solids': 1621.0},
 ('COND-IN', 'std'): {'ok': 371.40229760552995,
  'settler': 116.24413963723076,
  'solids': 453.03789392647207},
 ('SS-OUT', 'mean'): {'ok': 18.0, 'settler': 98.0, 'solids': 49.1},
 ('SS-OUT', 'std'): {'ok': 6.0553007081949835,
  'settler': 23.37733945512192,
  'solids': 37.75588254387212},
 ('SED-OUT', 'mean'): {'ok': 0.054000000000000006,
  'settler': 1.018,
  'solids': 1293.0},
 ('SED-OUT', 'std'): 